## Learning to split data into test and train

In [1]:
# Video: https://www.youtube.com/watch?v=xgDs0scjuuQ
# There are 2 types of things, dependant and independant data

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# importing dataset
dataset = pd.read_csv('cars.csv', delimiter=';')
df = dataset.head(11)
df

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,STRING,DOUBLE,INT,DOUBLE,DOUBLE,DOUBLE,DOUBLE,INT,CAT
1,Chevrolet Chevelle Malibu,18.0,8,307.0,130.0,3504.,12.0,70,US
2,Buick Skylark 320,15.0,8,350.0,165.0,3693.,11.5,70,US
3,Plymouth Satellite,18.0,8,318.0,150.0,3436.,11.0,70,US
4,AMC Rebel SST,16.0,8,304.0,150.0,3433.,12.0,70,US
5,Ford Torino,17.0,8,302.0,140.0,3449.,10.5,70,US
6,Ford Galaxie 500,15.0,8,429.0,198.0,4341.,10.0,70,US
7,Chevrolet Impala,14.0,8,454.0,220.0,4354.,9.0,70,US
8,Plymouth Fury iii,14.0,8,440.0,215.0,4312.,8.5,70,US
9,Pontiac Catalina,14.0,8,455.0,225.0,4425.,10.0,70,US


In [7]:
# create a new column 'buy' which shows if the user bought the car or not. This is a decision column. 0 if the user doesnt buy and 1 if the user buys the car. 
# the influencing of this column would be the displacement, hoursepower and the weight

buy = ['FLOAT',0,0,0,0,0,1,1,1,1,0]
df['Buy'] = buy
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Buy'] = buy


Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin,Buy
0,STRING,DOUBLE,INT,DOUBLE,DOUBLE,DOUBLE,DOUBLE,INT,CAT,FLOAT
1,Chevrolet Chevelle Malibu,18.0,8,307.0,130.0,3504.,12.0,70,US,0
2,Buick Skylark 320,15.0,8,350.0,165.0,3693.,11.5,70,US,0
3,Plymouth Satellite,18.0,8,318.0,150.0,3436.,11.0,70,US,0
4,AMC Rebel SST,16.0,8,304.0,150.0,3433.,12.0,70,US,0
5,Ford Torino,17.0,8,302.0,140.0,3449.,10.5,70,US,0
6,Ford Galaxie 500,15.0,8,429.0,198.0,4341.,10.0,70,US,1
7,Chevrolet Impala,14.0,8,454.0,220.0,4354.,9.0,70,US,1
8,Plymouth Fury iii,14.0,8,440.0,215.0,4312.,8.5,70,US,1
9,Pontiac Catalina,14.0,8,455.0,225.0,4425.,10.0,70,US,1


In [8]:
# creating two variables which will have the values of independant and dependant variables.

independent = df.iloc[1:, 1:7].values
independent

array([['18.0', '8', '307.0', '130.0', '3504.', '12.0'],
       ['15.0', '8', '350.0', '165.0', '3693.', '11.5'],
       ['18.0', '8', '318.0', '150.0', '3436.', '11.0'],
       ['16.0', '8', '304.0', '150.0', '3433.', '12.0'],
       ['17.0', '8', '302.0', '140.0', '3449.', '10.5'],
       ['15.0', '8', '429.0', '198.0', '4341.', '10.0'],
       ['14.0', '8', '454.0', '220.0', '4354.', '9.0'],
       ['14.0', '8', '440.0', '215.0', '4312.', '8.5'],
       ['14.0', '8', '455.0', '225.0', '4425.', '10.0'],
       ['15.0', '8', '390.0', '190.0', '3850.', '8.5']], dtype=object)

In [9]:
# this is the dependant variable, which is the buy column
dependent = df.iloc[1:, 9:].values
dependent

array([[0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0]], dtype=object)

In [14]:
# test data size is 20%
# random state defines the data will be selected at random
# if random state is 0, then the results would be the same
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size = 0.2, random_state = 5)

In [15]:
x_train

array([['18.0', '8', '318.0', '150.0', '3436.', '11.0'],
       ['17.0', '8', '302.0', '140.0', '3449.', '10.5'],
       ['14.0', '8', '440.0', '215.0', '4312.', '8.5'],
       ['15.0', '8', '350.0', '165.0', '3693.', '11.5'],
       ['18.0', '8', '307.0', '130.0', '3504.', '12.0'],
       ['14.0', '8', '455.0', '225.0', '4425.', '10.0'],
       ['14.0', '8', '454.0', '220.0', '4354.', '9.0'],
       ['16.0', '8', '304.0', '150.0', '3433.', '12.0']], dtype=object)

In [16]:
x_test

array([['15.0', '8', '390.0', '190.0', '3850.', '8.5'],
       ['15.0', '8', '429.0', '198.0', '4341.', '10.0']], dtype=object)

In [17]:
y_test

array([[0],
       [1]], dtype=object)

In [18]:
y_train

array([[0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0]], dtype=object)