## 2. Machine Learning for Regression


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
# Remote CSV to load: raw file from the mlbookcamp-code repository (chapter-02-car-price)
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/'
    'master/chapter-02-car-price/data.csv'
)

In [3]:
# Load the CSV at the `data` URL into a DataFrame (it is an https URL, so this cell needs network access)
df = pd.read_csv(data)

In [4]:
# Show the first 3 rows as a quick sanity check of the loaded columns
df.head(3)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350


### Split and shuffle data

The data needs to be shuffled and then split into training, validation, and test sets.

Why does it need to be shuffled?

If the dataset is sorted by some feature (for example, make, year, region, or time), then simply taking the first 60% for training and the rest for validation/test will not represent the whole population.
For example, if the dataset is sorted by make, the rows will be ordered like ['bmw', 'bmw', ..., 'ford', ..., 'toyota', ...].

If we take the first 60% as the training set, it will mostly contain BMW and Ford.
The validation set may contain mostly Toyota.

As a result, the model will learn well for BMW and Ford but will perform poorly on Toyota, because it never saw enough Toyota samples during training.

Shuffling makes the data well-mixed, so each subset (training, validation, and test) has a similar distribution of all makes.

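For time-series data (such as stock prices or electricity consumption), we should not shuffle, because the data has a natural order. Instead, split it chronologically, so the model is validated and tested on data that comes after the data it was trained on.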





In [5]:
# Number of rows in the DataFrame (first entry of its shape)
n = df.shape[0]
print(n)

11914


In [6]:
# Build the row positions 0, 1, ..., n-1 (one per DataFrame row); these positions are what get shuffled and split below
idx = np.arange(n)
print(idx)

[    0     1     2 ... 11911 11912 11913]


In [7]:
# Shuffle the row positions reproducibly.
# The seed is fixed so every run yields the same permutation, and idx is
# rebuilt from 0..n-1 right here so that re-running this cell on its own
# does not shuffle an already-shuffled array (np.random.shuffle works in place).
np.random.seed(2)
idx = np.arange(n)
np.random.shuffle(idx)

print(idx)

[2735 6720 5878 ... 6637 2575 7336]


In [8]:
# Split sizes: validation and test each take 20% of the rows (rounded down);
# training takes whatever remains (about 60%), so the three sizes always sum to n.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

print(f"{n_train} {n_val} {n_test}")

7150 2382 2382


In [9]:
# Cut the shuffled positions at two boundaries and select the matching rows:
#   [0, n_train)               -> df_train
#   [n_train, n_train + n_val) -> df_val
#   [n_train + n_val, end)     -> df_test
df_train, df_val, df_test = (
    df.iloc[part] for part in np.split(idx, [n_train, n_train + n_val])
)

# Report the size of each split
print(*(len(part) for part in (df_train, df_val, df_test)))


7150 2382 2382


In [10]:
# Preview the first 10 rows of the training split (row labels are the shuffled original positions)
df_train.head(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
2735,Chevrolet,Cobalt,2008,regular unleaded,148.0,4.0,MANUAL,front wheel drive,2.0,,Compact,Coupe,33,24,1385,14410
6720,Toyota,Matrix,2012,regular unleaded,132.0,4.0,AUTOMATIC,front wheel drive,4.0,Hatchback,Compact,4dr Hatchback,32,25,2031,19685
5878,Subaru,Impreza,2016,regular unleaded,148.0,4.0,AUTOMATIC,all wheel drive,4.0,Hatchback,Compact,4dr Hatchback,37,28,640,19795
11190,Volkswagen,Vanagon,1991,regular unleaded,90.0,4.0,MANUAL,rear wheel drive,3.0,,Large,Passenger Minivan,18,16,873,2000
4554,Ford,F-150,2017,flex-fuel (unleaded/E85),385.0,8.0,AUTOMATIC,four wheel drive,4.0,Flex Fuel,Large,Crew Cab Pickup,21,15,5657,56260
8001,Volkswagen,Rabbit,2008,regular unleaded,170.0,5.0,MANUAL,front wheel drive,4.0,Hatchback,Compact,4dr Hatchback,29,22,873,17575
2882,Bentley,Continental GTC,2013,premium unleaded (required),500.0,8.0,AUTOMATIC,all wheel drive,2.0,"Exotic,Luxury,High-Performance",Midsize,Convertible,24,14,520,191400
649,BMW,6 Series,2015,premium unleaded (required),315.0,6.0,AUTOMATIC,rear wheel drive,2.0,"Luxury,Performance",Midsize,Coupe,32,21,3916,76100
616,Maybach,57,2012,premium unleaded (required),543.0,12.0,AUTOMATIC,rear wheel drive,4.0,"Exotic,Luxury",Large,Sedan,16,10,67,379050
4459,Ford,F-150 Heritage,2004,regular unleaded,202.0,6.0,MANUAL,four wheel drive,2.0,,Large,Regular Cab Pickup,18,13,5657,26030


In [11]:
# Preview the first 10 rows of the validation split
df_val.head(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
2779,Chevrolet,Colorado,2015,regular unleaded,200.0,4.0,AUTOMATIC,four wheel drive,4.0,,Compact,Extended Cab Pickup,25,19,1385,26885
3708,Mercedes-Benz,E-Class,2017,premium unleaded (required),241.0,4.0,AUTOMATIC,all wheel drive,4.0,Luxury,Midsize,Sedan,29,22,617,54650
4794,Ford,Focus,2017,flex-fuel (unleaded/E85),160.0,4.0,MANUAL,front wheel drive,4.0,Flex Fuel,Compact,Sedan,36,26,5657,16775
10498,Acura,TLX,2016,premium unleaded (recommended),290.0,6.0,AUTOMATIC,front wheel drive,4.0,Luxury,Midsize,Sedan,34,21,204,42600
1880,Volkswagen,Beetle Convertible,2016,regular unleaded,170.0,4.0,AUTOMATIC,front wheel drive,2.0,,Compact,Convertible,34,25,873,25995
7587,Toyota,Pickup,1995,regular unleaded,116.0,4.0,MANUAL,rear wheel drive,2.0,,Compact,Regular Cab Pickup,25,19,2031,2000
7035,Nissan,Murano,2016,regular unleaded,260.0,6.0,AUTOMATIC,all wheel drive,4.0,Crossover,Midsize,4dr SUV,28,21,2009,34400
892,Saab,900,1996,regular unleaded,185.0,4.0,MANUAL,front wheel drive,2.0,"Luxury,Performance",Compact,Convertible,26,18,376,2000
4911,Land Rover,Freelander,2004,regular unleaded,174.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Luxury",Compact,4dr SUV,19,16,258,25330
10011,Dodge,Stratus,2004,regular unleaded,200.0,6.0,MANUAL,front wheel drive,4.0,Performance,Midsize,Sedan,25,17,1851,22370


In [12]:
# Preview the first 10 rows of the test split
df_test.head(10)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
11195,GMC,Vandura,1994,regular unleaded,165.0,6.0,AUTOMATIC,rear wheel drive,3.0,,Compact,Cargo Van,20,15,549,2000
673,Mercedes-Benz,600-Class,1993,regular unleaded,389.0,12.0,AUTOMATIC,rear wheel drive,2.0,Luxury,Large,Coupe,15,11,617,3211
11270,Toyota,Venza,2013,regular unleaded,268.0,6.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Performance",Midsize,Wagon,25,18,2031,31120
752,Volvo,740,1992,regular unleaded,114.0,4.0,AUTOMATIC,rear wheel drive,4.0,Luxury,Midsize,Sedan,26,18,870,2000
3137,Ford,Crown Victoria,2010,flex-fuel (unleaded/E85),224.0,8.0,AUTOMATIC,rear wheel drive,4.0,Flex Fuel,Large,Sedan,24,16,5657,29905
1027,Audi,A3,2015,premium unleaded (recommended),220.0,4.0,AUTOMATED_MANUAL,all wheel drive,2.0,Luxury,Compact,Convertible,32,23,3105,41150
8133,Dodge,Ram Cargo,2002,regular unleaded,175.0,6.0,AUTOMATIC,rear wheel drive,3.0,,Compact,Cargo Van,15,13,1851,19275
10823,Mazda,Tribute,2011,regular unleaded,171.0,4.0,AUTOMATIC,all wheel drive,4.0,Crossover,Compact,4dr SUV,26,20,586,25495
3018,Chevrolet,Corvette,2017,premium unleaded (recommended),460.0,8.0,MANUAL,rear wheel drive,2.0,High-Performance,Compact,Convertible,25,16,1385,69450
8994,Hyundai,Santa Fe,2016,regular unleaded,290.0,6.0,AUTOMATIC,front wheel drive,4.0,Crossover,Midsize,4dr SUV,25,18,1439,34500
