### READ DATA 

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("vehicles/vehicles.csv", low_memory = False)
data.head()

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


### RENAME COLUMNS 

In [3]:
data.columns = ['Manufacturer','Model','Year','Displacement',
                'Cylinders','Transmission','Drivetrain',
                'Vehicle Class','Fuel Type','Fuel Barrels/Year',
                'City MPG','Highway MPG','Combined MPG',
                'CO2 Emission Grams/Mile','Fuel Cost/Year']

In [6]:
data = data.rename(columns={'Manufacturer': 'Make',
                            'Displacement': 'Engine Displacement'})


### CHANGING COLUMN ORDER 

In [7]:
column_order = ['Year','Make','Model','Vehicle Class',
                'Transmission','Drivetrain','Fuel Type',
                'Cylinders','Engine Displacement','Fuel Barrels/Year',
                'City MPG','Highway MPG','Combined MPG',
                'CO2 Emission Grams/Mile','Fuel Cost/Year']

data = data[column_order]

In [9]:
data.head()

Unnamed: 0,Year,Make,Model,Vehicle Class,Transmission,Drivetrain,Fuel Type,Cylinders,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,1984,AM General,DJ Po Vehicle 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,2-Wheel Drive,Regular,4.0,2.5,19.388824,18,17,17,522.764706,1950
1,1984,AM General,FJ8c Post Office,Special Purpose Vehicle 2WD,Automatic 3-spd,2-Wheel Drive,Regular,6.0,4.2,25.354615,13,13,13,683.615385,2550
2,1985,AM General,Post Office DJ5 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,Rear-Wheel Drive,Regular,4.0,2.5,20.600625,16,17,16,555.4375,2100
3,1985,AM General,Post Office DJ8 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,Rear-Wheel Drive,Regular,6.0,4.2,25.354615,13,13,13,683.615385,2550
4,1987,ASC Incorporated,GNX,Midsize Cars,Automatic 4-spd,Rear-Wheel Drive,Premium,6.0,3.8,20.600625,14,21,16,555.4375,2550


### FILTERING 

In [10]:
filtered = data[(data['Make']=='Ford') &
                (data['Cylinders']>=6) &
                (data['Combined MPG'] < 18)]
filtered.head()

Unnamed: 0,Year,Make,Model,Vehicle Class,Transmission,Drivetrain,Fuel Type,Cylinders,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
11442,1986,Ford,Aerostar Van,Vans,Automatic 4-spd,Rear-Wheel Drive,Regular,6.0,2.8,19.388824,15,21,17,522.764706,1950
11450,1988,Ford,Aerostar Van,Vans,Automatic 4-spd,Rear-Wheel Drive,Regular,6.0,3.0,19.388824,15,20,17,522.764706,1950
11452,1989,Ford,Aerostar Van,Vans,Automatic 4-spd,Rear-Wheel Drive,Regular,6.0,3.0,19.388824,15,21,17,522.764706,1950
11456,1990,Ford,Aerostar Van,Vans,Automatic 4-spd,Rear-Wheel Drive,Regular,6.0,4.0,19.388824,15,20,17,522.764706,1950
11459,1991,Ford,Aerostar Van,Vans,Automatic 4-spd,Rear-Wheel Drive,Regular,6.0,4.0,19.388824,15,20,17,522.764706,1950


### BINNING

In [12]:
mpg_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

In [16]:
# Equal width bins: the range for each bin is the same size.
bins = pd.cut(data['Combined MPG'], 5, labels=mpg_labels)
bins.head(10)

0         Low
1    Very Low
2    Very Low
3    Very Low
4    Very Low
5         Low
6         Low
7         Low
8         Low
9         Low
Name: Combined MPG, dtype: category
Categories (5, object): [Very Low < Low < Moderate < High < Very High]

In [17]:
# Equal frequency bins: approximately the same number of records in each bin.
bins = pd.qcut(data['Combined MPG'], 5, labels=mpg_labels)
bins.head(10)

0         Low
1    Very Low
2    Very Low
3    Very Low
4    Very Low
5        High
6        High
7    Moderate
8        High
9        High
Name: Combined MPG, dtype: category
Categories (5, object): [Very Low < Low < Moderate < High < Very High]

In [18]:
# Custom: the user defines the cutoffs
cutoffs = [7,14,21,23,30,40]
bins = pd.cut(data['Combined MPG'], cutoffs, labels=mpg_labels)
bins.head(10)

0         Low
1    Very Low
2         Low
3    Very Low
4         Low
5    Moderate
6        High
7         Low
8    Moderate
9        High
Name: Combined MPG, dtype: category
Categories (5, object): [Very Low < Low < Moderate < High < Very High]

### CONDITIONAL CATEGORIES 

In [23]:
data.loc[data['Transmission'].str.startswith('A'), 'TransType'] = 'Automatic'
data.loc[data['Transmission'].str.startswith('M'), 'TransType'] = 'Manual'

### ONE-HOT ENCODING CATEGORICAL VARIABLES 

In [24]:
data['Drivetrain']

0           2-Wheel Drive
1           2-Wheel Drive
2        Rear-Wheel Drive
3        Rear-Wheel Drive
4        Rear-Wheel Drive
               ...       
35947    Rear-Wheel Drive
35948    Rear-Wheel Drive
35949    Rear-Wheel Drive
35950    Rear-Wheel Drive
35951    Rear-Wheel Drive
Name: Drivetrain, Length: 35952, dtype: object

In [26]:
drivetrain = pd.get_dummies(data['Drivetrain'])
drivetrain.head(10)

Unnamed: 0,2-Wheel Drive,"2-Wheel Drive, Front",4-Wheel Drive,4-Wheel or All-Wheel Drive,All-Wheel Drive,Front-Wheel Drive,Part-time 4-Wheel Drive,Rear-Wheel Drive
0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1
5,0,0,0,0,0,1,0,0
6,0,0,0,0,0,1,0,0
7,0,0,0,0,0,1,0,0
8,0,0,0,0,0,1,0,0
9,0,0,0,0,0,1,0,0


### COMBINE DATAFRAMES 

In [28]:
#groupby
avg_mpg = data.groupby('Make', as_index=False)['Combined MPG'].mean() 
avg_mpg

Unnamed: 0,Make,Combined MPG
0,AM General,14.750000
1,ASC Incorporated,16.000000
2,Acura,21.506623
3,Alfa Romeo,19.512195
4,American Motors Corporation,17.681818
...,...,...
122,Volkswagen,24.093601
123,Volvo,20.605300
124,Wallace Environmental,13.875000
125,Yugo,25.000000


In [30]:
#pivot tables
data.pivot_table(index='Make', values='Combined MPG', aggfunc=np.mean)

Unnamed: 0_level_0,Combined MPG
Make,Unnamed: 1_level_1
AM General,14.750000
ASC Incorporated,16.000000
Acura,21.506623
Alfa Romeo,19.512195
American Motors Corporation,17.681818
...,...
Volkswagen,24.093601
Volvo,20.605300
Wallace Environmental,13.875000
Yugo,25.000000


In [32]:
#JOIN
data = pd.merge(data, avg_mpg, on='Make')
data.head()

Unnamed: 0,Year,Make,Model,Vehicle Class,Transmission,Drivetrain,Fuel Type,Cylinders,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG_x,CO2 Emission Grams/Mile,Fuel Cost/Year,TransType,Combined MPG_y,Combined MPG
0,1984,AM General,DJ Po Vehicle 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,2-Wheel Drive,Regular,4.0,2.5,19.388824,18,17,17,522.764706,1950,Automatic,14.75,14.75
1,1984,AM General,FJ8c Post Office,Special Purpose Vehicle 2WD,Automatic 3-spd,2-Wheel Drive,Regular,6.0,4.2,25.354615,13,13,13,683.615385,2550,Automatic,14.75,14.75
2,1985,AM General,Post Office DJ5 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,Rear-Wheel Drive,Regular,4.0,2.5,20.600625,16,17,16,555.4375,2100,Automatic,14.75,14.75
3,1985,AM General,Post Office DJ8 2WD,Special Purpose Vehicle 2WD,Automatic 3-spd,Rear-Wheel Drive,Regular,6.0,4.2,25.354615,13,13,13,683.615385,2550,Automatic,14.75,14.75
4,1987,ASC Incorporated,GNX,Midsize Cars,Automatic 4-spd,Rear-Wheel Drive,Premium,6.0,3.8,20.600625,14,21,16,555.4375,2550,Automatic,16.0,16.0


In [35]:
lexus = data[data['Make']=='Lexus']
audi = data[data['Make']=='Audi']
lexus_audi = pd.concat([lexus, audi], axis=0) #axis=0 means we are adding rows, axis=1 means we are adding columns

In [36]:
lexus_audi

Unnamed: 0,Year,Make,Model,Vehicle Class,Transmission,Drivetrain,Fuel Type,Cylinders,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG_x,CO2 Emission Grams/Mile,Fuel Cost/Year,TransType,Combined MPG_y,Combined MPG
21128,2011,Lexus,CT 200h,Compact Cars,Automatic (variable gear ratios),Front-Wheel Drive,Regular,4.0,1.8,7.847857,43,40,42,211.595238,800,Automatic,20.982368,20.982368
21129,2012,Lexus,CT 200h,Compact Cars,Automatic (variable gear ratios),Front-Wheel Drive,Regular,4.0,1.8,7.847857,43,40,42,211.595238,800,Automatic,20.982368,20.982368
21130,2013,Lexus,CT 200h,Compact Cars,Automatic (variable gear ratios),Front-Wheel Drive,Regular,4.0,1.8,7.847857,43,40,42,211.000000,800,Automatic,20.982368,20.982368
21131,2014,Lexus,CT 200h,Compact Cars,Automatic (variable gear ratios),Front-Wheel Drive,Regular,4.0,1.8,7.847857,43,40,42,179.000000,800,Automatic,20.982368,20.982368
21132,2015,Lexus,CT 200h,Compact Cars,Automatic (variable gear ratios),Front-Wheel Drive,Regular,4.0,1.8,7.847857,43,40,42,211.000000,800,Automatic,20.982368,20.982368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1388,2013,Audi,allroad quattro,Small Station Wagons,Automatic (S8),All-Wheel Drive,Premium,4.0,2.0,14.330870,20,27,23,394.000000,1750,Automatic,20.325843,20.325843
1389,2013,Audi,allroad quattro,Small Station Wagons,Automatic (S8),All-Wheel Drive,Premium or E85,4.0,2.0,14.330870,20,27,23,394.000000,1750,Automatic,20.325843,20.325843
1390,2014,Audi,allroad quattro,Small Station Wagons,Automatic (S8),All-Wheel Drive,Premium or E85,4.0,2.0,14.330870,20,27,23,394.000000,1750,Automatic,20.325843,20.325843
1391,2015,Audi,allroad quattro,Small Station Wagons,Automatic (S8),All-Wheel Drive,Premium or E85,4.0,2.0,13.733750,21,28,24,373.000000,1700,Automatic,20.325843,20.325843


### MELTING DATA INTO LONG FORMAT 

In [39]:
#transforming columns into categorical variables into rows// functions nice for graphics
melted = pd.melt(data, id_vars=['Year', 'Make', 'Model'],
                value_vars=['City MPG', 'Highway MPG', 'Combined MPG'])
melted.head(30)

Unnamed: 0,Year,Make,Model,variable,value
0,1984,AM General,DJ Po Vehicle 2WD,City MPG,18.0
1,1984,AM General,FJ8c Post Office,City MPG,13.0
2,1985,AM General,Post Office DJ5 2WD,City MPG,16.0
3,1985,AM General,Post Office DJ8 2WD,City MPG,13.0
4,1987,ASC Incorporated,GNX,City MPG,14.0
5,1997,Acura,2.2CL/3.0CL,City MPG,20.0
6,1997,Acura,2.2CL/3.0CL,City MPG,22.0
7,1997,Acura,2.2CL/3.0CL,City MPG,18.0
8,1998,Acura,2.3CL/3.0CL,City MPG,19.0
9,1998,Acura,2.3CL/3.0CL,City MPG,21.0
