In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import helper as h

from importlib import reload

In [2]:
# Load the raw car dataset from a relative path and preview the first rows.
df = pd.read_csv('../data/cars.csv')
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
# Keep only the columns used in this analysis. The explicit .copy() makes `df` an
# independent frame, so the column assignments in later cells never operate on a
# slice that pandas still tracks back to the raw data.
cols = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP',
]
df = df[cols].copy()

In [4]:
# Normalise column names to snake_case: lower-case, then spaces -> underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
# Verify the renamed, snake_case column names.
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'msrp'],
      dtype='object')

In [6]:
# Dtypes and non-null counts; engine_hp and engine_cylinders contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11845 non-null  float64
 4   engine_cylinders   11884 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

- `AUTOMATIC`
- `MANUAL`
- `AUTOMATED_MANUAL`
- `DIRECT_DRIVE`

In [7]:
# Frequency of each transmission type, most common first -> AUTOMATIC
df['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- `highway_mpg` and `city_mpg`


In [8]:
# Pairwise correlation of the numerical features.
# Answer: highway_mpg and city_mpg (largest off-diagonal value).
df.loc[
    :,
    ['engine_hp', 'engine_cylinders', 'year', 'highway_mpg', 'city_mpg'],
].corr()

Unnamed: 0,engine_hp,engine_cylinders,year,highway_mpg,city_mpg
engine_hp,1.0,0.779988,0.351794,-0.406563,-0.439371
engine_cylinders,0.779988,1.0,-0.041479,-0.621606,-0.600776
year,0.351794,-0.041479,1.0,0.25824,0.198171
highway_mpg,-0.406563,-0.621606,0.25824,1.0,0.886829
city_mpg,-0.439371,-0.600776,0.198171,0.886829,1.0


### Make `price` binary

* Now we need to turn the `price` variable (the `msrp` column in this dataset) from numeric into a binary format.
* Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise.

In [9]:
# Mean price: the threshold used for the binary target below.
df['msrp'].mean()

40594.737032063116

In [10]:
# Binary target: 1 when the price is strictly above the mean price, else 0.
is_above_mean = df['msrp'] > df['msrp'].mean()
df['above_average'] = is_above_mean.astype(int)
df.head(2)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1


### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value (`above_average`) is not in your dataframe.

In [11]:
# Replace missing values in the two incomplete numeric columns with 0.
df = df.fillna({'engine_cylinders': 0, 'engine_hp': 0})

In [12]:
# 60/20/20 split: hold out 20% for test, then 25% of the remaining 80% for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Binary target for each partition.
y_train = df_train['above_average']
y_val = df_val['above_average']
y_test = df_test['above_average']

# Remove the target and the price it was derived from, so neither ends up among the features.
df_train = df_train.drop(columns=['above_average', 'msrp'])
df_val = df_val.drop(columns=['above_average', 'msrp'])
df_test = df_test.drop(columns=['above_average', 'msrp'])

### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- `transmission_type`
- `vehicle_style`

In [13]:
# Re-inspect the full frame (`df`, not the training split): after the fill step,
# engine_hp and engine_cylinders no longer have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   msrp               11914 non-null  int64  
 10  above_average      11914 non-null  int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 1024.0+ KB


In [14]:
# Categorical (object-dtype) features scored against the target in Question 3.
categorical = ['make', 'model', 'transmission_type', 'vehicle_style']

In [15]:
def calculate_mi(series):
    """Mutual information score between one feature column and the training target.

    Reads the notebook-level `y_train`, so `series` is expected to hold one
    value per row of the training set.
    """
    score = mutual_info_score(series, y_train)
    return score

In [16]:
# Score every categorical feature on the training set, highest first.
# Lowest score -> transmission_type
(
    df_train[categorical]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .round(2)
)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [17]:
# Peek at one training row; the target and `msrp` are no longer among the columns.
df_train.head(1)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15


In [18]:
# One dict per row ({column: value}), the input format DictVectorizer expects.
dict_train, dict_val = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

In [19]:
# One-hot encode the string features; learn the vocabulary on the training
# records only and reuse it unchanged for validation.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

In [21]:
# (rows, encoded feature columns) of the training matrix.
X_train.shape

(7148, 943)

In [22]:
# Logistic regression with the parameters prescribed by the question.
model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)

In [23]:
# Validation accuracy: share of predictions that match the true label.
(y_val == y_pred).mean()

0.9345362987830466

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of the following features has the smallest difference?

- `year`
- `engine_hp`
- `transmission_type`
- `city_mpg`

> **Note**: the difference doesn't have to be positive

In [24]:
# Features available for the elimination experiment below.
df_train.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg'],
      dtype='object')

In [25]:
# Every training feature except `year`, in the original column order.
cols1 = df_train.columns.drop('year').tolist()

In [26]:
# Feature elimination: repeat the Q4 recipe on every feature except `year`.
dict_train = df_train[cols1].to_dict(orient='records')
dict_val = df_val[cols1].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Validation accuracy with `year` removed.
(y_val == y_pred).mean()

0.9475451112043642

In [27]:
# ~0.02 improvement when `year` is dropped, computed from accuracies rounded to
# 2 decimals (0.95 without `year` vs 0.93 for the Q4 model).
# NOTE: the unrounded gap is about 0.013 (0.9475 - 0.9345).
0.95 - 0.93

0.019999999999999907

In [28]:
# Every training feature except `engine_hp`, in the original column order.
cols2 = df_train.columns.drop('engine_hp').tolist()

In [29]:
# Feature elimination: repeat the Q4 recipe on every feature except `engine_hp`.
dict_train = df_train[cols2].to_dict(orient='records')
dict_val = df_val[cols2].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Validation accuracy with `engine_hp` removed; the difference from the Q4 model is almost 0.
(y_val == y_pred).mean()

0.9299202685690307

In [30]:
# Every training feature except `transmission_type`, in the original column order.
cols3 = df_train.columns.drop('transmission_type').tolist()

In [31]:
# Feature elimination: repeat the Q4 recipe on every feature except `transmission_type`.
dict_train = df_train[cols3].to_dict(orient='records')
dict_val = df_val[cols3].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Validation accuracy with `transmission_type` removed.
(y_val == y_pred).mean()

0.9458665547629039

In [32]:
# Every training feature except `city_mpg`, in the original column order.
cols4 = df_train.columns.drop('city_mpg').tolist()

In [33]:
# Feature elimination: repeat the Q4 recipe on every feature except `city_mpg`.
dict_train = df_train[cols4].to_dict(orient='records')
dict_val = df_val[cols4].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

model = LogisticRegression(
    solver='liblinear',
    C=10,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)

# Validation accuracy with `city_mpg` removed.
(y_val == y_pred).mean()

0.9458665547629039

In [43]:
# Answer -> engine_hp: removing it moves validation accuracy the least in absolute
# terms (0.9345 -> 0.9299, about 0.005), versus about 0.013 for year and about
# 0.011 for transmission_type and city_mpg.

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price` (named `msrp` in this dataset). Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

- 0
- 0.01
- 0.1
- 1
- 10

> **Note**: If there are multiple options, select the smallest `alpha`.

In [34]:
from sklearn.linear_model import Ridge

In [35]:
# The full frame still contains `msrp` and `above_average`; only the split frames had them removed.
df.head(2)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1


In [36]:
# Redo the same 60/20/20 split (same seed), this time keeping the raw price as the target.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

# Regression target for each partition.
y_train = df_train['msrp']
y_val = df_val['msrp']
y_test = df_test['msrp']

# Remove the price and the label derived from it, so neither ends up among the features.
df_train = df_train.drop(columns=['above_average', 'msrp'])
df_val = df_val.drop(columns=['above_average', 'msrp'])
df_test = df_test.drop(columns=['above_average', 'msrp'])

In [37]:
# Sanity check: same first training row as in Question 4, since the split reuses random_state=42.
df_train.head(1)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
3972,Mitsubishi,Endeavor,2011,225.0,6.0,AUTOMATIC,4dr SUV,19,15


In [38]:
# Row dicts for the vectorizer.
dict_train, dict_val = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

# Learn the one-hot vocabulary on the training records only, then encode validation with it.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

In [39]:
# Same matrix shape as in Question 4: the feature set is unchanged.
X_train.shape

(7148, 943)

In [40]:
# Log-transform the training target. It is recomputed from the untouched `df.msrp`
# (rows selected by df_train's index labels) rather than from `y_train` itself,
# so re-running this cell can never apply log1p twice.
y_train = np.log1p(df.msrp.loc[df_train.index])

In [41]:
# Log-transform the validation target. It is recomputed from the untouched `df.msrp`
# (rows selected by df_val's index labels) rather than from `y_val` itself,
# so re-running this cell can never apply log1p twice.
y_val = np.log1p(df.msrp.loc[df_val.index])

In [42]:
def rmse(y, y_pred):
    '''
    Root-mean-squared error between actual and predicted values.

    y - actual prices (list, tuple, NumPy array or pandas Series)
    y_pred - predicted prices, same length as y

    The difference is taken with np.subtract rather than the `-` operator so
    that plain Python sequences are accepted too; NumPy and pandas inputs are
    handled as before.

    returns the RMSE score as a NumPy float
    '''
    error = np.subtract(y, y_pred)
    mse = (error ** 2).mean()
    return np.sqrt(mse)

Note: everything below runs very slowly.

In [56]:
# Disabled sweep over every alpha from the question; the cells below fit each
# alpha one at a time instead.
# NOTE(review): presumably commented out because fitting is slow. Confirm, and
# consider restoring this loop in place of the copy-pasted per-alpha cells.
# for alpha in [0, 0.01, 0.1, 1, 10]:
#     model = Ridge(alpha=alpha, solver='sag', random_state=42)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_val)
#     score = rmse(y_val, y_pred)
#     print(f'alpha: {alpha}, rmse: {score}')

In [48]:
# Ridge with alpha=0 ('sag' solver, fixed seed) and an explicit cap of 1000 iterations.
model = Ridge(
    alpha=0,
    solver='sag',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_val)



In [46]:
# Validation RMSE (log1p scale) of the most recently fitted model, rounded to 3 decimals.
rmse(y_val, y_pred).round(3)

0.487

In [49]:
# Re-check after fitting alpha=0 with max_iter=1000: the rounded RMSE is unchanged.
rmse(y_val, y_pred).round(3)

0.487

In [50]:
# Ridge with alpha=0.01 ('sag' solver, fixed seed); report the rounded validation RMSE.
model = Ridge(alpha=0.01, solver='sag', random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse(y_val, y_pred).round(3)



0.487

In [51]:
# Ridge with alpha=0.1 ('sag' solver, fixed seed); report the rounded validation RMSE.
model = Ridge(alpha=0.1, solver='sag', random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse(y_val, y_pred).round(3)



0.487

In [52]:
# Ridge with alpha=1 ('sag' solver, fixed seed); report the rounded validation RMSE.
model = Ridge(alpha=1, solver='sag', random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse(y_val, y_pred).round(3)



0.487

In [53]:
# Ridge with alpha=10 ('sag' solver, fixed seed); report the rounded validation RMSE.
model = Ridge(alpha=10, solver='sag', random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_val)
rmse(y_val, y_pred).round(3)



0.487