### Homework - Week 3

In [33]:
import pandas as pd
import numpy as np
import sys, os

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer

### Prepare the Data

In [2]:
df_raw = pd.read_csv(r"https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv")
df_raw.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
data = df_raw[['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']].copy()

Data Preparation

- Select only the features from above and transform their names.

In [4]:
data.columns = data.columns.str.replace(' ','_').str.lower()

- Fill in the missing values of the selected features with 0

In [5]:
data.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [6]:
data = data.fillna(0)

In [7]:
data.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
msrp                 0
dtype: int64

Rename `MSRP` variable to `price`

In [8]:
data = data.rename({'msrp': 'price'}, axis=1)

In [9]:
data.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

### Question 1
 What is the most frequent observation (mode) for the column `transmission_type`?

In [10]:
data['transmission_type'].value_counts().idxmax()

'AUTOMATIC'

### Question 2

Create the correlation matrix for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

In [11]:
data.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
dtype: object

In [12]:
numeric_cols = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']

corr_matrix = data[numeric_cols].corr()

In [13]:
corr_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


In [19]:
#Take the absolute value because we are trying to find the biggest correlation which can be a strong negative relationship

corr_dict = {
    'engine_hp - year' : abs(corr_matrix.loc['engine_hp', 'year']),
    'engine_hp - engine_cylinders' : abs(corr_matrix.loc['engine_hp', 'engine_cylinders']),
    'highway_mpg - engine_cylinders' : abs(corr_matrix.loc['highway_mpg', 'engine_cylinders']),
    'highway_mpg - city_mpg' : abs(corr_matrix.loc['highway_mpg', 'city_mpg'])
}

In [21]:
corr_dict

{'engine_hp - year': 0.3387141847624335,
 'engine_hp - engine_cylinders': 0.7748509807813221,
 'highway_mpg - engine_cylinders': 0.6145414173953352,
 'highway_mpg - city_mpg': 0.8868294962591425}

In [20]:
print(f"The combination with the highest correlation is: {max(corr_dict, key = lambda x: corr_dict[x])}")

The combination with the highest correlation is: highway_mpg - city_mpg


Make `price` binary
- We need to turn the `price` variable from numeric into a binary format
- Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise

In [22]:
mean_price = data.price.mean()

In [24]:
#Check to make sure the value counts below represent the same distribution of above and below average
print(f"Above Average: {len(data.query('price > @mean_price'))}, Below Average: {len(data.query('price <= @mean_price'))}, Total Records: {len(data)}")

Above Average: 3269, Below Average: 8645, Total Records: 11914


In [25]:
data['above_average'] = data['price'].apply(lambda x: 1 if x > mean_price else 0)

In [26]:
data.above_average.value_counts()

above_average
0    8645
1    3269
Name: count, dtype: int64

Split the data

- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (above_average) is not in your dataframe.

In [27]:
y = data['above_average'].values

#drop price because above_average is based on the variable, and that may over inflate the model's accuracy metrics
X = data.drop(['above_average', 'price'], axis=1)

In [28]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

### Question 3

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only
- Round the scores to 2 decimals using round(score, 2)

In [29]:
X_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
dtype: object

In [30]:
categorical_cols = ['make', 'model', 'transmission_type', 'vehicle_style']
cat_m_score = {}

for col in categorical_cols:
    m_score = round(mutual_info_score(y_train, X_train[col]),2)
    
    cat_m_score[col] = m_score     
    print(f"{col}: {m_score}")

make: 0.24
model: 0.46
transmission_type: 0.02
vehicle_style: 0.08


In [31]:
print(f"The column with the lowest mutual information score: {min(cat_m_score, key = lambda x: cat_m_score[x])}")

The column with the lowest mutual information score: transmission_type


### Question 4

- Train logistic regression
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit model on training dataset
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.


In [32]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [34]:
# enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# X_train_encoded = enc.fit_transform(X_train)
dv = DictVectorizer(sparse=False)


X_train_encoded = dv.fit_transform(X_train.to_dict(orient='records'))

In [35]:
model.fit(X_train_encoded, y_train)

In [36]:
X_val_encoded = dv.transform(X_val.to_dict(orient='records'))

In [37]:
y_pred = model.predict(X_val_encoded)

acc_score = round(accuracy_score(y_val, y_pred),2)
print(f"Accuracy score for logistic model: {acc_score}")


Accuracy score for logistic model: 0.95


### Question 5

- Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [38]:
y_pred = model.predict(X_val_encoded)

all_features_score = round(accuracy_score(y_val, y_pred),2)
all_features_score

0.95

In [41]:
features_removed = ['city_mpg', 'engine_hp', 'transmission_type', 'year']
feat_dict = {}

for feat in features_removed:
    log_model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    #enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    dv = DictVectorizer(sparse=False)

    X_feat_train = X_train.drop(feat, axis=1)
    X_feat_enc_train = dv.fit_transform(X_feat_train.to_dict(orient='records'))

    log_model.fit(X_feat_enc_train, y_train)

    X_feat_val = X_val.drop(feat, axis=1)
    X_feat_enc_val = dv.transform(X_feat_val.to_dict(orient='records'))

    y_feat_pred = log_model.predict(X_feat_enc_val)

    feat_score_delta = abs(accuracy_score(y_val, y_feat_pred) - all_features_score)

    print(f"feat dropped: {feat}, abs. diff. to total features scores: {feat_score_delta}")
    
    feat_dict[feat] = feat_score_delta

feat dropped: city_mpg, abs. diff. to total features scores: 0.01756189676877884
feat dropped: engine_hp, abs. diff. to total features scores: 0.02217792698279475
feat dropped: transmission_type, abs. diff. to total features scores: 0.004972723457826178
feat dropped: year, abs. diff. to total features scores: 0.0020352496852705793


In [43]:
print(f"The feature with the smallest difference to overall accuracy: {min(feat_dict, key = lambda x: feat_dict[x])}")

The feature with the smallest difference to overall accuracy: year


### Question 6

- For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We'll need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

In [53]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [46]:
data = df_raw[['Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders', 'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP']].copy()

data.columns = data.columns.str.replace(' ','_').str.lower()
data = data.rename({'msrp': 'price'}, axis=1)
data = data.fillna(0)

In [49]:
y = np.log1p(data['price'].values)
X = data.drop('price', axis=1)

In [50]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42)

In [56]:
alpha_list = [0, 0.01, 0.1, 1, 10]
alpha_dict = {}

for alpha in alpha_list:
    
    model = Ridge(solver='sag', alpha=alpha, max_iter=1_000)
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = StandardScaler()
    #dv = DictVectorizer(sparse=False)

    X_train_enc = enc.fit_transform(X_train)
    X_train_enc_scaled = scaler.fit_transform(X_train_enc)
    model.fit(X_train_enc_scaled, y_train)

    X_val_enc = enc.transform(X_val)
    X_val_enc_scaled = scaler.transform(X_val_enc)
    y_pred = model.predict(X_val_enc_scaled)

    rmse = round(mean_squared_error(y_val, y_pred, squared=False),3)

    print(f"alpha value: {alpha}, rmse {rmse}")
    alpha_dict[alpha] = rmse 
    





alpha value: 0, rmse 0.142


In [54]:
alpha_dict

{0: 0.117, 0.01: 0.117, 0.1: 0.117, 1: 0.131, 10: 0.197}

In [55]:
print(f"The alpha value that generates the lowest value is: {min(alpha_dict, key = lambda x: alpha_dict[x])}") 

The alpha value that generates the lowest value is: 0
