# ML Zoomcamp 2023, Homework 3 (classification)


In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Read the car-price CSV from the mlbookcamp-code GitHub repository and preview it.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [3]:
# Restrict the frame to the columns used in the rest of the analysis.
KEPT_COLUMNS = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle Style', 'highway MPG', 'city mpg', 'MSRP',
]
df = df[KEPT_COLUMNS]

In [4]:
# Normalise the schema without mutating the selected frame in place
# (in-place fillna/rename on a column subset can emit SettingWithCopyWarning
# and makes the cell depend on hidden state):
#   1. column names -> lower-case, spaces replaced by underscores,
#   2. missing values -> 0,
#   3. 'msrp' -> 'price'.
df = (
    df.rename(columns=lambda name: name.replace(' ', '_').lower())
      .fillna(0)
      .rename(columns={'msrp': 'price'})
)


In [5]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


In [6]:
# Summary of the column: count, number of unique values, most frequent value ("top") and its frequency.
df.transmission_type.describe()

count         11914
unique            5
top       AUTOMATIC
freq           8266
Name: transmission_type, dtype: object

AUTOMATIC is the most frequent value of transmission_type. 

In [7]:
# Pairwise correlation matrix of the numeric columns.
# The frame still holds string columns (make, model, transmission_type,
# vehicle_style); pandas >= 2.0 no longer drops them silently inside
# DataFrame.corr() and raises instead, so select the numeric columns explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


highway_mpg and city_mpg are the most strongly correlated pair of numeric features in this dataset (correlation ≈ 0.89).

In [8]:
# Binary target: 1 when the price is strictly above the mean price, else 0.
# A vectorized comparison replaces the row-by-row apply(lambda ...).
mean_price = df['price'].mean()
df['above_average'] = (df['price'] > mean_price).astype(int)

In [9]:
# 60/20/20 train/validation/test split, done in two steps with a fixed seed.
SEED = 42
y = df['above_average'].values
X = df.drop(columns=['above_average', 'price'])

# Step 1: hold out 40% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SEED
)
# Step 2: split the held-out 40% in half -> validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=SEED
)

In [10]:
# Mutual information between each categorical feature and the binary target,
# computed on the training split.
categorical = ['transmission_type', 'vehicle_style', 'make', 'model']
for feature in categorical:
    mi = mutual_info_score(X_train[feature], y_train)
    print("MI between above_average and", feature, round(mi, 2))

MI between above_average and transmission_type 0.02
MI between above_average and vehicle_style 0.08
MI between above_average and make 0.24
MI between above_average and model 0.46


Hence, transmission_type has the lowest mutual information score with above_average (0.02).

In [11]:
# Encode the feature records with DictVectorizer.
# The vectorizer is fitted on the training records only and then reused,
# unchanged, to transform the validation records.
dv = DictVectorizer(sparse=False)
train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

In [12]:
# Fit a logistic regression classifier on the encoded training matrix.
log_reg_params = dict(solver='liblinear', C=10, max_iter=1000, random_state=SEED)
model = LogisticRegression(**log_reg_params)
model.fit(X_train1, y_train)

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [13]:
# Validation accuracy of the fitted classifier, rounded to two decimals.
round(accuracy_score(y_val, model.predict(X_val1)), 2)

0.95

The accuracy score of this model is 0.95. 

In [14]:
# Feature elimination: retrain without `year` and report validation accuracy.
X_train_wo_year = X_train.drop(columns=['year'])
X_val_wo_year = X_val.drop(columns=['year'])

# Re-fit the vectorizer on the reduced training records.
dv = DictVectorizer(sparse=False)
train_dict = X_train_wo_year.to_dict(orient='records')
val_dict = X_val_wo_year.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=SEED)
model.fit(X_train1, y_train)
round(accuracy_score(y_val, model.predict(X_val1)), 2)

0.94

In [15]:
# Feature elimination: retrain without `engine_hp` and report validation accuracy.
X_train_wo_eh = X_train.drop(columns=['engine_hp'])
X_val_wo_eh = X_val.drop(columns=['engine_hp'])

# Re-fit the vectorizer on the reduced training records.
dv = DictVectorizer(sparse=False)
train_dict = X_train_wo_eh.to_dict(orient='records')
val_dict = X_val_wo_eh.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=SEED)
model.fit(X_train1, y_train)
round(accuracy_score(y_val, model.predict(X_val1)), 2)

0.93

In [16]:
# Feature elimination: retrain without `transmission_type` and report validation accuracy.
X_train_wo_tt = X_train.drop(columns=['transmission_type'])
X_val_wo_tt = X_val.drop(columns=['transmission_type'])

# Re-fit the vectorizer on the reduced training records.
dv = DictVectorizer(sparse=False)
train_dict = X_train_wo_tt.to_dict(orient='records')
val_dict = X_val_wo_tt.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=SEED)
model.fit(X_train1, y_train)
round(accuracy_score(y_val, model.predict(X_val1)), 2)

0.95

In [17]:
# Feature elimination: retrain without `vehicle_style` and report validation accuracy.
X_train_wo_vs = X_train.drop(columns=['vehicle_style'])
X_val_wo_vs = X_val.drop(columns=['vehicle_style'])

# Re-fit the vectorizer on the reduced training records.
dv = DictVectorizer(sparse=False)
train_dict = X_train_wo_vs.to_dict(orient='records')
val_dict = X_val_wo_vs.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=SEED)
model.fit(X_train1, y_train)
round(accuracy_score(y_val, model.predict(X_val1)), 2)

0.92

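Removing transmission_type leaves the rounded validation accuracy unchanged (0.95, the same as the full model), so it is the feature whose removal gives the smallest difference among the four tested.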

In [18]:
# Switch to a regression target: log(1 + price).
y = np.log1p(df['price'])

# Re-split with the same proportions and seed as the classification split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=SEED
)

# Encode the features again; the vectorizer is fitted on the training records only.
dv = DictVectorizer(sparse=False)
train_dict = X_train.to_dict(orient='records')
val_dict = X_val.to_dict(orient='records')
X_train1 = dv.fit_transform(train_dict)
X_val1 = dv.transform(val_dict)

In [20]:
# Ridge regression over a grid of regularisation strengths; for each alpha,
# print the mean squared error on the validation split (rounded to 5 decimals).
# NOTE(review): mean_squared_error is the squared error, not RMSE — take the
# square root if RMSE is the metric that should be reported; confirm.
for alpha in (0, 0.01, 0.1, 1, 10):
    clf = Ridge(alpha=alpha, solver='sag', random_state=SEED)
    clf.fit(X_train1, y_train)
    y_pred = clf.predict(X_val1)
    val_mse = mean_squared_error(y_val, y_pred)
    print(alpha, round(val_mse, 5))

0 0.23346
0.01 0.23346
0.1 0.23346
1 0.23351
10 0.23373
