In [207]:
import pandas as pd
import numpy as np

### Dataset

In [208]:
!pwsh -Command "Invoke-WebRequest -Uri https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv -OutFile data.csv"

### Features

In [209]:
df_raw = pd.read_csv('data.csv')

In [210]:
df_raw.head().T

Unnamed: 0,0,1,2,3,4
Make,BMW,BMW,BMW,BMW,BMW
Model,1 Series M,1 Series,1 Series,1 Series,1 Series
Year,2011,2011,2011,2011,2011
Engine Fuel Type,premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required),premium unleaded (required)
Engine HP,335.0,300.0,300.0,230.0,230.0
Engine Cylinders,6.0,6.0,6.0,6.0,6.0
Transmission Type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
Driven_Wheels,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive,rear wheel drive
Number of Doors,2.0,2.0,2.0,2.0,2.0
Market Category,"Factory Tuner,Luxury,High-Performance","Luxury,Performance","Luxury,High-Performance","Luxury,Performance",Luxury


In [211]:
columns = pd.Series(['Make','Model','Year','Engine HP','Engine Cylinders','Transmission Type','Vehicle Style','highway MPG','city mpg'])
columns_newnames = columns.str.lower().str.replace(' ', '_')

In [212]:
columns

0                 Make
1                Model
2                 Year
3            Engine HP
4     Engine Cylinders
5    Transmission Type
6        Vehicle Style
7          highway MPG
8             city mpg
dtype: object

In [213]:
columns_newnames

0                 make
1                model
2                 year
3            engine_hp
4     engine_cylinders
5    transmission_type
6        vehicle_style
7          highway_mpg
8             city_mpg
dtype: object

In [214]:
columns_old_new = dict(zip(columns, columns_newnames))

In [215]:
columns_old_new

{'Make': 'make',
 'Model': 'model',
 'Year': 'year',
 'Engine HP': 'engine_hp',
 'Engine Cylinders': 'engine_cylinders',
 'Transmission Type': 'transmission_type',
 'Vehicle Style': 'vehicle_style',
 'highway MPG': 'highway_mpg',
 'city mpg': 'city_mpg'}

### Data preparation

In [216]:
df_raw.rename(columns=columns_old_new, inplace=True)

In [217]:
df_raw.rename(columns={'MSRP':'price'}, inplace=True)

In [218]:
columns_newnames_list = list(columns_newnames)

In [219]:
df = df_raw[columns_newnames_list]

In [220]:
df.head().T

Unnamed: 0,0,1,2,3,4
make,BMW,BMW,BMW,BMW,BMW
model,1 Series M,1 Series,1 Series,1 Series,1 Series
year,2011,2011,2011,2011,2011
engine_hp,335.0,300.0,300.0,230.0,230.0
engine_cylinders,6.0,6.0,6.0,6.0,6.0
transmission_type,MANUAL,MANUAL,MANUAL,MANUAL,MANUAL
vehicle_style,Coupe,Convertible,Coupe,Coupe,Convertible
highway_mpg,26,28,28,28,28
city_mpg,19,19,20,18,18


In [221]:
df = df.fillna(0)
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

### Question 1

In [222]:
df.transmission_type.value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

Answer: AUTOMATIC

### Question 2

In [223]:
numerrical_columns = list(df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index)

In [224]:
numerrical_columns

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [225]:
df[numerrical_columns].corrwith(df_raw.price).abs().sort_values(ascending=False)

engine_hp           0.650095
engine_cylinders    0.526274
year                0.227590
highway_mpg         0.160043
city_mpg            0.157676
dtype: float64

Answer: engine_hp and engine_cylinders

In [226]:
df['above_average'] = (df_raw.price > df_raw.price.mean()).astype(int)

In [227]:
#df['price'] = df_raw['price']

In [228]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0


In [229]:
from sklearn.model_selection import train_test_split

In [230]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [231]:
len(df_train), len(df_val), len(df_test)

(7148, 2383, 2383)

In [232]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [233]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

In [234]:
y_train

array([0, 0, 1, ..., 0, 0, 0])

### Question 3

In [235]:
from sklearn.metrics import mutual_info_score

In [236]:
def mutual_info_price_score(series):
    return mutual_info_score(series, y_train)

In [237]:
categorical_columns = list(df.dtypes[(df.dtypes == 'object')].index)
categorical_columns

['make', 'model', 'transmission_type', 'vehicle_style']

In [238]:
mi = df_train[categorical_columns].apply(mutual_info_price_score)
round(mi.sort_values(ascending=False), 2)

model                0.46
make                 0.24
vehicle_style        0.08
transmission_type    0.02
dtype: float64

Answer: transmission_type

### Question 4

In [239]:
from sklearn.feature_extraction import DictVectorizer

In [240]:
numerrical_columns

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [241]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns + numerrical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerrical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [242]:
X_val

array([[2.300e+01, 4.000e+00, 2.100e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.700e+01, 6.000e+00, 3.540e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [2.200e+01, 4.000e+00, 1.400e+02, ..., 1.000e+00, 0.000e+00,
        2.005e+03],
       ...,
       [1.200e+01, 6.000e+00, 1.900e+02, ..., 0.000e+00, 0.000e+00,
        2.003e+03],
       [1.400e+01, 8.000e+00, 4.300e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.800e+01, 6.000e+00, 3.210e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [243]:
from sklearn.linear_model import LogisticRegression

In [244]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [245]:
model.fit(X_train, y_train)

In [246]:
model.intercept_[0]

-0.3922024649900234

In [247]:
model.coef_[0].round(3)

array([ 7.800e-02, -1.050e-01,  3.700e-02,  0.000e+00,  1.406e+00,
        1.874e+00,  6.250e-01,  2.961e+00,  2.566e+00,  1.850e-01,
        0.000e+00, -4.830e-01,  2.483e+00, -1.519e+00, -1.395e+00,
       -3.865e+00, -5.390e-01,  3.960e-01, -1.962e+00, -1.152e+00,
        6.660e-01, -2.690e-01, -1.336e+00, -2.694e+00,  2.960e-01,
       -1.527e+00,  1.100e-02,  1.992e+00,  1.363e+00,  1.196e+00,
        4.093e+00,  1.104e+00,  5.000e-03, -1.680e+00,  0.000e+00,
        9.520e-01, -2.087e+00, -1.076e+00, -1.496e+00, -3.900e-01,
       -3.283e+00,  2.007e+00,  1.034e+00,  8.970e-01, -2.470e-01,
        3.530e-01, -2.802e+00, -1.980e+00,  3.336e+00, -7.150e-01,
       -8.230e-01,  1.126e+00, -1.424e+00, -4.680e-01, -5.100e-02,
       -1.000e-02, -2.000e-03, -1.310e+00, -9.780e-01, -3.000e-03,
       -5.000e-03, -3.000e-03, -1.930e-01,  9.100e-02,  1.821e+00,
        4.940e-01, -4.990e-01, -6.700e-02, -1.790e-01, -1.870e-01,
       -0.000e+00, -1.800e-02, -1.007e+00,  1.640e-01,  2.700e

In [248]:
y_pred = model.predict(X_val)

In [249]:
y_pred

array([0, 1, 0, ..., 0, 1, 1])

In [250]:
y_val

array([0, 1, 0, ..., 0, 1, 1])

In [252]:
from sklearn.metrics import accuracy_score
round(accuracy_score(y_val, y_pred), 2)

0.95

Answer: 0.95

### Question 5

In [253]:
categorical_columns

['make', 'model', 'transmission_type', 'vehicle_style']

In [254]:
numerrical_columns

['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg']

In [270]:
columns_for_q5 = ['year', 'engine_hp', 'transmission_type', 'city_mpg']

In [271]:
def train_and_get_accuracy(columns_to_use):
    dv = DictVectorizer(sparse=False)

    train_dict_q5 = df_train[columns_to_use].to_dict(orient='records')
    X_train_q5 = dv.fit_transform(train_dict_q5)

    val_dict_q5 = df_val[columns_to_use].to_dict(orient='records')
    X_val_q5 = dv.transform(val_dict_q5)

    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train_q5, y_train)

    y_pred_q5 = model.predict(X_val_q5)

    acc = accuracy_score(y_val, y_pred_q5)

    return acc

    

In [272]:
acc_all_features = train_and_get_accuracy(columns_for_q5)
acc_all_features

0.8850188837599664

In [273]:
columns_for_q5 = ['engine_hp', 'transmission_type', 'city_mpg']
acc_wo_year = train_and_get_accuracy(columns_for_q5)
acc_wo_year

0.8854385228703315

In [274]:
columns_for_q5 = ['year', 'transmission_type', 'city_mpg']
acc_wo_engine_hp = train_and_get_accuracy(columns_for_q5)
acc_wo_engine_hp

0.7444397817876626

In [275]:
columns_for_q5 = ['year', 'engine_hp', 'city_mpg']
acc_wo_transmission_type = train_and_get_accuracy(columns_for_q5)
acc_wo_transmission_type

0.8820814099874108

In [276]:
columns_for_q5 = ['year', 'engine_hp', 'transmission_type']
acc_wo_city_mpg = train_and_get_accuracy(columns_for_q5)
acc_wo_city_mpg

0.8766261015526647

In [278]:
acc_all_features - acc_wo_year

-0.00041963911036513313

In [279]:
acc_all_features - acc_wo_engine_hp

0.14057910197230383

In [280]:
acc_all_features - acc_wo_transmission_type

0.002937473772555599

In [281]:
acc_all_features - acc_wo_city_mpg

0.008392782207301663

Answer: year

### Question 6

In [283]:
from numpy import log1p


df['price'] = log1p(df_raw['price'])

In [284]:
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,above_average,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,1,10.739349
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,1,10.612779
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,0,10.500977
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,0,10.290483
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,0,10.448744


In [285]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [286]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [287]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']

In [288]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns + numerrical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerrical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [295]:
from sklearn.linear_model import Ridge

In [296]:
model = Ridge(solver='sag', alpha=0, random_state=42)

In [297]:
model.fit(X_train, y_train)



In [298]:
y_pred = model.predict(X_val)

In [299]:
y_pred

array([10.23163826, 11.00731088,  9.22103195, ...,  9.03680852,
       11.54600367, 10.8485679 ])

In [300]:
from sklearn.metrics import mean_squared_error

In [301]:
mean_squared_error(y_val, y_pred)

0.23696870340512774

In [304]:
def RidgeWithAlpha(alpha_val):
    model = Ridge(solver='sag', alpha=alpha_val, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred)
    return rmse

In [305]:
for a in [0, 0.01, 0.1, 1, 10]:
    print(a)
    print(RidgeWithAlpha(a))
    print()

0




0.23696870340512774

0.01




0.23696893578632247

0.1




0.23697102714938584

1




0.23699193506564448

10
0.23720044497398873



