In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [22]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'

In [23]:
!wget $data

--2023-09-29 15:51:18--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1475504 (1.4M) [text/plain]
Saving to: ‘data.csv.1’


2023-09-29 15:51:18 (3.68 MB/s) - ‘data.csv.1’ saved [1475504/1475504]



In [24]:
features = ["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"]

In [25]:
df = pd.read_csv('data.csv', usecols = features)
df.head(3)

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350


### Data preparation

In [26]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
df.head(3)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,msrp
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350


In [27]:
df.isnull().sum()

make                  0
model                 0
year                  0
engine_hp            69
engine_cylinders     30
transmission_type     0
vehicle_style         0
highway_mpg           0
city_mpg              0
msrp                  0
dtype: int64

In [28]:
df['engine_hp'] = df['engine_hp'].fillna(0)
df['engine_cylinders'] = df['engine_cylinders'].fillna(0)

In [29]:
df['engine_hp'] = df['engine_hp'].fillna(0)

In [34]:
df = df.rename(columns={"msrp" : "price"})
df.columns

Index(['make', 'model', 'year', 'engine_hp', 'engine_cylinders',
       'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg',
       'price'],
      dtype='object')

### Question 1
What is the most frequent observation (mode) for the column transmission_type?

In [37]:
df['transmission_type'].mode()

0    AUTOMATIC
Name: transmission_type, dtype: object

In [40]:
df['transmission_type'].value_counts()

transmission_type
AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: count, dtype: int64

### Question 2
Create the correlation matrix, What are the two features that have the biggest correlation in this dataset?
-engine_hp and year
-engine_hp and engine_cylinders
-highway_mpg and engine_cylinders
-highway_mpg and city_mpg

In [42]:
df_numeric = df.copy()
df_numeric = df.drop(["make", "model", "transmission_type", "vehicle_style", "price"], axis=1)
df_numeric.describe()     

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
count,11914.0,11914.0,11914.0,11914.0,11914.0
mean,2010.384338,247.941749,5.614655,26.637485,19.733255
std,7.57974,110.507669,1.800554,8.863001,8.987798
min,1990.0,0.0,0.0,12.0,7.0
25%,2007.0,170.0,4.0,22.0,16.0
50%,2015.0,225.0,6.0,26.0,18.0
75%,2016.0,300.0,6.0,30.0,22.0
max,2017.0,1001.0,16.0,354.0,137.0


In [46]:
df_numeric.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [48]:
# Make price binary
df_class = df.copy()
mean = df_class['price'].mean()

df_class['above_average'] = np.where(df_class['price']>=mean,1,0)

In [49]:
df_class = df_class.drop('price', axis=1)

In [53]:
# Split the dataset
df_train_full, df_test = train_test_split(df_class, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [54]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [55]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

### Question 3
Calculate the mutual information

In [58]:
categorical = ["make", "model", "transmission_type", "vehicle_style"]

def calculate_mi(series):
    return mutual_info_score(series, df_train.above_average)


df_mi = df_train[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
model,0.462344
make,0.239769
vehicle_style,0.084143
transmission_type,0.020958


In [59]:
df_train = df_train.drop('above_average', axis=1)
df_val = df_val.drop('above_average', axis=1)
df_test = df_test.drop('above_average', axis=1)

### Question 4
Logistic regression

In [63]:
numeric = ["engine_hp", "engine_cylinders", "transmission_type", "vehicle_style", "highway_mpg", "city_mpg"]
train_dict = df_train[categorical + numeric].to_dict(orient='records')

#@ VECTORIZING THE DATASET:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)

  train_dict = df_train[categorical + numeric].to_dict(orient='records')


In [69]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

val_dict = df_val[categorical + numeric].to_dict(orient='records')
X_val = dv.transform(val_dict)
y_pred = model.predict(X_val)

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(accuracy)

0.95


  val_dict = df_val[categorical + numeric].to_dict(orient='records')


### Question 5
feature elimination technique

In [73]:
features = categorical + numeric
features

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [74]:
orig_score = accuracy

for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)

  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')


make 0.005812001678556444 0.9441879983214435
model 0.02721359630717579 0.9227864036928242


  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')


transmission_type 0.0016156105749055572 0.9483843894250944
vehicle_style 0.0016156105749055572 0.9483843894250944


  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')


engine_hp 0.021758287872429616 0.9282417121275703
engine_cylinders 0.004553084347461156 0.9454469156525388


  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')


transmission_type 0.0016156105749055572 0.9483843894250944
vehicle_style 0.0016156105749055572 0.9483843894250944


  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')
  val_dict = df_val[subset].to_dict(orient='records')
  train_dict = df_train[subset].to_dict(orient='records')


highway_mpg 0.0016156105749055572 0.9483843894250944
city_mpg 0.001195971464540424 0.9488040285354595


  val_dict = df_val[subset].to_dict(orient='records')


### Question 6
Linear regression model from Scikit-Learn

In [75]:
train_dict = df_train[categorical + numeric].to_dict(orient='records')

  train_dict = df_train[categorical + numeric].to_dict(orient='records')


In [76]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[categorical + numeric].to_dict(orient='records')
X_val = dv.transform(val_dict)

  val_dict = df_val[categorical + numeric].to_dict(orient='records')


In [82]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 3))



0 0.299




0.01 0.299




0.1 0.299




1 0.299
10 0.299


