In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [39]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv")

select only the columns needed for HW

In [40]:
df = df[["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"]]

transofrm column names to lower case

In [41]:
df.columns = df.columns.str.replace(' ', '_').str.lower()

fill in the missing values of the selected features with 0.

In [42]:
df = df.fillna(0)

rename MSRP variable to price

In [43]:
df = df.rename(columns={'msrp': 'price'})

In [44]:
df.shape

(11914, 10)

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               11914 non-null  object 
 1   model              11914 non-null  object 
 2   year               11914 non-null  int64  
 3   engine_hp          11914 non-null  float64
 4   engine_cylinders   11914 non-null  float64
 5   transmission_type  11914 non-null  object 
 6   vehicle_style      11914 non-null  object 
 7   highway_mpg        11914 non-null  int64  
 8   city_mpg           11914 non-null  int64  
 9   price              11914 non-null  int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 930.9+ KB


In [46]:
df.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
dtype: int64

In [47]:
df

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


## Question 1 - Answer: AUTOMATIC

In [48]:
df["transmission_type"].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

## Question 2 - Answer: highway_mpg and city_mpg

In [50]:
df.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg,price
year,1.0,0.338714,-0.040708,0.25824,0.198171,0.22759
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306,0.526274
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0,-0.157676
price,0.22759,0.650095,0.526274,-0.160043,-0.157676,1.0


## Question 3 - Answer: 

In [58]:
df["price"].describe()

count    1.191400e+04
mean     4.059474e+04
std      6.010910e+04
min      2.000000e+03
25%      2.100000e+04
50%      2.999500e+04
75%      4.223125e+04
max      2.065902e+06
Name: price, dtype: float64

In [59]:
df["price"].mean()

40594.737032063116

Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [64]:
df["above_average"] = (df["price"] > df["price"].mean())
df["above_average"] 

0         True
1         True
2        False
3        False
4        False
         ...  
11909     True
11910     True
11911     True
11912     True
11913    False
Name: above_average, Length: 11914, dtype: bool

Split the data

Step 1: 80% for 'full_train' and 20% for test

In [77]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
len(df_full_train), len(df_test)

(9531, 2383)

Step 2: 60% for train 20% for test 20% for validation

In [76]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_test), len(df_val)

(7148, 2383, 2383)

Make sure that the target value (above_average) is not in your dataframe.

In [80]:
y_train = df_train["above_average"].values
y_val = df_val["above_average"].values
y_test = df_test["above_average"].values
##y_full_train = df_full_train["median_house_value"].values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']
##del df_full_train["median_house_value"].values

## Question 3 - Answer: transmission_type

In [81]:
df_full_train.isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
price                0
above_average        0
dtype: int64

In [82]:
df_full_train.above_average.value_counts(normalize=True)

False    0.723219
True     0.276781
Name: above_average, dtype: float64

In [84]:
churn_rate = df_full_train.above_average.mean()
churn_rate

0.2767810303221068

In [85]:
df_full_train.dtypes

make                  object
model                 object
year                   int64
engine_hp            float64
engine_cylinders     float64
transmission_type     object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
price                  int64
above_average           bool
dtype: object

In [86]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'city_mpg', 'highway_mpg', ]
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]

In [87]:
df_full_train[categorical].nunique()

make                  48
model                902
transmission_type      5
vehicle_style         16
dtype: int64

In [88]:
from IPython.display import display

In [96]:
global_average = df_full_train["above_average"].mean()

In [97]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c)["above_average"].agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_average
    df_group['risk'] = df_group['mean'] / global_average
    display(df_group)
    print()
    print()

make


Unnamed: 0_level_0,mean,count,diff,risk
make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acura,0.391089,202,0.114308,1.412991
Alfa Romeo,1.0,5,0.723219,3.612964
Aston Martin,1.0,74,0.723219,3.612964
Audi,0.654412,272,0.377631,2.364366
BMW,0.822064,281,0.545283,2.970088
Bentley,1.0,55,0.723219,3.612964
Bugatti,1.0,3,0.723219,3.612964
Buick,0.123377,154,-0.153404,0.445755
Cadillac,0.881988,322,0.605207,3.18659
Chevrolet,0.181313,899,-0.095468,0.655076




model


Unnamed: 0_level_0,mean,count,diff,risk
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1 Series,0.416667,12,0.139886,1.505402
100,0.000000,11,-0.276781,0.000000
124 Spider,0.000000,2,-0.276781,0.000000
190-Class,0.000000,4,-0.276781,0.000000
2,0.000000,10,-0.276781,0.000000
...,...,...,...,...
iQ,0.000000,3,-0.276781,0.000000
tC,0.000000,13,-0.276781,0.000000
xA,0.000000,5,-0.276781,0.000000
xB,0.000000,7,-0.276781,0.000000




transmission_type


Unnamed: 0_level_0,mean,count,diff,risk
transmission_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUTOMATED_MANUAL,0.474206,504,0.197425,1.713291
AUTOMATIC,0.310017,6619,0.033236,1.120079
DIRECT_DRIVE,0.458333,48,0.181552,1.655942
MANUAL,0.138652,2344,-0.138129,0.500944
UNKNOWN,0.0,16,-0.276781,0.0




vehicle_style


Unnamed: 0_level_0,mean,count,diff,risk
vehicle_style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2dr Hatchback,0.0,421,-0.276781,0.0
2dr SUV,0.035398,113,-0.241383,0.127893
4dr Hatchback,0.046099,564,-0.230682,0.166555
4dr SUV,0.374619,1970,0.097838,1.353486
Cargo Minivan,0.0,60,-0.276781,0.0
Cargo Van,0.0,73,-0.276781,0.0
Convertible,0.55538,632,0.278599,2.006567
Convertible SUV,0.153846,26,-0.122935,0.555841
Coupe,0.496257,935,0.219476,1.792958
Crew Cab Pickup,0.337017,543,0.060236,1.217629






In [92]:
from sklearn.metrics import mutual_info_score

In [93]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.above_average)

In [94]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

model                0.460994
make                 0.238724
vehicle_style        0.083390
transmission_type    0.020884
dtype: float64

## Question 4 - Answer: 0.95


There are several categorical variables in the dataset, here we include them using one-hot encoding.

In [99]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [101]:
from sklearn.linear_model import LogisticRegression

In [102]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [103]:
model.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=1000, random_state=42, solver='liblinear')

In [106]:
model.predict(X_train)

array([False, False,  True, ..., False, False, False])

In [108]:
y_pred = model.predict_proba(X_val)
y_pred.round(3)

array([[0.999, 0.001],
       [0.004, 0.996],
       [1.   , 0.   ],
       ...,
       [1.   , 0.   ],
       [0.01 , 0.99 ],
       [0.013, 0.987]])

In [110]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred.round(3)

array([0.001, 0.996, 0.   , ..., 0.   , 0.99 , 0.987])

In [111]:
hard_decision = (y_pred >= 0.5)
hard_decision

array([False,  True, False, ..., False,  True,  True])

In [112]:
df_val[hard_decision].head(3)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
9951,Audi,SQ5,2015,354.0,6.0,AUTOMATIC,4dr SUV,24,17,60200
509,BMW,5 Series Gran Turismo,2016,300.0,6.0,AUTOMATIC,4dr Hatchback,26,18,63200
2710,Mercedes-Benz,CLK-Class,2007,382.0,8.0,AUTOMATIC,Convertible,22,15,62900


In [113]:
y_val

array([False,  True, False, ..., False,  True,  True])

In [115]:
hard_decision

array([False,  True, False, ..., False,  True,  True])

In [117]:
y_val == hard_decision

array([ True,  True,  True, ...,  True,  True,  True])

In [129]:
(y_val == hard_decision).mean().round(2)

0.95

## Question 5 - Answer: year

In [127]:
from sklearn.metrics import accuracy_score


In [121]:

# INITIALIZING FEATURES:
features = categorical + numerical
features
     

['make',
 'model',
 'transmission_type',
 'vehicle_style',
 'year',
 'engine_hp',
 'engine_cylinders',
 'city_mpg',
 'highway_mpg']

In [128]:
# INSPECTING THE DIFFERENCE IN ACCURACY:
orig_score = 0.9471254720939991

for c in features:
    subset = features.copy()
    subset.remove(c)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(c, orig_score - score, score)

make 0.009232060428031819 0.9378934116659673
model 0.025597985732270234 0.9215274863617289
transmission_type 0.005035669324380931 0.9420898027696182
vehicle_style 0.008812421317666796 0.9383130507763323
year -0.0008392782207302663 0.9479647503147294
engine_hp 0.026017624842635256 0.9211078472513639
engine_cylinders 0.009651699538396952 0.9374737725556022
city_mpg 0.0004196391103650221 0.9467058329836341
highway_mpg 0.005035669324380931 0.9420898027696182


In [140]:
data = {
    "Feature": ["make", "model", "transmission_type", "vehicle_style", "year", "engine_hp", "engine_cylinders", "city_mpg", "highway_mpg"],
    "Coefficient": [0.009232060428031819, 0.025597985732270234, 0.005035669324380931, 0.008812421317666796, -0.0008392782207302663, 0.026017624842635256, 0.009651699538396952, 0.0004196391103650221, 0.005035669324380931],
    "Value": [0.9378934116659673, 0.9215274863617289, 0.9420898027696182, 0.9383130507763323, 0.9479647503147294, 0.9211078472513639, 0.9374737725556022, 0.9467058329836341, 0.9420898027696182]
}

df = pd.DataFrame(data)
sorted_df = df.sort_values(by='Coefficient', ascending=False)
print(sorted_df)

             Feature  Coefficient     Value
5          engine_hp     0.026018  0.921108
1              model     0.025598  0.921527
6   engine_cylinders     0.009652  0.937474
0               make     0.009232  0.937893
3      vehicle_style     0.008812  0.938313
2  transmission_type     0.005036  0.942090
8        highway_mpg     0.005036  0.942090
7           city_mpg     0.000420  0.946706
4               year    -0.000839  0.947965


## Question 6 - Answer: 0

In [174]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [159]:
data = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv")
data = data[["Make", "Model", "Year", "Engine HP", "Engine Cylinders", "Transmission Type", "Vehicle Style", "highway MPG", "city mpg", "MSRP"]]
data.columns = data.columns.str.replace(' ', '_').str.lower()
data = data.fillna(0)
data = data.rename(columns={'msrp': 'price'})
data.head(1)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135


We'll need to use the original column price. Apply the logarithmic transformation to this column.

In [161]:

#@ NORMALIZING THE DATA:
data['price']=np.log1p(data['price'])
data.head(1)

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,2.462946


In [162]:

#@ SPLITTING THE DATASET:
df_train_full, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [163]:

#@ PREPARING THE DATASET:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [165]:

#@ PREPARING THE DATASET:
y_train = df_train["price"].values
y_val = df_val["price"].values
y_test = df_test["price"].values

In [166]:

#@ DELETING DATASET:
del df_train['price']
del df_val['price']
del df_test['price']

In [167]:
numerical = ['year', 'engine_hp', 'engine_cylinders', 'city_mpg', 'highway_mpg', ]
categorical = [
    'make',
    'model',
    'transmission_type',
    'vehicle_style',
]

RIDGE REGRESSION:

In [168]:

#@ PREPARING THE DATASET:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [170]:

#@ VECTORIZING THE DATASET:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

X_train = dv.transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
     

In [175]:

#@ RIDGE REGRESSION IMPLEMENTATION:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(a, round(score, 4))

0 0.0479
0.01 0.0479
0.1 0.0479
1 0.0479
10 0.0479
