In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [None]:
# Location of the downloaded CSV and the subset of its columns used in this notebook.
PATH = "./data.csv"
select_cols = [
    "Make",
    "Model",
    "Year",
    "Engine HP",
    "Engine Cylinders",
    "Transmission Type",
    "Vehicle Style",
    "highway MPG",
    "city mpg",
    "MSRP",
]

# usecols limits parsing to the selected columns only.
data = pd.read_csv(filepath_or_buffer=PATH, usecols=select_cols)

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print(data.shape)

In [None]:
# Preview the first ten rows of the raw selection.
data.head(10)

In [None]:
# Show the column labels as read from the CSV (mixed case, with spaces).
data.columns

In [None]:
def _to_snake(text_values):
    """Lower-case the values and replace spaces with underscores via the `.str` accessor."""
    return text_values.str.lower().str.replace(' ', '_')


# Normalise the column labels first so the columns can be addressed by snake_case names.
data.columns = _to_snake(data.columns)

# Columns stored with the object dtype are treated as the string columns.
string_columns = [name for name, kind in data.dtypes.items() if kind == 'object']

# Apply the same normalisation to the values of every string column.
for col in string_columns:
    data[col] = _to_snake(data[col])


In [None]:
# Preview the frame after normalising column names and string values.
data.head()

In [None]:
# Count missing values per column.
data.isnull().sum()

In [None]:
# Replace missing horsepower and cylinder counts with 0.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form emits a
# FutureWarning in pandas 2.x and, under Copy-on-Write, no longer updates `data`.
data["engine_hp"] = data["engine_hp"].fillna(0)
data["engine_cylinders"] = data["engine_cylinders"].fillna(0)

In [None]:
# Re-check missing values per column after filling engine_hp and engine_cylinders.
data.isnull().sum()

In [None]:
# Refer to the MSRP column as `price` from here on.
data = data.rename(columns={'msrp': 'price'})

In [None]:
# Preview the frame after renaming msrp to price.
data.head()

Question 1 ##

What is the most frequent observation (mode) for the column transmission_type?

In [None]:
# Most frequent transmission type; mode() returns a Series because ties are possible.
data['transmission_type'].mode()
     

Question 2 ##

What are the two features that have the biggest correlation in this dataset?

In [None]:
# Inspect column dtypes before casting the numeric columns.
data.dtypes

In [None]:
# Cast the numeric columns in one call: horsepower stays float, the rest become int.
# The int casts need the columns to be free of NaN, which is why engine_cylinders
# is filled before this cell.
data = data.astype({
    "engine_hp": float,
    "year": int,
    "engine_cylinders": int,
    "highway_mpg": int,
    "city_mpg": int,
})

In [None]:
# Derived ratio features appended as new columns.
# Ratios with engine_cylinders as the denominator are intentionally left out:
# that column contains zeros after the missing-value fill, so they would divide by zero.
data = data.assign(
    engine_per_year=data["engine_hp"] / data["year"],
    highway_per_city=data["highway_mpg"] / data["city_mpg"],
)
data.head()


In [None]:
# Check that the derived ratio columns did not introduce missing values.
data.isnull().sum()

In [None]:
 
#data["engine_per_cylinders"].fillna(0, inplace=True)    

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:

#INSPECTING DATASET:
# Numeric-only view of the data for the correlation analysis. drop() already
# returns a new frame, so no separate copy of `data` is needed beforehand.
data_numeric = data.drop(["make","model","transmission_type", "vehicle_style"], axis=1)
data_numeric.describe()

In [None]:

#@ INSPECTING CORRELATION:
# Pairwise correlation matrix of the numeric columns (pandas default method).
data_numeric.corr()

In [None]:
#@ INSPECTING HEATMAP:
# Draw on an explicit Axes so the title and size belong to this figure only.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data_numeric.corr(), annot=True, linewidths=.5, cmap="Blues", ax=ax)
ax.set_title('Heatmap showing correlations between numerical data')
plt.show()

In [None]:

#@ INSPECTING CORRELATION:
# Rank feature pairs by correlation. Only the strict upper triangle of the matrix
# is kept: the diagonal (each feature with itself, always 1.0) and the mirrored
# lower triangle would otherwise fill the top of the list with trivial or
# duplicated entries and hide the pair the question asks about.
corr_matrix = data_numeric.corr()
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_matrix.where(upper_triangle).unstack().dropna().sort_values(ascending=False)[:15]

In [None]:

#@ PROCESSING DATASET:
# Binary target: 1 when the price is at or above the mean price of the whole
# dataset, else 0. The raw price is then removed so it cannot be used as a feature.
mean = data['price'].mean()
is_above = data['price'] >= mean

data_class = (
    data
    .assign(above_average=np.where(is_above, 1, 0))
    .drop(columns='price')
)

In [None]:
# Two-stage split: hold out 20% for test, then 25% of the remaining 80%
# (20% of the total) for validation.
_SEED = 42
df_train_full, df_test = train_test_split(data_class, random_state=_SEED, test_size=0.2)
df_train, df_val = train_test_split(df_train_full, random_state=_SEED, test_size=0.25)

In [None]:
# Give each split a fresh 0..n-1 index; the shuffled original index is discarded.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)
     

In [None]:
# Extract the binary target of each split as a NumPy array.
y_train = df_train['above_average'].to_numpy()
y_val = df_val['above_average'].to_numpy()
y_test = df_test['above_average'].to_numpy()

Question 3 ##

Which of these variables has the lowest mutual information score?

In [None]:
cat = ['make','model','transmission_type','vehicle_style']

#@ DEFINING FUNCTION:
def calculate_mi(series, target=None):
    """Return the mutual information score between `series` and the target labels.

    Parameters
    ----------
    series : pandas.Series
        Categorical feature values, one per training row.
    target : array-like, optional
        Labels aligned with `series`. Defaults to `y_train`, the above_average
        labels extracted from df_train. Reading the labels from `y_train` instead
        of `df_train.above_average` keeps this cell runnable after the target
        column has been dropped from df_train further down; it must still run
        before `y_train` is rebound to the regression target.

    Returns
    -------
    float
        The value returned by sklearn's mutual_info_score.
    """
    if target is None:
        target = y_train
    return mutual_info_score(series, target)

#@ IMPLEMENTATION:
# Score every categorical column and list them from most to least informative.
df_mi = df_train[cat].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Question 4 ##

What is the accuracy of the logistic regression model on the validation set, rounded to 2 decimal places?



In [None]:

#@ PREPARING THE DATASET:
# Remove the target column from every split so it cannot be used as a feature.
df_train = df_train.drop(columns='above_average')
df_val = df_val.drop(columns='above_average')
df_test = df_test.drop(columns='above_average')

In [None]:
# List the column dtypes of the full frame to separate categorical from numeric features.
data.dtypes

In [None]:

#@ TRANSFORMING THE DATASET:
# Numeric feature names; `cat` holds the categorical ones.
num = [
    "year",
    "engine_hp",
    "engine_cylinders",
    "highway_mpg",
    "city_mpg",
    "engine_per_year",
    "highway_per_city",
]
# One dict per training row, restricted to the selected features.
train_dict = df_train[cat + num].to_dict(orient='records')

#@ VECTORIZING THE DATASET:
# Learn the feature mapping on the training rows and encode them as a dense matrix.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)




In [None]:

#@ PREPARING THE VALIDATION MATRIX:
# Encode the validation rows with the vectorizer fitted on the training rows.
val_dict = df_val[cat + num].to_dict(orient='records')
X_val = dv.transform(val_dict)

#@ TRAINING LOGISTIC REGRESSION MODEL:
model = LogisticRegression(solver="liblinear", C=10, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

#@ INITIALIZING THE MODEL PREDICTION:
y_pred = model.predict(X_val)

#@ INSPECTING THE ACCURACY:
# Validation accuracy rounded to two decimals.
accuracy = np.round(accuracy_score(y_val, y_pred), decimals=2)
print(accuracy)




Question 5: Feature selection - the smallest difference in accuracy


In [None]:
# Full feature list: categorical names followed by numeric names.
features = [*cat, *num]
features
     

In [None]:
def _fit_and_score(columns, C=10):
    """Train a logistic regression on `columns` and return its validation accuracy.

    Fits on df_train/y_train and scores on df_val/y_val. The vectorizer and model
    are local to the call, so the notebook-level `dv`, `model`, `X_train` and
    `X_val` are left untouched.

    Parameters
    ----------
    columns : list of str
        Feature columns to use.
    C : float, default 10
        Inverse regularisation strength; the default matches the model trained
        on all features for Question 4.

    Returns
    -------
    float
        Unrounded accuracy on the validation split.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(df_train[columns].to_dict(orient='records'))
    X_eval = vectorizer.transform(df_val[columns].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    clf.fit(X_fit, y_train)

    return accuracy_score(y_val, clf.predict(X_eval))


# Baseline with every feature, trained with the same hyperparameters as the
# per-feature runs and kept unrounded. Comparing against a rounded accuracy from a
# model with a different C would mix rounding and regularisation effects into the
# differences printed below.
orig_score = _fit_and_score(features)

for c in features:
    # Drop one feature at a time and measure how much accuracy changes.
    subset = features.copy()
    subset.remove(c)

    score = _fit_and_score(subset)
    print(c, orig_score - score, score)

Question 6: Regression with Scikit-Learn. What's the best alpha?

In [None]:
#NORMALIZING THE DATA:
# Replace price with log(1 + price).
# NOTE: this reads and overwrites the same column, so running the cell twice
# applies the transform twice.
data = data.assign(price=np.log1p(data['price']))

In [None]:
# Two-stage split of the full frame: hold out 20% for test, then 25% of the
# remaining 80% (20% of the total) for validation.
_SEED = 42
df_train_full, df_test = train_test_split(data, random_state=_SEED, test_size=0.2)
df_train, df_val = train_test_split(df_train_full, random_state=_SEED, test_size=0.25)

In [None]:
# Give each split a fresh 0..n-1 index; the shuffled original index is discarded.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [None]:
# Regression targets: the price column of each split as a NumPy array.
y_train = df_train['price'].to_numpy()
y_val = df_val['price'].to_numpy()
y_test = df_test['price'].to_numpy()
     

In [None]:
# Remove the target from the feature frames now that it has been extracted.
df_train = df_train.drop(columns='price')
df_val = df_val.drop(columns='price')
df_test = df_test.drop(columns='price')
     

In [None]:
# One dict per training row, restricted to the categorical and numeric features.
train_dict = df_train.loc[:, cat + num].to_dict(orient='records')

In [None]:
# Fit the vectorizer on the training rows and encode them in one step.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

# Encode the validation rows with the mapping learned from the training rows.
val_dict = df_val[cat + num].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Try each regularisation strength and print the validation RMSE rounded to 4 decimals.
for a in (0, 0.01, 0.1, 1, 10):
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # RMSE on the validation targets (the log1p-transformed price).
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    print(a, round(score, 4))