In [67]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import numpy as np


In [68]:
data =  pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv")

In [69]:
data.columns

Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
       'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
       'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
       'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
      dtype='object')

In [70]:
data_sub = data.copy()

In [71]:
data_sub = data_sub.drop(columns = ['Engine Fuel Type','Driven_Wheels','Number of Doors', 'Market Category', 'Vehicle Size','Popularity'])

In [72]:
data_sub

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


In [73]:
data_sub.columns = data_sub.columns.str.replace(' ', '_').str.lower()

In [74]:
imputer = SimpleImputer(strategy='constant', fill_value=0)

In [75]:
data_sub = pd.DataFrame(imputer.fit_transform(data_sub), columns=data_sub.columns)

In [76]:
data_sub.rename(columns={'msrp': 'price'}, inplace=True)

In [77]:
data_sub

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500
...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,46120
11910,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,56670
11911,Acura,ZDX,2012,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50620
11912,Acura,ZDX,2013,300.0,6.0,AUTOMATIC,4dr Hatchback,23,16,50920


### Question 1
What is the most frequent observation (mode) for the column transmission_type?

A: AUTOMATIC

In [78]:
data_sub['transmission_type'].value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

### Question 2
Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

A: highway_mpg and city_mpg

In [79]:
data_sub.dtypes

make                 object
model                object
year                 object
engine_hp            object
engine_cylinders     object
transmission_type    object
vehicle_style        object
highway_mpg          object
city_mpg             object
price                object
dtype: object

In [80]:
data_sub['year'] = data_sub['year'].astype('int64')
data_sub['engine_hp'] = data_sub['engine_hp'].astype('float64')
data_sub['engine_cylinders'] = data_sub['engine_cylinders'].astype('int64')

data_sub['highway_mpg'] = data_sub['highway_mpg'].astype('float64')
data_sub['city_mpg'] = data_sub['city_mpg'].astype('float64')
data_sub['price'] = data_sub['price'].astype('int64')

In [81]:
numerical_columns = data_sub.select_dtypes(include=['int64', 'float64'])
numerical_columns = numerical_columns.drop('price', axis=1)
numerical_columns.corr()

Unnamed: 0,year,engine_hp,engine_cylinders,highway_mpg,city_mpg
year,1.0,0.338714,-0.040708,0.25824,0.198171
engine_hp,0.338714,1.0,0.774851,-0.415707,-0.424918
engine_cylinders,-0.040708,0.774851,1.0,-0.614541,-0.587306
highway_mpg,0.25824,-0.415707,-0.614541,1.0,0.886829
city_mpg,0.198171,-0.424918,-0.587306,0.886829,1.0


In [82]:
# Make price binary
# Now we need to turn the price variable from numeric into a binary format.
# Let's create a variable above_average which is 1 if the price is above its mean value and 0 otherwise.

In [83]:
mean_price = data_sub['price'].mean()
data_sub['above_average'] = np.where(data_sub['price'] > mean_price, 1, 0)

In [84]:
data_sub.iloc[:,:-2]

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg
0,BMW,1 Series M,2011,335.0,6,MANUAL,Coupe,26.0,19.0
1,BMW,1 Series,2011,300.0,6,MANUAL,Convertible,28.0,19.0
2,BMW,1 Series,2011,300.0,6,MANUAL,Coupe,28.0,20.0
3,BMW,1 Series,2011,230.0,6,MANUAL,Coupe,28.0,18.0
4,BMW,1 Series,2011,230.0,6,MANUAL,Convertible,28.0,18.0
...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0
11910,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0
11911,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0
11912,Acura,ZDX,2013,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0


In [101]:
X = data_sub.iloc[:,:-2]
y = data_sub["above_average"]
seed = 42


In [102]:

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=seed)


X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)

In [103]:
print("Training set - X_train shape:", X_train.shape, " y_train shape:", y_train.shape)
print("Validation set - X_val shape:", X_val.shape, " y_val shape:", y_val.shape)
print("Test set - X_test shape:", X_test.shape, " y_test shape:", y_test.shape)

Training set - X_train shape: (7148, 9)  y_train shape: (7148,)
Validation set - X_val shape: (2383, 9)  y_val shape: (2383,)
Test set - X_test shape: (2383, 9)  y_test shape: (2383,)


### Question 3
Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).
Which of these variables has the lowest mutual information score?



A :transmission_type



In [88]:
from sklearn.metrics import mutual_info_score


mi_scores = []

for column in X_train.columns:
    mi_score = mutual_info_score(X_train[column], y_train)
    mi_scores.append((column, mi_score))

mi_scores_df = pd.DataFrame(mi_scores, columns=['Feature', 'Mutual_Info_Score'])
mi_scores_df.sort_values(by='Mutual_Info_Score', ascending=False, inplace=True)

print(mi_scores_df)



             Feature  Mutual_Info_Score
1              model           0.463623
3          engine_hp           0.364958
0               make           0.235285
4   engine_cylinders           0.120325
6      vehicle_style           0.082958
2               year           0.074551
8           city_mpg           0.061826
7        highway_mpg           0.046678
5  transmission_type           0.021197


### Question 4
Now let's train a logistic regression.

Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

0.60

0.72

0.84

0.95

A:0.95

In [23]:
data_sub

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6,MANUAL,Coupe,26.0,19.0,46135,1
1,BMW,1 Series,2011,300.0,6,MANUAL,Convertible,28.0,19.0,40650,1
2,BMW,1 Series,2011,300.0,6,MANUAL,Coupe,28.0,20.0,36350,0
3,BMW,1 Series,2011,230.0,6,MANUAL,Coupe,28.0,18.0,29450,0
4,BMW,1 Series,2011,230.0,6,MANUAL,Convertible,28.0,18.0,34500,0
...,...,...,...,...,...,...,...,...,...,...,...
11909,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0,46120,1
11910,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0,56670,1
11911,Acura,ZDX,2012,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0,50620,1
11912,Acura,ZDX,2013,300.0,6,AUTOMATIC,4dr Hatchback,23.0,16.0,50920,1


In [115]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [109]:
categorical_cols = ['make', 'model', 'transmission_type', 'vehicle_style']

In [110]:
X_train_dict = X_train[categorical_cols].to_dict(orient='records')
X_val_dict = X_val[categorical_cols].to_dict(orient='records')

In [111]:
vectorizer = DictVectorizer(sparse=False)

In [112]:
X_train_encoded = vectorizer.fit_transform(X_train_dict)
X_val_encoded = vectorizer.transform(X_val_dict)

In [113]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
model.fit(X_train_encoded, y_train)

In [114]:
y_val_pred = model.predict(X_val_encoded)

In [116]:

accuracy = accuracy_score(y_val, y_val_pred)

rounded_accuracy = round(accuracy, 2)

print("Accuracy on the validation dataset:", rounded_accuracy)

Accuracy on the validation dataset: 0.94


## Question 5
Let's find the least useful feature using the feature elimination technique.
Train a model with all these features (using the same parameters as in Q4).
Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
Which of following feature has the smallest difference?

year

engine_hp

transmission_type

city_mpg

A: transmyssion type

## Question 6
For this question, we'll see how to use a linear regression model from Scikit-Learn.
We'll need to use the original column price. Apply the logarithmic transformation to this column.
Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
Round your RMSE scores to 3 decimal digits.
Which of these alphas leads to the best RMSE on the validation set?

0

0.01

0.1

1

10

In [125]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data_sub DataFrame

# Assuming data_sub is the DataFrame you provided

# Apply logarithmic transformation to the 'price' column
data_sub['log_price'] = np.log1p(data_sub['price'])

# Select features and target
X = data_sub[['year', 'engine_hp', 'transmission_type', 'city_mpg']]
y = data_sub['log_price']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge regression with different alpha values
alpha_values = [0, 0.01, 0.1, 1, 10]
rmse_scores = []

for alpha in alpha_values:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)

    # Predict on the validation set
    y_val_pred = model.predict(X_val)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_scores.append(round(rmse, 3))

# Print RMSE scores for each alpha
for alpha, rmse in zip(alpha_values, rmse_scores):
    print(f"Alpha = {alpha}: RMSE = {rmse}")

# Identify the best alpha with the lowest RMSE
best_alpha = alpha_values[np.argmin(rmse_scores)]
print("\nThe best alpha that leads to the lowest RMSE on the validation set:", best_alpha)


ValueError: ignored