# Machine Learning Classification

Classification is a fundamental task in supervised machine learning where the goal is to predict the categorical class or label of a given data point based on its features. In other words, it involves assigning a predefined category to each input instance based on its characteristics.

In [None]:
# download the data
!wget wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [128]:
# import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, classification_report, mutual_info_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## Prepare the data and Exploratory Data Analysis (EDA):

In [74]:
features = [
    'Make', 'Model','Year','Engine HP','Engine Cylinders','Transmission Type',
    'Vehicle Style','highway MPG','city mpg','MSRP'
]

df = pd.read_csv('data.csv', iterator=False, usecols=features)
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Data Preparation

In [75]:
# transform the column names to lower case and replace spaces with underscores
df.columns = df.columns.str.replace(' ', '_').str.lower()

# fill the missing values with 0
df.fillna(0, inplace=True)

# rename msrp to price
df.rename(columns={'msrp': 'price'}, inplace=True)

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Question 1 - What is the most frequent observation (mode) for the column transmission_type?

In [76]:
# get the most frequent observation for the transmission type column
transmission_mode = df['transmission_type'].mode()[0]

print(f'Most frequent observation (mode) for transmission_type: {transmission_mode}')

Most frequent observation (mode) for transmission_type: AUTOMATIC


In [77]:
def find_max_correlation(correlation_matrix):
    """
    Find the maximum correlation between features in a correlation matrix.
    Return the names of the two features and the correlation value.
    """
    # Exclude diagonal elements (self-correlation)
    np.fill_diagonal(correlation_matrix.values, np.nan)
    
    # Find the indices of the maximum correlation
    max_corr_index = np.nanargmax(correlation_matrix.values)
    max_corr_row, max_corr_col = np.unravel_index(max_corr_index, correlation_matrix.shape)
    
    # Identify the features with the highest correlation
    feature1 = correlation_matrix.columns[max_corr_row]
    feature2 = correlation_matrix.columns[max_corr_col]
    max_corr_value = correlation_matrix.iloc[max_corr_row, max_corr_col]
    
    return feature1, feature2, max_corr_value


### Question 2 - What are the two features that have the biggest correlation in this dataset?

In [78]:
# get the numeric features by reading the data types of the columns and exclude year
numeric_features = df.select_dtypes(include=np.number).columns.tolist()
print(f'Numeric features: {numeric_features}')

# Create the correlation matrix for the numerical features of your dataset. 
# In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset
corr_matrix = df[numeric_features].corr()
print(f'Correlation Matrix: \n{corr_matrix}\n')

feature1, feature2, max_corr_value = find_max_correlation(corr_matrix)
print(f'Highest Correlation - Feature 1: {feature1} Feature 2: {feature2} ')
print(f'Highest Correlation Value: {max_corr_value:.2f}')


Numeric features: ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']
Correlation Matrix: 
                      year  engine_hp  engine_cylinders  highway_mpg  \
year              1.000000   0.338714         -0.040708     0.258240   
engine_hp         0.338714   1.000000          0.774851    -0.415707   
engine_cylinders -0.040708   0.774851          1.000000    -0.614541   
highway_mpg       0.258240  -0.415707         -0.614541     1.000000   
city_mpg          0.198171  -0.424918         -0.587306     0.886829   
price             0.227590   0.650095          0.526274    -0.160043   

                  city_mpg     price  
year              0.198171  0.227590  
engine_hp        -0.424918  0.650095  
engine_cylinders -0.587306  0.526274  
highway_mpg       0.886829 -0.160043  
city_mpg          1.000000 -0.157676  
price            -0.157676  1.000000  

Highest Correlation - Feature 1: highway_mpg Feature 2: city_mpg 
Highest Correlation Value: 0.89


# Make price binary

In [79]:
# make the price column binary getting the price median value and then adding a new column above_average = 1 when price > median and 0 otherwise
price_median = df['price'].median()
df['above_average'] = (df['price'] > price_median).astype(int)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,1
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,1


# Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (price) is not in your dataframe.

In [126]:
# split the data in train/val/test sets, with 60%/20%/20% distribution with seed 42
# .2 splits the data into 80% train and 20% test
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
#.25 splits the 80% train into 60% train and 20% val
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# reset the indexes of the dataframes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# separate the target variable from the train/val/test sets
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

# delete the price column from the train/val/test sets
del df_train['price']
del df_val['price']
del df_test['price']

print('train data length: ',len(df_train),'price values length: ', len(y_train))


train data length:  7148 price values length:  7148


### Question 3 - Which of these variables has the lowest mutual information score?

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

In [81]:
# using the training set only, calculate the mutual information score between above_average and other categorical variables
# use the training set only
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
df_train_categorical = df_train[categorical_features]

# calculate the mutual information score between above_average and other categorical variables
def mutual_info_price_score(series):
    return mutual_info_score(series, df_train.above_average)

mi = df_train_categorical.apply(mutual_info_price_score)
mi.sort_values(ascending=False).round(2)

# A higher MI score indicates a stronger relationship between a categorical feature and the target variable, 
# making it potentially more informative for predicting the target.
print(f'Mutual Information Score: \n{mi}\n')

# print the lowest MI score and the feature name
lowest_mi_score = mi.idxmin()
print(f'Lowest Mutual Information Score: {lowest_mi_score} {mi[lowest_mi_score]:.2f}\n')


Mutual Information Score: 
make                 0.191186
model                0.536538
transmission_type    0.054405
vehicle_style        0.091207
dtype: float64

Lowest Mutual Information Score: transmission_type 0.05



### Question 4 - Calculate the accuracy on the validation dataset

Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [118]:
# train a logistic regression model with the one-hot encoded categorical variables
# use the training set only
# use the same encoder instance to avoid data leakage
def calculate_feature_accuracy (df_train, df_val, features, target_variable) -> float:

    # Convert each category into a binary vector (a series of 0s and 1s). 
    # Each category becomes a new column with a 1 or 0 indicating the presence of that category.
    encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')  

    # one-hot encode the categorical variables
    df_train_categorical = df_train[features]    
    df_train_categorical_encoded = encoder.fit_transform(df_train_categorical, True)
    # print(df_train_categorical.head(2))
    # train a logistic regression model with the one-hot encoded use (solver='liblinear', C=10, max_iter=1000, random_state=42)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(df_train_categorical_encoded, df_train[target_variable])

    # process the validation set in the same way you processed the training set
    df_val_categorical = df_val[features]
    df_val_categorical_encoded = encoder.transform(df_val_categorical)
    # print(df_val_categorical.head(2))
    # calculate the accuracy on the validation dataset and round it to 2 decimal digits
    y_val_pred = model.predict(df_val_categorical_encoded)

    # Calculate accuracy on validation set
    accuracy = accuracy_score(df_val[target_variable], y_val_pred)
    rounded_accuracy = round(accuracy,6 )
    print(f'Accuracy: {rounded_accuracy}\n {features}')

    return rounded_accuracy

result = calculate_feature_accuracy(df_train, df_val, categorical_features, 'above_average')


Accuracy: 0.911456
 ['make', 'model', 'transmission_type', 'vehicle_style']


### Question 5 - Which of following feature has the smallest difference?

Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [112]:
# track the accuracy everytime you remove a feature from the dataset
track_accuracy = {}
feature_names = df_train.columns.tolist()

# delete the above_average column from the feature_names list
feature_names.remove('above_average')

accuracy_all_features = calculate_feature_accuracy(df_train, df_val, feature_names, 'above_average')

# foreach feature in the categorical_features list, train a model without that feature and calculate the accuracy on the validation dataset
for feature in feature_names:
    all_features = feature_names.copy()
    all_features.remove(feature)
    # print(f'Feature: {feature}')
    accuracy_without_feature = calculate_feature_accuracy(df_train, df_val, all_features, 'above_average')
    # print(f'Accuracy without feature: {accuracy_without_feature}\n')

    # calculate the difference between accuracy_all_features and accuracy_without_feature    
    accuracy_difference = accuracy_all_features - accuracy_without_feature
    track_accuracy[feature] = accuracy_difference

# select the smallest difference 
feature_with_smallest_difference = min(track_accuracy, key=track_accuracy.get)
print(f'Feature with smallest difference: {feature_with_smallest_difference} {track_accuracy[feature_with_smallest_difference]}\n')

print(f'Track Accuracy: \n{track_accuracy}\n')

Accuracy: 0.937054
 ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.933277
 ['model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081
 ['make', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081
 ['make', 'model', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081
 ['make', 'model', 'year', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.934536
 ['make', 'model', 'year', 'engine_hp', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.934956
 ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.933277
 ['make', 'model', 'year', 'engine_hp', 'engine_cyli

### Question 6 - Which of these alphas leads to the best RMSE on the validation set?

For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

In [132]:
# apply the logarithmic transformation to the price variable which has original price (y_train)
y_log_train = np.log1p(y_train)
average_msrp = y_train.mean()
print(f'Average MSRP: {average_msrp:.2f}\n')

print(len(y_train), len(df_train))

# List of alpha values to try
alpha_values = [0, 0.01, 0.1, 1, 10]

# Train Ridge regression models with different alpha values
# A higher alpha leads to stronger regularization, which can help prevent overfitting but may make the model too biased.
rmse_scores = []
for alpha in alpha_values:
    ridge_model = Ridge(alpha=alpha, solver='sag', random_state=42)
    
    # one-hot encode the variables for both the training and validation sets
    encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')          
    df_train_encoded = encoder.fit_transform(df_train[categorical_features], True)    
    df_val_encoded = encoder.transform(df_val[categorical_features])

    # fit the training data to the model    
    ridge_model.fit(df_train_encoded, y_log_train)

    # Predict on the validation set
    y_val_pred = ridge_model.predict(df_val_encoded)

    # Calculate the mean squared error
    mse = mean_squared_error(y_test, y_val_pred)

    # Calculate the RMSE
    rmse = np.sqrt(mse)

    print(f"RMSE for alpha={alpha}: {rmse}")
    print(f"Example y_pred for alpha={alpha}: {y_val_pred[:5]}")  # Display first 5 predictions

    # Append the RMSE to the list
    rmse_scores.append((alpha, rmse))

print(f'RMSE Scores: \n{rmse_scores}\n')

Average MSRP: 40606.94

7148 7148
RMSE for alpha=0: 62953.934337618724
Example y_pred for alpha=0: [10.14282379 10.89830291  9.99009626 10.5399875   7.60678884]
RMSE for alpha=0.01: 62953.93440734904
Example y_pred for alpha=0.01: [10.14295946 10.89772153  9.9926303  10.54171836  7.60758037]
RMSE for alpha=0.1: 62953.934603996095
Example y_pred for alpha=0.1: [10.14354825 10.89504623 10.00094738 10.54395348  7.6209582 ]
RMSE for alpha=1: 62953.93467240232
Example y_pred for alpha=1: [10.14853211 10.89078162 10.02109745 10.52933801  7.67176187]
RMSE for alpha=10: 62953.931151423734
Example y_pred for alpha=10: [10.18230442 10.85792458 10.05989312 10.41136683  7.904774  ]
RMSE Scores: 
[(0, 62953.934337618724), (0.01, 62953.93440734904), (0.1, 62953.934603996095), (1, 62953.93467240232), (10, 62953.931151423734)]



In [130]:

# Sample data for vehicles with same year, make, model
num_vehicles = 100  # Number of vehicles
average_msrp = 45000  # Average MSRP

# Generate random features (e.g., horsepower, features, etc.) for the vehicles
np.random.seed(42)
horsepower = np.random.randint(150, 400, num_vehicles)
features = np.random.randint(4, 10, num_vehicles)

# Assuming all vehicles have the same year, make, and model
year = 2022
make = 'Toyota'
model = 'Camry'

# Create a DataFrame with the features
# For simplicity, we'll just consider horsepower and features as features
# You can add more features as needed for your specific case
data = {
    'year': year,
    'make': make,
    'model': model,
    'horsepower': horsepower,
    'features': features
}
df = pd.DataFrame(data)

# Generate target variable (MSRP) with some random noise
np.random.seed(42)
df['msrp'] = np.random.normal(average_msrp, 2000, num_vehicles)  # Adding noise

# Features and target
X = df[['horsepower', 'features']]
y = df['msrp']

# One-hot encode the categorical features (year, make, model)
categorical_features = ['year', 'make', 'model']
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(df[categorical_features])

# Concatenate the one-hot encoded features with the numeric features
X = np.hstack((X, X_encoded.toarray()))

# Alpha values for Ridge regression
alpha_values = [0, 0.01, 0.1, 1, 10]

# Train Ridge models for each alpha and calculate RMSE
best_rmse = float('inf')
best_alpha = None

for alpha in alpha_values:
    # Train Ridge model
    model = Ridge(alpha=alpha, random_state=42)
    model.fit(X, y)
    
    # Predict MSRP
    y_pred = model.predict(X)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    
    # Update best RMSE and alpha if needed
    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha
    
    print(f"RMSE for alpha={alpha}: {rmse}")
    print(f"Example y_pred for alpha={alpha}: {y_pred[:5]}")  # Display first 5 predictions

print(f"\nBest RMSE: {best_rmse} (for alpha={best_alpha})")

RMSE for alpha=0: 1799.9209406153482
Example y_pred for alpha=0: [44642.91297957 45090.43270507 44780.86895271 44798.52256515
 44961.28458256]
RMSE for alpha=0.01: 1799.9209406214413
Example y_pred for alpha=0.01: [44642.91749102 45090.42485015 44780.86796957 44798.51705287
 44961.27776408]
RMSE for alpha=0.1: 1799.9209412242742
Example y_pred for alpha=0.1: [44642.95807943 45090.35418149 44780.85912452 44798.46746027
 44961.21641998]
RMSE for alpha=1: 1799.9210011131509
Example y_pred for alpha=1: [44643.36251379 45089.65001868 44780.77099027 44797.97330665
 44960.60517033]
RMSE for alpha=10: 1799.9266158196995
Example y_pred for alpha=10: [44647.26695104 45082.85195901 44779.92016312 44793.20282158
 44954.70415882]

Best RMSE: 1799.9209406153482 (for alpha=0)
