# Machine Learning Classification

Classification is a fundamental task in supervised machine learning where the goal is to predict the categorical class or label of a given data point based on its features. In other words, it involves assigning a predefined category to each input instance based on its characteristics.

In [None]:
# download the data
!wget wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv

In [99]:
# import all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import accuracy_score, classification_report, mutual_info_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer


## Prepare the data and Exploratory Data Analysis (EDA):

In [110]:
features = [
    'Make', 'Model','Year','Engine HP','Engine Cylinders','Transmission Type',
    'Vehicle Style','highway MPG','city mpg','MSRP'
]

df = pd.read_csv('data.csv', iterator=False, usecols=features)
df.head()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle Style,highway MPG,city mpg,MSRP
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Data Preparation

In [111]:
# transform the column names to lower case and replace spaces with underscores
df.columns = df.columns.str.replace(' ', '_').str.lower()

# fill the missing values with 0
df.fillna(0, inplace=True)

# rename msrp to price
df.rename(columns={'msrp': 'price'}, inplace=True)

df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500


### Question 1 - What is the most frequent observation (mode) for the column transmission_type?

In [112]:
# get the most frequent observation for the transmission type column
transmission_mode = df['transmission_type'].mode()[0]

print(f'Most frequent observation (mode) for transmission_type: {transmission_mode}')

Most frequent observation (mode) for transmission_type: AUTOMATIC


In [113]:
def find_max_correlation(correlation_matrix):
    """
    Find the maximum correlation between features in a correlation matrix.
    Return the names of the two features and the correlation value.
    """
    # Exclude diagonal elements (self-correlation)
    np.fill_diagonal(correlation_matrix.values, np.nan)
    
    # Find the indices of the maximum correlation
    max_corr_index = np.nanargmax(correlation_matrix.values)
    max_corr_row, max_corr_col = np.unravel_index(max_corr_index, correlation_matrix.shape)
    
    # Identify the features with the highest correlation
    feature1 = correlation_matrix.columns[max_corr_row]
    feature2 = correlation_matrix.columns[max_corr_col]
    max_corr_value = correlation_matrix.iloc[max_corr_row, max_corr_col]
    
    return feature1, feature2, max_corr_value


### Question 2 - What are the two features that have the biggest correlation in this dataset?

In [114]:
# get the numeric features by reading the data types of the columns 
numeric_features = df.select_dtypes(include=np.number).columns.tolist()

print(f'Numeric features: {numeric_features}')

# Create the correlation matrix for the numerical features of your dataset. 
# In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset
corr_matrix = df[numeric_features].corr()
print(f'Correlation Matrix: \n{corr_matrix}\n')

feature1, feature2, max_corr_value = find_max_correlation(corr_matrix)
print(f'Highest Correlation - Feature 1: {feature1} Feature 2: {feature2} ')
print(f'Highest Correlation Value: {max_corr_value:.2f}')


Numeric features: ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'price']
Correlation Matrix: 
                      year  engine_hp  engine_cylinders  highway_mpg  \
year              1.000000   0.338714         -0.040708     0.258240   
engine_hp         0.338714   1.000000          0.774851    -0.415707   
engine_cylinders -0.040708   0.774851          1.000000    -0.614541   
highway_mpg       0.258240  -0.415707         -0.614541     1.000000   
city_mpg          0.198171  -0.424918         -0.587306     0.886829   
price             0.227590   0.650095          0.526274    -0.160043   

                  city_mpg     price  
year              0.198171  0.227590  
engine_hp        -0.424918  0.650095  
engine_cylinders -0.587306  0.526274  
highway_mpg       0.886829 -0.160043  
city_mpg          1.000000 -0.157676  
price            -0.157676  1.000000  

Highest Correlation - Feature 1: highway_mpg Feature 2: city_mpg 
Highest Correlation Value: 0.89


# Make price binary

In [115]:
# make the price column binary getting the price median value and then adding a new column above_average = 1 when price > median and 0 otherwise
price_median = df['price'].median()
df['above_average'] = (df['price'] > price_median).astype(int)
df.head()

Unnamed: 0,make,model,year,engine_hp,engine_cylinders,transmission_type,vehicle_style,highway_mpg,city_mpg,price,above_average
0,BMW,1 Series M,2011,335.0,6.0,MANUAL,Coupe,26,19,46135,1
1,BMW,1 Series,2011,300.0,6.0,MANUAL,Convertible,28,19,40650,1
2,BMW,1 Series,2011,300.0,6.0,MANUAL,Coupe,28,20,36350,1
3,BMW,1 Series,2011,230.0,6.0,MANUAL,Coupe,28,18,29450,0
4,BMW,1 Series,2011,230.0,6.0,MANUAL,Convertible,28,18,34500,1


# Split the data
- Split your data in train/val/test sets with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value (price) is not in your dataframe.

In [117]:
# split the data in train/val/test sets, with 60%/20%/20% distribution with seed 42
# .2 splits the data into 80% train and 20% test
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
#.25 splits the 80% train into 60% train and 20% val
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# reset the indexes of the dataframes
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# separate the target variable from the train/val/test sets
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

# delete the price column from the train/val/test sets
del df_train['price']
del df_val['price']
del df_test['price']

print('train data length: ',len(df_train),'price values length: ', len(y_train))


train data length:  7148 price values length:  7148


### Question 3 - Which of these variables has the lowest mutual information score?

- Calculate the mutual information score between above_average and other categorical variables in our dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

In [118]:
# using the training set only, calculate the mutual information score between above_average and other categorical variables
# use the training set only
categorical_features = ['make', 'model', 'transmission_type', 'vehicle_style']
df_train_categorical = df_train[categorical_features]

# calculate the mutual information score between above_average and other categorical variables
def mutual_info_price_score(series):
    return mutual_info_score(series, df_train['above_average'])

mi = df_train_categorical.apply(mutual_info_price_score)
mi.sort_values(ascending=False).round(2)

# A higher MI score indicates a stronger relationship between a categorical feature and the target variable, 
# making it potentially more informative for predicting the target.
print(f'Mutual Information Score: \n{mi}\n')

# print the lowest MI score and the feature name
lowest_mi_score = mi.idxmin()
print(f'Lowest Mutual Information Score: {lowest_mi_score} {mi[lowest_mi_score]:.2f}\n')


Mutual Information Score: 
make                 0.191186
model                0.536538
transmission_type    0.054405
vehicle_style        0.091207
dtype: float64

Lowest Mutual Information Score: transmission_type 0.05



### Question 4 - Calculate the accuracy on the validation dataset

Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [151]:
def calc_accuracy (df_train, df_val, features, y_train, y_val) -> float:

    # Convert each category into a binary vector (a series of 0s and 1s). 
    # Each category becomes a new column with a 1 or 0 indicating the presence of that category.
    encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')  
        
    # one-hot encode the categorical variables    
    X_train = encoder.fit_transform(df_train[features].values, True)    

    # train a logistic regression model use (solver='liblinear', C=10, max_iter=1000, random_state=42)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, df_train['above_average'].values)
          
    # process the validation set in the same way you processed the training set     
    X_val = encoder.transform(df_val[features].values)
        
    # calculate the accuracy on the validation dataset and round it to 2 decimal digits
    y_pred = model.predict(X_val)
    # y_pred = model.predict_proba(X_val)[:, 1]
    
    # Calculate accuracy on validation set
    accuracy = accuracy_score(df_val['above_average'].values,  y_pred)
    rounded_accuracy = round(accuracy,6 )
    print(f'Accuracy: {rounded_accuracy} with features: {features}')

    return rounded_accuracy

In [126]:
# train a logistic regression model with the one-hot encoded categorical variables
# use the training set only
# use the same encoder instance to avoid data leakage
def calculate_feature_accuracy (df_train, df_val, cat_features, num_features, y_train, y_val) -> float:

    # Convert each category into a binary vector (a series of 0s and 1s). 
    # Each category becomes a new column with a 1 or 0 indicating the presence of that category.
    encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')  
    scaler = StandardScaler()

    # scale and encode the train and validation sets    
    X_train_num = scaler.fit_transform(df_train[num_features].values)         
    X_val_num = scaler.transform(df_val[num_features].values) 

    # one-hot encode the categorical variables    
    X_train_cat_encoded = encoder.fit_transform(df_train[cat_features].values, True)
    X_train = np.column_stack([X_train_num, X_train_cat_encoded])

    # train a logistic regression model use (solver='liblinear', C=10, max_iter=1000, random_state=42)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    # process the validation set in the same way you processed the training set     
    X_val_cat_encoded = encoder.transform(df_val[cat_features].values)
    X_val = np.column_stack([X_val_num, X_val_cat_encoded])
        
    # calculate the accuracy on the validation dataset and round it to 2 decimal digits
    y_pred = model.predict(X_val)
    # y_pred = model.predict_proba(X_val)[:, 1]
    
    # Calculate accuracy on validation set
    accuracy = accuracy_score(y_val,  y_pred)
    rounded_accuracy = round(accuracy,3 )
    print(f'Accuracy: {rounded_accuracy} with features: {num_features} {cat_features}')

    return rounded_accuracy


In [152]:
# feature name provides all the features except the target variable
feature_names = df_train.columns.tolist()
feature_names.remove('above_average')

numeric_features_names = numeric_features.copy()
numeric_features_names.remove('price')  
y_train_target = df_train['above_average'].values
y_val_target = df_val['above_average'].values

# accuracy_all_features = calculate_feature_accuracy(df_train, df_val, feature_names, numeric_features_names, y_train_target, y_val_target)
accuracy_all_features = calc_accuracy(df_train, df_val, feature_names, y_train, y_val)

Accuracy: 0.937054 with features: ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']


### Question 5 - Which of following feature has the smallest difference?

Let's find the least useful feature using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

In [154]:
# track the accuracy everytime you remove a feature from the dataset
track_accuracy = {}

# foreach feature in the categorical_features list, train a model without that feature and calculate the accuracy on the validation dataset
for feature in feature_names:

    all_features = feature_names.copy()
    all_features.remove(feature)

    # accuracy_without_feature = calculate_feature_accuracy(df_train, df_val, cat_features, num_features, y_train_target, y_val_target)
    accuracy_without_feature = calc_accuracy(df_train, df_val, all_features, y_train, y_val)
    
    # calculate the difference between accuracy_all_features and accuracy_without_feature    
    accuracy_difference = accuracy_all_features - accuracy_without_feature
    track_accuracy[feature] = accuracy_difference

# select the smallest difference 
feature_with_smallest_difference = min(track_accuracy, key=track_accuracy.get)
print(f'Feature with smallest difference: {feature_with_smallest_difference} {track_accuracy[feature_with_smallest_difference]}\n')


Accuracy: 0.933277 with features: ['model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081 with features: ['make', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081 with features: ['make', 'model', 'engine_hp', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.929081 with features: ['make', 'model', 'year', 'engine_cylinders', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.934536 with features: ['make', 'model', 'year', 'engine_hp', 'transmission_type', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.934956 with features: ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'vehicle_style', 'highway_mpg', 'city_mpg']
Accuracy: 0.933277 with features: ['make', 'model', 'year', 'engine_hp', 'engine_cylinders', 'transmission_type', 'highway_mpg', 'c

In [158]:
# feature scaling by processing the numerical columns and the categorical columns separately
def scale_num_features(df, features, scaler, fit_model= True) -> np.ndarray:

    """
    Scale the numerical columns using StandardScaler
    """
    
    # scale the numerical columns
    # scaler = StandardScaler()
    df_numerical = df[features]
    X_train_num = None

    if fit_model:
        X_train_num = scaler.fit_transform(df_numerical.values) 
    else:
        X_train_num = scaler.transform(df_numerical.values) 
    
    scaler.fit_transform(df_numerical.values)

    return X_train_num
    
def scale_cat_features(df, features, encoder, fit_model= True) -> np.ndarray:

    """
    Scale the categorical columns using OneHotEncoder 
    """
    
    # one-hot encode the categorical variables
    # encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')  
    df_categorical = df[features]
    X_train_cat = None
    
    if fit_model:
        X_train_cat = encoder.fit_transform(df_categorical.values)
    else:
        X_train_cat = encoder.transform(df_categorical.values)
    
    return X_train_cat

def scale_encode_features(df, num_features, cat_features, scaler, encoder, fit_model= True) -> np.ndarray:

    """
    Scale the numerical and categorical columns
    """
    
    # scale the numerical columns
    X_train_num = scale_num_features(df, num_features, scaler, fit_model)
        
    # one-hot encode the categorical variables    
    X_train_cat = scale_cat_features(df, cat_features,encoder, fit_model)
    
    # concatenate the numerical and categorical columns
    X_train = np.column_stack([X_train_num, X_train_cat])
    
    return X_train

### Question 6 - Which of these alphas leads to the best RMSE on the validation set?

For this question, we'll see how to use a linear regression model from Scikit-Learn.
- We need to use the original column price. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data with a solver 'sag'. Set the seed to 42.
- This model also has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10].
- Round your RMSE scores to 3 decimal digits.

In [161]:
# apply the logarithmic transformation to the price variable which has original price (y_train)
y_log_train = np.log1p(y_train)
average_msrp = y_log_train.mean()
print(f'Average MSRP: {average_msrp:.2f}\n')

# List of alpha values to try
alpha_values = [0]
# , 0.01, 0.1, 1, 10]

# Train Ridge regression models with different alpha values
# A higher alpha leads to stronger regularization, which can help prevent overfitting but may make the model too biased.
rmse_scores = []

# instantiate the scaler and encoder
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False,  handle_unknown='ignore')

# standardize the numeric and categorical features for the train set
X_train_std = scale_encode_features(df_train, numeric_features_names, categorical_features, scaler, encoder, True)

# standardize the numeric and categorical features for the validation set
y_val_std = scale_encode_features(df_val, numeric_features_names, categorical_features, scaler, encoder, False)

# define the ridge model    
ridge_model = Ridge(alpha=alpha, solver='sag', random_state=42, max_iter=5000)
for alpha in alpha_values:
            
    # fit the training data to the model    
    ridge_model.fit(X_train_std, y_log_train)

    # Predict on the validation set
    # y_log_val = np.log1p(y_val_std)

    # show nan values
    # print(np.isnan(y_log_val).any())
    y_log_pred = ridge_model.predict(y_val_std)

    # convert prediction back to original scale
    y_pred = np.expm1(y_log_pred)
    print(f"Example y_pred for alpha={alpha}: {y_log_pred[:5]}")  # Display first 5 predictions
    
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    print(f"RMSE for alpha={alpha}: {rmse}")
    print(f"Example y_pred for alpha={alpha}: {y_pred[:5]}")  # Display first 5 predictions

    y_log_test = np.log1p(y_test)
    rmse = mean_squared_error(y_log_test, y_log_pred, squared=False)
    print(f"RMSE for alpha={alpha}: {rmse}")
    

    # Append the RMSE to the list
    rmse_scores.append((alpha, rmse))

print(f'RMSE Scores: \n{rmse_scores}\n')

Average MSRP: 10.13

RMSE for alpha=0: 81134.1537097014
Example y_pred for alpha=0: [24730.91610139 48620.65328883 21802.31510211 41870.97448071
  1953.20717633]
RMSE Scores: 
[(0, 81134.1537097014)]

