# Selecting a Model

In [14]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import pickle
from itertools import combinations

## Data Cleaning

In [15]:
df = pd.read_csv('../data/cleaned_car_price_prediction.csv')
df['HasTurbo'] = df['HasTurbo'].astype(int)
df.columns = [re.sub(' ', '_', col) for col in df.columns]
df.columns = [re.sub('\.', '', col) for col in df.columns]
x_cols = df.columns[1:]
df = df.sample(n=1000, random_state=42)
df.head()

Unnamed: 0,Price,Manufacturer,Prod_year,Category,Leather_interior,Fuel_type,Engine_Volume,HasTurbo,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags
736,27284,CHEVROLET,2014,Hatchback,No,Plug-in Hybrid,1.4,0,65000,4.0,Automatic,Front,04-May,Left wheel,Silver,10
8674,10349,MERCEDES-BENZ,1997,Microbus,Yes,Diesel,2.9,1,3333,6.0,Manual,Rear,02-Mar,Left wheel,Red,2
5971,40769,MERCEDES-BENZ,1996,Sedan,No,Petrol,1.8,0,212485,8.0,Manual,Rear,04-May,Left wheel,Green,2
1957,38737,HYUNDAI,2014,Jeep,Yes,Diesel,2.0,0,132756,4.0,Automatic,Front,04-May,Left wheel,Grey,4
11075,42102,SSANGYONG,2017,Jeep,Yes,Petrol,1.6,0,50750,4.0,Automatic,Front,04-May,Left wheel,White,4


In [16]:
formula = "Price ~ " #initiate the formula
for i, predictor in enumerate(x_cols):
    if i == 0:
        if df.dtypes[predictor] == 'object': 
            formula += f'C({predictor})'
        else:
            formula += predictor
    else:
        if df.dtypes[predictor] == 'object':
            formula += f' + C({predictor})'
        else:
            formula += f' + {predictor}'
model = smf.ols(formula, data = df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.379
Model:                            OLS   Adj. R-squared:                  0.323
Method:                 Least Squares   F-statistic:                     6.731
Date:                Tue, 08 Oct 2024   Prob (F-statistic):           1.27e-52
Time:                        17:18:00   Log-Likelihood:                -11004.
No. Observations:                1000   AIC:                         2.218e+04
Df Residuals:                     916   BIC:                         2.259e+04
Df Model:                          83                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 736 to 4073
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             1000 non-null   int64  
 1   Manufacturer      1000 non-null   object 
 2   Prod_year         1000 non-null   int64  
 3   Category          1000 non-null   object 
 4   Leather_interior  1000 non-null   object 
 5   Fuel_type         1000 non-null   object 
 6   Engine_Volume     1000 non-null   float64
 7   HasTurbo          1000 non-null   int64  
 8   Mileage           1000 non-null   int64  
 9   Cylinders         1000 non-null   float64
 10  Gear_box_type     1000 non-null   object 
 11  Drive_wheels      1000 non-null   object 
 12  Doors             1000 non-null   object 
 13  Wheel             1000 non-null   object 
 14  Color             1000 non-null   object 
 15  Airbags           1000 non-null   int64  
dtypes: float64(2), int64(5), object(9)
memory usa

## Funtion Creation

In [18]:
def all_subsets(lst):
    subsets = []
    # Iterate over all possible lengths of the subset
    for r in range(len(lst) + 1):
        # Generate all combinations of length r
        for combo in combinations(lst, r):
            subsets.append(list(combo))
    return subsets

In [19]:
subsets = all_subsets(x_cols) #list of all predictors
subsets = subsets[1:]
len(subsets)

32767

In [20]:
n = 100
sections = []
for i in range(n):
    k = len(subsets) // n 
    if i == n - 1:
        sections += [subsets[i*k:]]
    else: 
        sections += [subsets[i*k:i*k+k]]
sum([len(section) for section in sections])
subsets[327:327+10] == sections[1][:10]

True

In [21]:
subsets[327:327+10] == sections[1][:10]

True

In [22]:
def calculate_metrics(model, X, y):
    n = len(y)
    k = model.df_model  # Number of predictors, excluding intercept
    
    # AIC
    aic = model.aic
    
    # BIC
    bic = model.bic
    
    # PRESS (Prediction Sum of Squares)
    hat_matrix = X @ np.array(np.linalg.inv(X.T @ X) @ X.T)
    residuals = model.resid
    press = np.sum((residuals)**2 / (1 - np.diag(hat_matrix)))
        
    # Adjusted R-squared
    r2 = model.rsquared
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k)
    
    # Mean Squared Error MSE
    residuals = model.resid
    mse = (residuals ** 2).mean()
    
    return aic, bic, press, adj_r2, int(k), mse #dont consider intercept as a predictor

In [23]:
def cross_validate(X, y, k=5):
    np.random.seed(42)  # For reproducibility
    shuffled_indices = np.random.permutation(len(X))
    fold_sizes = len(X) // k
    scores = []
    for i in range(k):
        val_start = i * fold_sizes
        val_end = val_start + fold_sizes
        val_indices = shuffled_indices[val_start:val_end]
        train_indices = np.concatenate([shuffled_indices[:val_start], shuffled_indices[val_end:]])
        # print(val_indices, train_indices)

        X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
        X_val, y_val = X.iloc[val_indices], y.iloc[val_indices]
        
        #calculate mse if you train the model with the training set
        model = sm.OLS(y_train, X_train).fit()
        predictions = model.predict(X_val)
        mse = ((predictions - y_val.squeeze()) ** 2).mean()
        scores.append(mse)

    return sum(scores) / len(scores)

In [24]:
n = 100
sections = []
for i in range(n):
    k = len(subsets) // n 
    if i == n - 1:
        sections += [subsets[i*k:]]
    else: 
        sections += [subsets[i*k:i*k+k]]
len(sections[0])

def results_to_pkl(subsets, index):     
    results = []
    for j, predictors in enumerate(subsets):
        # if j == 7: break
        # print(predictors)
        formula = "Price ~ " #initiate the formula
        for i, predictor in enumerate(predictors):
            if i == 0:
                if df.dtypes[predictor] == 'object':
                    formula += f'C({predictor})'
                else:
                    formula += predictor
            else:
                if df.dtypes[predictor] == 'object':
                    formula += f' + C({predictor})'
                else:
                    formula += f' + {predictor}'
                    
        #train the model            
        model = smf.ols(formula, data = df).fit()
        
        y = df['Price']
        X = df[predictors].copy()
        cat_cols = []
        for predictor in predictors: 
            if X.dtypes[predictor] == 'object':
                cat_cols.append(predictor)
        if len(cat_cols) > 0:
            X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
            for col in X.columns:
                X[col] = X[col].astype(float)
        X = sm.add_constant(X, has_constant='add')
                
        aic, bic, press, adj_r2, num_predictors, mse = calculate_metrics(model, X = X, y = y)
        mse_cv_5 = cross_validate(X, y, k = 5)
        mse_cv_10 = cross_validate(X, y, k = 10)
        # mse_cv_100 = cross_validate(X, y, k = 100)
        
        results.append({
            'Predictors': predictors,
            'n_Predictors': num_predictors,
            'Adjusted R^2': adj_r2,
            'AIC': aic,
            'BIC': bic,
            'PRESS': press,
            'MSE': mse,
            '5-Fold_CV MSE': mse_cv_5,
            '10-Fold_CV MSE': mse_cv_10,
            # '100-Fold_CV MSE': mse_cv_100        
        })
            
    # Convert results to pd DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='n_Predictors').reset_index(drop=True)
    with open(f'models/results_df_{index}', 'wb') as f:
        pickle.dump(results_df, f)

In [25]:
results_to_pkl(sections[0], 0)

In [26]:
with open(f'models/results_df_0', 'rb') as f:
    data = pickle.load(f)
# with open(f'models/results_df', 'rb') as f:
#     data = pickle.load(f)
data

Unnamed: 0,Predictors,n_Predictors,Adjusted R^2,AIC,BIC,PRESS,MSE,5-Fold_CV MSE,10-Fold_CV MSE
0,[Wheel],1,0.017739,22469.976207,22479.791718,3.348607e+11,3.344838e+08,3.352270e+08,3.352906e+08
1,[Prod_year],1,0.081436,22402.931063,22412.746573,3.141844e+11,3.127936e+08,3.149773e+08,3.151173e+08
2,[Leather_interior],1,0.022549,22465.068075,22474.883586,3.333617e+11,3.328462e+08,3.342666e+08,3.338816e+08
3,[Engine_Volume],1,0.006237,22481.618229,22491.433740,3.394439e+11,3.384007e+08,3.415937e+08,3.407610e+08
4,[HasTurbo],1,0.036649,22450.536954,22460.352464,3.288783e+11,3.280445e+08,3.296144e+08,3.296077e+08
...,...,...,...,...,...,...,...,...,...
322,"[Manufacturer, Drive_wheels, Color]",56,0.108355,22426.559176,22706.301227,inf,2.869109e+08,3.090503e+08,3.097062e+08
323,"[Manufacturer, Doors, Color]",56,0.106646,22428.473580,22708.215631,inf,2.874606e+08,3.075365e+08,3.085067e+08
324,"[Manufacturer, Gear_box_type, Color]",57,0.139731,22391.675698,22676.325505,inf,2.765214e+08,2.964827e+08,2.983080e+08
325,"[Manufacturer, Fuel_type, Color]",59,0.134899,22399.154159,22693.619475,inf,2.774849e+08,2.983675e+08,2.995818e+08


In [None]:
for i in range(n):
    results_to_pkl(sections[i], i)

In [None]:
dataframes = []
for i in range(n):
    with open(f'models/results_df_{i}', 'rb') as f:
        data = pickle.load(f)
    #concat all the data into one dataframe
    dataframes.append(data)
results_df = pd.concat(dataframes, ignore_index=True)
# with open(f'models/full_results_df', 'wb') as f:
#         pickle.dump(results_df, f)
results_df

In [35]:
# results = []
# for j, predictors in enumerate(subsets):
#     # if j == 7: break
#     # print(predictors)
#     formula = "Price ~ " #initiate the formula
#     for i, predictor in enumerate(predictors):
#         if i == 0:
#             if df.dtypes[predictor] == 'object':
#                 formula += f'C({predictor})'
#             else:
#                 formula += predictor
#         else:
#             if df.dtypes[predictor] == 'object':
#                 formula += f' + C({predictor})'
#             else:
#                 formula += f' + {predictor}'
                
#     #train the model            
#     model = smf.ols(formula, data = df).fit()
    
#     y = df['Price']
#     X = df[predictors].copy()
#     cat_cols = []
#     for predictor in predictors: 
#         if X.dtypes[predictor] == 'object':
#             cat_cols.append(predictor)
#     if len(cat_cols) > 0:
#         X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
#         for col in X.columns:
#             X[col] = X[col].astype(float)
#     X = sm.add_constant(X, has_constant='add')
            
#     aic, bic, press, adj_r2, num_predictors, mse = calculate_metrics(model, X = X, y = y)
#     mse_cv_5 = cross_validate(X, y, k = 5)
#     mse_cv_10 = cross_validate(X, y, k = 10)
#     mse_cv_100 = cross_validate(X, y, k = 100)
    
#     results.append({
#         'Predictors': predictors,
#         'n_Predictors': num_predictors,
#         'Adjusted R^2': adj_r2,
#         'AIC': aic,
#         'BIC': bic,
#         'PRESS': press,
#         'MSE': mse,
#         '5-Fold_CV MSE': mse_cv_5,
#         '10-Fold_CV MSE': mse_cv_10,
#         '100-Fold_CV MSE': mse_cv_100        
#     })
        
# # Convert results to pd DataFrame
# results_df = pd.DataFrame(results)
# results_df = results_df.sort_values(by='n_Predictors').reset_index(drop=True)
# with open(f'models/results_df', 'wb') as f:
#     pickle.dump(results_df, f)

In [None]:
with open(f'models/results_df_1', 'rb') as f:
    results_df = pickle.load(f)
results_df

# Analysis

In [None]:
print('Lowest MSE')
results_df.loc[results_df['MSE'].idxmin()]

In [None]:
print('Lowest CV MSE')
results_df.loc[results_df['5-Fold_CV MSE'].idxmin()]

In [None]:
print('Lowest AIC')
results_df.loc[results_df['AIC'].idxmin()]

In [None]:
print('Lowest BIC')
results_df.loc[results_df['BIC'].idxmin()]

In [None]:
print('Highest Adjusted R^2	')
results_df.loc[results_df['Adjusted R^2'].idxmax()]