# Selecting a Model

In [12]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import pickle
from itertools import combinations

## Data Cleaning

In [13]:
df = pd.read_csv('../data/cleaned_car_price_prediction.csv')
df['HasTurbo'] = df['HasTurbo'].astype(int)
df.columns = [re.sub(' ', '_', col) for col in df.columns]
df.columns = [re.sub('\.', '', col) for col in df.columns]
x_cols = df.columns[1:]
df = df.sample(n=2000, random_state=42)
df.head()

Unnamed: 0,Price,Manufacturer,Prod_year,Category,Leather_interior,Fuel_type,Engine_Volume,HasTurbo,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags
736,27284,CHEVROLET,2014,Hatchback,No,Plug-in Hybrid,1.4,0,65000,4.0,Automatic,Front,04-May,Left wheel,Silver,10
8674,10349,MERCEDES-BENZ,1997,Microbus,Yes,Diesel,2.9,1,3333,6.0,Manual,Rear,02-Mar,Left wheel,Red,2
5971,40769,MERCEDES-BENZ,1996,Sedan,No,Petrol,1.8,0,212485,8.0,Manual,Rear,04-May,Left wheel,Green,2
1957,38737,HYUNDAI,2014,Jeep,Yes,Diesel,2.0,0,132756,4.0,Automatic,Front,04-May,Left wheel,Grey,4
11075,42102,SSANGYONG,2017,Jeep,Yes,Petrol,1.6,0,50750,4.0,Automatic,Front,04-May,Left wheel,White,4


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 736 to 15704
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             2000 non-null   int64  
 1   Manufacturer      2000 non-null   object 
 2   Prod_year         2000 non-null   int64  
 3   Category          2000 non-null   object 
 4   Leather_interior  2000 non-null   object 
 5   Fuel_type         2000 non-null   object 
 6   Engine_Volume     2000 non-null   float64
 7   HasTurbo          2000 non-null   int64  
 8   Mileage           2000 non-null   int64  
 9   Cylinders         2000 non-null   float64
 10  Gear_box_type     2000 non-null   object 
 11  Drive_wheels      2000 non-null   object 
 12  Doors             2000 non-null   object 
 13  Wheel             2000 non-null   object 
 14  Color             2000 non-null   object 
 15  Airbags           2000 non-null   int64  
dtypes: float64(2), int64(5), object(9)
memory us

## Funtion Creation

In [15]:
def all_subsets(lst):
    subsets = []
    # Iterate over all possible lengths of the subset
    for r in range(len(lst) + 1):
        # Generate all combinations of length r
        for combo in combinations(lst, r):
            subsets.append(list(combo))
    return subsets

In [16]:
subsets = all_subsets(x_cols) #list of all predictors
subsets = subsets[1:]
len(subsets)

32767

In [17]:
n = 100
sections = []
for i in range(n):
    k = len(subsets) // n 
    if i == n - 1:
        sections += [subsets[i*k:]]
    else: 
        sections += [subsets[i*k:i*k+k]]
sum([len(section) for section in sections])
subsets[327:327+10] == sections[1][:10]

32767

In [18]:
subsets[327:327+10] == sections[1][:10]

True

In [19]:
def calculate_metrics(model, X, y):
    n = len(y)
    k = model.df_model  # Number of predictors, excluding intercept
    
    # AIC
    aic = model.aic
    
    # BIC
    bic = model.bic
    
    # PRESS (Prediction Sum of Squares)
    hat_matrix = X @ np.array(np.linalg.inv(X.T @ X) @ X.T)
    residuals = model.resid
    press = np.sum((residuals)**2 / (1 - np.diag(hat_matrix)))
        
    # Adjusted R-squared
    r2 = model.rsquared
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k)
    
    # Mean Squared Error MSE
    residuals = model.resid
    mse = (residuals ** 2).mean()
    
    return aic, bic, press, adj_r2, int(k), mse #dont consider intercept as a predictor

In [20]:
def cross_validate(X, y, k=5):
    np.random.seed(42)  # For reproducibility
    shuffled_indices = np.random.permutation(len(X))
    fold_sizes = len(X) // k
    scores = []
    for i in range(k):
        val_start = i * fold_sizes
        val_end = val_start + fold_sizes
        val_indices = shuffled_indices[val_start:val_end]
        train_indices = np.concatenate([shuffled_indices[:val_start], shuffled_indices[val_end:]])
        # print(val_indices, train_indices)

        X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
        X_val, y_val = X.iloc[val_indices], y.iloc[val_indices]
        
        #calculate mse if you train the model with the training set
        model = sm.OLS(y_train, X_train).fit()
        predictions = model.predict(X_val)
        mse = ((predictions - y_val.squeeze()) ** 2).mean()
        scores.append(mse)

    return sum(scores) / len(scores)

In [21]:
n = 100
sections = []
for i in range(n):
    k = len(subsets) // n 
    if i == n - 1:
        sections += [subsets[i*k:]]
    else: 
        sections += [subsets[i*k:i*k+k]]
len(sections[0])

def results_to_pkl(subsets, index):     
    results = []
    for j, predictors in enumerate(subsets):
        # if j == 7: break
        # print(predictors)
        formula = "Price ~ " #initiate the formula
        for i, predictor in enumerate(predictors):
            if i == 0:
                if df.dtypes[predictor] == 'object':
                    formula += f'C({predictor})'
                else:
                    formula += predictor
            else:
                if df.dtypes[predictor] == 'object':
                    formula += f' + C({predictor})'
                else:
                    formula += f' + {predictor}'
                    
        #train the model            
        model = smf.ols(formula, data = df).fit()
        
        y = df['Price']
        X = df[predictors].copy()
        cat_cols = []
        for predictor in predictors: 
            if X.dtypes[predictor] == 'object':
                cat_cols.append(predictor)
        if len(cat_cols) > 0:
            X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
            for col in X.columns:
                X[col] = X[col].astype(float)
        X = sm.add_constant(X, has_constant='add')
                
        aic, bic, press, adj_r2, num_predictors, mse = calculate_metrics(model, X = X, y = y)
        mse_cv_5 = cross_validate(X, y, k = 5)
        mse_cv_10 = cross_validate(X, y, k = 10)
        mse_cv_100 = cross_validate(X, y, k = 100)
        
        results.append({
            'Predictors': predictors,
            'n_Predictors': num_predictors,
            'Adjusted R^2': adj_r2,
            'AIC': aic,
            'BIC': bic,
            'PRESS': press,
            'MSE': mse,
            '5-Fold_CV MSE': mse_cv_5,
            '10-Fold_CV MSE': mse_cv_10,
            '100-Fold_CV MSE': mse_cv_100        
        })
            
    # Convert results to pd DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='n_Predictors').reset_index(drop=True)
    with open(f'models/results_df_{index}', 'wb') as f:
        pickle.dump(results_df, f)

In [22]:
results_to_pkl(sections[0], 0)

In [24]:
with open(f'models/results_df_6', 'rb') as f:
    data = pickle.load(f)
# with open(f'models/results_df', 'rb') as f:
#     data = pickle.load(f)
data

Unnamed: 0,Predictors,n_Predictors,Adjusted R^2,AIC,BIC,PRESS,MSE,5-Fold_CV MSE,10-Fold_CV MSE,100-Fold_CV MSE
0,[Wheel],1,0.017918,44932.145767,44943.347572,6.680732e+11,3.338478e+08,3.341604e+08,3.340392e+08,3.341389e+08
1,[Prod_year],1,0.086412,44787.554836,44798.756641,6.221476e+11,3.105639e+08,3.110198e+08,3.112046e+08,3.113505e+08
2,[Leather_interior],1,0.017062,44933.888300,44945.090105,6.688104e+11,3.341388e+08,3.347360e+08,3.345047e+08,3.345622e+08
3,[Engine_Volume],1,0.011778,44944.611106,44955.812911,6.730064e+11,3.359351e+08,3.367175e+08,3.366080e+08,3.368707e+08
4,[HasTurbo],1,0.047422,44871.139192,44882.340997,6.485958e+11,3.238181e+08,3.242530e+08,3.242776e+08,3.246379e+08
...,...,...,...,...,...,...,...,...,...,...
322,"[Manufacturer, Drive_wheels, Color]",62,0.094518,44829.750042,45182.606897,inf,2.984157e+08,3.137097e+08,3.125216e+08,3.135079e+08
323,"[Manufacturer, Doors, Color]",62,0.091023,44837.453530,45190.310385,inf,2.995673e+08,3.143558e+08,3.134102e+08,3.143261e+08
324,"[Manufacturer, Gear_box_type, Color]",63,0.129996,44750.777002,45109.234759,5.848569e+11,2.865752e+08,3.010295e+08,2.998683e+08,3.006153e+08
325,"[Manufacturer, Fuel_type, Color]",65,0.126240,44761.326724,45130.986286,5.872067e+11,2.875152e+08,3.034995e+08,3.021386e+08,3.024989e+08


In [None]:
for i in range(n):
    results_to_pkl(sections[i], i)

In [None]:
dataframes = []
for i in range(n):
    with open(f'models/results_df_{i}', 'rb') as f:
        data = pickle.load(f)
    #concat all the data into one dataframe
    dataframes.append(data)
results_df = pd.DataFrame(dataframes)
with open(f'models/full_results_df', 'wb') as f:
        pickle.dump(results_df, f)

In [35]:
# results = []
# for j, predictors in enumerate(subsets):
#     # if j == 7: break
#     # print(predictors)
#     formula = "Price ~ " #initiate the formula
#     for i, predictor in enumerate(predictors):
#         if i == 0:
#             if df.dtypes[predictor] == 'object':
#                 formula += f'C({predictor})'
#             else:
#                 formula += predictor
#         else:
#             if df.dtypes[predictor] == 'object':
#                 formula += f' + C({predictor})'
#             else:
#                 formula += f' + {predictor}'
                
#     #train the model            
#     model = smf.ols(formula, data = df).fit()
    
#     y = df['Price']
#     X = df[predictors].copy()
#     cat_cols = []
#     for predictor in predictors: 
#         if X.dtypes[predictor] == 'object':
#             cat_cols.append(predictor)
#     if len(cat_cols) > 0:
#         X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
#         for col in X.columns:
#             X[col] = X[col].astype(float)
#     X = sm.add_constant(X, has_constant='add')
            
#     aic, bic, press, adj_r2, num_predictors, mse = calculate_metrics(model, X = X, y = y)
#     mse_cv_5 = cross_validate(X, y, k = 5)
#     mse_cv_10 = cross_validate(X, y, k = 10)
#     mse_cv_100 = cross_validate(X, y, k = 100)
    
#     results.append({
#         'Predictors': predictors,
#         'n_Predictors': num_predictors,
#         'Adjusted R^2': adj_r2,
#         'AIC': aic,
#         'BIC': bic,
#         'PRESS': press,
#         'MSE': mse,
#         '5-Fold_CV MSE': mse_cv_5,
#         '10-Fold_CV MSE': mse_cv_10,
#         '100-Fold_CV MSE': mse_cv_100        
#     })
        
# # Convert results to pd DataFrame
# results_df = pd.DataFrame(results)
# results_df = results_df.sort_values(by='n_Predictors').reset_index(drop=True)
# with open(f'models/results_df', 'wb') as f:
#     pickle.dump(results_df, f)

In [10]:
with open(f'models/results_df_1', 'rb') as f:
    results_df = pickle.load(f)
results_df

Unnamed: 0,Predictors,n_Predictors,Adjusted R^2,AIC,BIC,PRESS,MSE,5-Fold_CV MSE,10-Fold_CV MSE,100-Fold_CV MSE
0,[Prod_year],1,0.0001685437,522352.503679,522368.232861,698752900000000.0,36313210000.0,36338920000.0,36347550000.0,36403390000.0
1,[Leather_interior],1,5.688586e-07,522355.735284,522371.464466,698806100000000.0,36319310000.0,36338960000.0,36346870000.0,36402560000.0
2,[Engine_Volume],1,7.660652e-05,522354.272491,522370.001673,698676300000000.0,36316550000.0,36326890000.0,36336410000.0,36391960000.0
3,[HasTurbo],1,0.0003020182,522349.935427,522365.664609,698505300000000.0,36308360000.0,36317170000.0,36326170000.0,36382220000.0
4,[Fuel_type],6,0.001016721,522341.176698,522396.228834,inf,36272970000.0,36296700000.0,36304290000.0,36360560000.0
5,[Category],10,0.003858554,522290.373084,522376.883582,698627500000000.0,36162260000.0,36505850000.0,36498470000.0,36543830000.0
6,[Manufacturer],64,0.0003639341,522411.637281,522922.835682,inf,36187210000.0,36436970000.0,36432810000.0,36480610000.0


# Analysis

In [11]:
print('Lowest MSE')
results_df.loc[results_df['MSE'].idxmin()]

Lowest MSE


Predictors                 [Category]
n_Predictors                       10
Adjusted R^2                 0.003859
AIC                     522290.373084
BIC                     522376.883582
PRESS              698627511100193.25
MSE                36162264914.094215
5-Fold_CV MSE      36505852407.023056
10-Fold_CV MSE     36498472658.730316
100-Fold_CV MSE    36543825293.901321
Name: 5, dtype: object

In [12]:
print('Lowest CV MSE')
results_df.loc[results_df['5-Fold_CV MSE'].idxmin()]

Lowest CV MSE


Predictors                [Fuel_type]
n_Predictors                        6
Adjusted R^2                 0.001017
AIC                     522341.176698
BIC                     522396.228834
PRESS                             inf
MSE                36272974759.054375
5-Fold_CV MSE      36296701410.087692
10-Fold_CV MSE     36304286846.241257
100-Fold_CV MSE    36360563173.718864
Name: 4, dtype: object

In [13]:
print('Lowest AIC')
results_df.loc[results_df['AIC'].idxmin()]

Lowest AIC


Predictors                 [Category]
n_Predictors                       10
Adjusted R^2                 0.003859
AIC                     522290.373084
BIC                     522376.883582
PRESS              698627511100193.25
MSE                36162264914.094215
5-Fold_CV MSE      36505852407.023056
10-Fold_CV MSE     36498472658.730316
100-Fold_CV MSE    36543825293.901321
Name: 5, dtype: object

In [14]:
print('Lowest BIC')
results_df.loc[results_df['BIC'].idxmin()]

Lowest BIC


Predictors                 [HasTurbo]
n_Predictors                        1
Adjusted R^2                 0.000302
AIC                     522349.935427
BIC                     522365.664609
PRESS               698505251493110.5
MSE                36308363165.083794
5-Fold_CV MSE      36317166325.836815
10-Fold_CV MSE     36326166742.250198
100-Fold_CV MSE    36382222399.537987
Name: 3, dtype: object

In [15]:
print('Highest Adjusted R^2	')
results_df.loc[results_df['Adjusted R^2'].idxmax()]

Highest Adjusted R^2	


Predictors                 [Category]
n_Predictors                       10
Adjusted R^2                 0.003859
AIC                     522290.373084
BIC                     522376.883582
PRESS              698627511100193.25
MSE                36162264914.094215
5-Fold_CV MSE      36505852407.023056
10-Fold_CV MSE     36498472658.730316
100-Fold_CV MSE    36543825293.901321
Name: 5, dtype: object