# What drives the price of a car?

![](images/kurt.jpeg)

In [108]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import json
import joblib
import pickle 

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Pre-Processing

#### Import and prepare preprocessed data

In [111]:
training = pd.read_csv('data/training.csv').reset_index(drop=True)
testing = pd.read_csv('data/testing.csv').reset_index(drop=True)  
print('Done downloading')

Done downloading


In [112]:
training.drop(columns=['region','state','condition_ranked'], inplace=True)
testing.drop(columns=['region','state','condition_ranked'],inplace=True)

In [113]:
training.rename(columns={
    'cylinders_imputed':'cylinders',
    'condition_imputed':'condition',
    'drive_imputed':'drive',
    'type_imputed':'type'},
inplace=True)

testing.rename(columns={
    'cylinders_imputed':'cylinders',
    'condition_imputed':'condition',
    'drive_imputed':'drive',
    'type_imputed':'type'},
inplace=True)

In [114]:
fuel_order = ['gas', 'diesel', 'hybrid', 'electric', 'other']
title_status_order = ['clean', 'rebuilt', 'lien', 'salvage', 'missing', 'parts only']
transmission_order = ['automatic', 'manual', 'other']
cylinders_order = ['4 cylinders', '6 cylinders', '8 cylinders', '5 cylinders', '3 cylinders', '10 cylinders', '12 cylinders', 'other']
condition_order = ['new', 'like new', 'excellent', 'good', 'fair', 'salvage']
drive_order = ['4wd', 'fwd', 'rwd']
type_order = ['sedan', 'suv', 'truck', 'coupe', 'pickup', 'hatchback', 'wagon', 'convertible', 'van', 'bus', 'other']

In [115]:
ordinal_mappings = {
    'fuel': fuel_order,
    'title_status': title_status_order,
    'transmission': transmission_order,
    'cylinders': cylinders_order,
    'condition': condition_order,
    'drive': drive_order,
    'type': type_order
}

def ordinal_encode_column(df, column, order):
    df[column] = df[column].apply(lambda x: order.index(x) if x in order else None)
    return df

for col, order in ordinal_mappings.items():
    training = ordinal_encode_column(training, col, order)
    testing = ordinal_encode_column(testing, col, order)

In [116]:
training.drop(columns='model', inplace=True)
testing.drop(columns='model', inplace=True)

In [117]:
categorical_columns = training.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_columns = training.select_dtypes(include=['number']).columns.tolist()
numerical_columns.remove('price')

### Modeling

With your (almost?) final dataset in hand, it is now time to build some models.  Here, you should build a number of different regression models with the price as the target.  In building your models, you should explore different parameters and be sure to cross-validate your findings.

### Baseline

In [120]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


from sklearn import set_config

set_config(display="diagram") #setting this will display your pipelines as seen above

In [121]:
X_train = training.drop('price', axis=1)
y_train = training['price']

X_test = testing.drop('price', axis=1)
y_test = testing['price']

Xnum_train = X_train[numerical_columns]
Xnum_test = X_test[numerical_columns]

#log_scaling
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [122]:
mse_baseline_train, mse_baseline_test = joblib.load('data/mse_baseline.pkl')

print(f'Baseline for training data: {mse_baseline_train:.2f}')
print(f'Baseline for testing data: {mse_baseline_test:.2f}')

Baseline for training data: 7.41
Baseline for testing data: 7.32


### LinReg on highest correlation feature

In [124]:
numeric_train = training.select_dtypes(include=[np.number])

correlation_matrix = numeric_train.corr()

highest_corr = correlation_matrix[['price']].nlargest(columns='price', n=2).index[1]
print(highest_corr)

n = 5 
highest_corr_columns = correlation_matrix['price'].drop('price').nlargest(n).index

print(highest_corr_columns)

condition
Index(['condition', 'type', 'cylinders', 'region_lon', 'state_lon'], dtype='object')


In [125]:
train_mse_highcorr, test_mse_highcorr = joblib.load('data/mse_highcorr.pkl')
highcorr = joblib.load('data/highcorr.pkl')

print(f'Train MSE:         {train_mse_highcorr: .2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Test MSE:         {test_mse_highcorr: .2f}')
print(f'Test Baseline MSE: {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:   {mse_baseline_train-train_mse_highcorr: .2f}')
print(f'Test MSE diff:    {mse_baseline_test-test_mse_highcorr: .2f}')

Train MSE:          7.41
Train Baseline MSE: 7.41

Test MSE:          7.32
Test Baseline MSE: 7.32

Train MSE diff:    0.00
Test MSE diff:     0.00


### LinReg on highest correlation feature

In [127]:
train_mse_highcorr_col, test_mse_highcorr_col = joblib.load('data/mse_highcorrs.pkl')
highcorrs = joblib.load('data/highcorrs.pkl')

In [128]:
print(f'Train MSE:         {train_mse_highcorr_col: .2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Test MSE:         {test_mse_highcorr_col: .2f}')
print(f'Test Baseline MSE: {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:   {mse_baseline_train-train_mse_highcorr_col: .2f}')
print(f'Test MSE diff:    {mse_baseline_test-test_mse_highcorr_col: .2f}')

Train MSE:          7.35
Train Baseline MSE: 7.41

Test MSE:          7.27
Test Baseline MSE: 7.32

Train MSE diff:    0.06
Test MSE diff:     0.05


### LinReg on numerical columns

In [130]:
train_mse_numlr, test_mse_numlr = joblib.load('data/mse_linreg.pkl')
lr = joblib.load('data/linreg.pkl')

print(f'Train MSE:         {train_mse_numlr: .2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Test MSE:          {test_mse_numlr: .2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_numlr: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_numlr: .2f}')

Train MSE:          7.06
Train Baseline MSE: 7.41

Test MSE:           6.97
Test Baseline MSE:  7.32

Train MSE diff:     0.35
Test MSE diff:      0.35


In [131]:
coefficients = lr.named_steps['model'].coef_

importance_df = pd.DataFrame({
    'Feature': Xnum_train.columns,
    'Coefficient': coefficients
})

# Add a new column to interpret whether the feature has a positive or negative impact
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

# Sort by absolute value of the coefficient
importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features with their impact
n = 5  # Number of important features to display
print(importance_df_sorted.head(n))

         Feature  Coefficient    Impact  Importance
4   transmission     0.530939  Positive    0.530939
12    region_lon     0.352072  Positive    0.352072
10     state_lon    -0.324038  Negative    0.324038
5      cylinders     0.159117  Positive    0.159117
11    region_lat     0.156777  Positive    0.156777


### Polynomial model with numerical data

In [133]:
train_mse_polynum, test_mse_polynum = joblib.load('data/mse_polynum.pkl')
pipeline = joblib.load('data/polynum.pkl')

In [134]:
print(f'Train MSE:         {train_mse_polynum: .2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Test MSE:          {test_mse_polynum: .2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_polynum: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_polynum: .2f}')

Train MSE:          6.86
Train Baseline MSE: 7.41

Test MSE:           6.77
Test Baseline MSE:  7.32

Train MSE diff:     0.55
Test MSE diff:      0.56


In [135]:
X_train_transformed = pipeline.named_steps['preprocessing'].transform(Xnum_train)

transformed_feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()

coefficients = pipeline.named_steps['model'].coef_
print(f"Number of transformed features: {X_train_transformed.shape[1]}")
print(f"Number of coefficients: {len(coefficients)}")

importance_df = pd.DataFrame({
    'Feature': transformed_feature_names,
    'Coefficient': coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 10  
print(importance_df_sorted.head(n))

Number of transformed features: 118
Number of coefficients: 118
                 Feature  Coefficient  Importance    Impact
4   scaler__transmission   -18.035685   18.035685  Negative
18    poly__transmission   -13.231355   13.231355  Negative
25      poly__region_lat    -4.791949    4.791949  Negative
15            poly__fuel    -4.623615    4.623615  Negative
1           scaler__fuel    -4.294641    4.294641  Negative
22            poly__type     3.733733    3.733733  Positive
6      scaler__condition    -3.436129    3.436129  Negative
8           scaler__type     2.917534    2.917534  Positive
23       poly__state_lat     2.302460    2.302460  Positive
20       poly__condition    -2.168317    2.168317  Negative


### Polynomial model with OHE manufacturer

In [137]:
train_mse_poly, test_mse_poly = joblib.load('data/mse_poly.pkl')
pipe = joblib.load('data/poly.pkl')

In [138]:
print(f'Train MSE:         {train_mse_poly: .2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Test MSE:          {test_mse_poly: .2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_poly: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_poly: .2f}')

Train MSE:          6.83
Train Baseline MSE: 7.41

Test MSE:           6.73
Test Baseline MSE:  7.32

Train MSE diff:     0.58
Test MSE diff:      0.59


In [139]:
X_train_transformed = pipeline.named_steps['preprocessing'].transform(X_train)

transformed_feature_names = pipeline.named_steps['preprocessing'].get_feature_names_out()

coefficients = pipeline.named_steps['model'].coef_

print(f"Number of transformed features: {X_train_transformed.shape[1]}")
print(f"Number of coefficients: {len(coefficients)}")

if X_train_transformed.shape[1] == len(coefficients):
    importance_df = pd.DataFrame({
        'Feature': transformed_feature_names,
        'Coefficient': coefficients
    })

    importance_df['Importance'] = np.abs(importance_df['Coefficient'])
    importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

    importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

    # Display the top n most important features
    n = 10 
    print(importance_df_sorted.head(n))
else:
    print("Mismatch still exists between the number of transformed features and coefficients.")

Number of transformed features: 118
Number of coefficients: 118
                 Feature  Coefficient  Importance    Impact
4   scaler__transmission   -18.035685   18.035685  Negative
18    poly__transmission   -13.231355   13.231355  Negative
25      poly__region_lat    -4.791949    4.791949  Negative
15            poly__fuel    -4.623615    4.623615  Negative
1           scaler__fuel    -4.294641    4.294641  Negative
22            poly__type     3.733733    3.733733  Positive
6      scaler__condition    -3.436129    3.436129  Negative
8           scaler__type     2.917534    2.917534  Positive
23       poly__state_lat     2.302460    2.302460  Positive
20       poly__condition    -2.168317    2.168317  Negative


### Lasso with OHE manufacturer

In [141]:
train_mse_lasso, test_mse_lasso = joblib.load('data/mse_lasso.pkl')
lasso_pipeline = joblib.load('data/lasso.pkl')

In [142]:
print(f'LASSO Train MSE: {train_mse_lasso:.2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'LASSO Test MSE: {test_mse_lasso:.2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_lasso: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_lasso: .2f}')

LASSO Train MSE: 7.12
Train Baseline MSE: 7.41

LASSO Test MSE: 7.03
Test Baseline MSE:  7.32

Train MSE diff:     0.29
Test MSE diff:      0.29


In [143]:
coefficients = lasso_pipeline.named_steps['lasso'].coef_

transformed_feature_names = lasso_pipeline.named_steps['preprocessing'].get_feature_names_out()

importance_df = pd.DataFrame({
    'Feature': transformed_feature_names,
    'Coefficient': coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 10
print(f"Top {n} most important features based on LASSO regression:")
print(importance_df_sorted.head(n))

Top 10 most important features based on LASSO regression:
                         Feature  Coefficient  Importance    Impact
4           scaler__transmission     0.410639    0.410639  Positive
5              scaler__cylinders     0.084298    0.084298  Positive
8                   scaler__type     0.029540    0.029540  Positive
2               scaler__odometer    -0.000484    0.000484  Negative
0                   scaler__year     0.000000    0.000000  Negative
41  ohe__manufacturer_mitsubishi    -0.000000    0.000000  Negative
31      ohe__manufacturer_jaguar     0.000000    0.000000  Negative
32        ohe__manufacturer_jeep    -0.000000    0.000000  Negative
33         ohe__manufacturer_kia    -0.000000    0.000000  Negative
34  ohe__manufacturer_land rover    -0.000000    0.000000  Negative


### Lasso with numerical features

In [145]:
train_mse_lasso_num, test_mse_lasso_num = joblib.load('data/mse_lassonum.pkl')
lasso_pipeline_numerical = joblib.load('data/lassonum.pkl')

In [146]:
print(f'LASSO Train MSE: {train_mse_lasso_num:.2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'LASSO Test MSE: {test_mse_lasso_num:.2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_lasso_num: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_lasso_num: .2f}')

LASSO Train MSE: 7.12
Train Baseline MSE: 7.41

LASSO Test MSE: 7.03
Test Baseline MSE:  7.32

Train MSE diff:     0.29
Test MSE diff:      0.29


In [147]:
poly_feature_names = lasso_pipeline_numerical.named_steps['poly'].get_feature_names_out(input_features=numerical_columns)

coefficients = lasso_pipeline_numerical.named_steps['lasso'].coef_

importance_df = pd.DataFrame({
    'Feature': poly_feature_names,
    'Coefficient': coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 10 
print(f"Top {n} most important polynomial features based on LASSO regression:")
print(importance_df_sorted.head(n))

Top 10 most important polynomial features based on LASSO regression:
        Feature  Coefficient  Importance    Impact
5  transmission     0.410639    0.410639  Positive
6     cylinders     0.084298    0.084298  Positive
9          type     0.029540    0.029540  Positive
3      odometer    -0.000484    0.000484  Negative
0             1     0.000000    0.000000  Negative
1          year     0.000000    0.000000  Negative
2          fuel     0.000000    0.000000  Negative
4  title_status     0.000000    0.000000  Negative
7     condition    -0.000000    0.000000  Negative
8         drive    -0.000000    0.000000  Negative


### Ridge with OHE manufacturer

In [149]:
train_mse_ridge, test_mse_ridge = joblib.load('data/mse_ridge.pkl')
ridge_pipeline = joblib.load('data/ridge.pkl')

In [150]:
print(f'Ridge Train MSE: {train_mse_ridge:.2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Ridge Test MSE: {test_mse_ridge:.2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_ridge: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_ridge: .2f}')

Ridge Train MSE: 7.02
Train Baseline MSE: 7.41

Ridge Test MSE: 6.94
Test Baseline MSE:  7.32

Train MSE diff:     0.39
Test MSE diff:      0.39


In [151]:
coefficients = ridge_pipeline.named_steps['ridge'].coef_

transformed_feature_names = ridge_pipeline.named_steps['preprocessing'].get_feature_names_out()

importance_df = pd.DataFrame({
    'Feature': transformed_feature_names,
    'Coefficient': coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 10  
print(f"Top {n} most important features based on Ridge regression:")
print(importance_df_sorted.head(n))

Top 10 most important features based on Ridge regression:
                              Feature  Coefficient  Importance    Impact
34       ohe__manufacturer_land rover    -4.609246    4.609246  Negative
27  ohe__manufacturer_harley-davidson    -1.177442    1.177442  Negative
39          ohe__manufacturer_mercury    -0.798646    0.798646  Negative
22            ohe__manufacturer_dodge    -0.714754    0.714754  Negative
33              ohe__manufacturer_kia    -0.609846    0.609846  Negative
49            ohe__manufacturer_tesla    -0.542013    0.542013  Negative
4                scaler__transmission     0.514594    0.514594  Positive
29          ohe__manufacturer_hyundai    -0.488592    0.488592  Negative
42           ohe__manufacturer_nissan    -0.477881    0.477881  Negative
44          ohe__manufacturer_porsche     0.469427    0.469427  Positive


### Ridge with numerical features

In [153]:
train_mse_ridge_num, test_mse_ridge_num = joblib.load('data/mse_ridgenum.pkl')
ridge_pipeline_numerical = joblib.load('data/ridgenum.pkl')

In [154]:
print(f'Ridge Train MSE: {train_mse_ridge_num:.2f}')
print(f'Train Baseline MSE: {mse_baseline_train:.2f}\n')
print(f'Ridge Test MSE: {test_mse_ridge_num:.2f}')
print(f'Test Baseline MSE:  {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_ridge_num: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_ridge_num: .2f}')

Ridge Train MSE: 7.06
Train Baseline MSE: 7.41

Ridge Test MSE: 6.97
Test Baseline MSE:  7.32

Train MSE diff:     0.35
Test MSE diff:      0.35


In [155]:
poly_feature_names = ridge_pipeline_numerical.named_steps['poly'].get_feature_names_out(input_features=numerical_columns)

coefficients = ridge_pipeline_numerical.named_steps['ridge'].coef_

importance_df = pd.DataFrame({
    'Feature': poly_feature_names,
    'Coefficient': coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 10 
print(f"Top {n} most important polynomial features based on Ridge regression:")
print(importance_df_sorted.head(n))

Top 10 most important polynomial features based on Ridge regression:
         Feature  Coefficient  Importance    Impact
5   transmission     0.530937    0.530937  Positive
13    region_lon     0.351926    0.351926  Positive
11     state_lon    -0.323892    0.323892  Negative
6      cylinders     0.159117    0.159117  Positive
12    region_lat     0.156745    0.156745  Positive
7      condition    -0.122996    0.122996  Negative
9           type     0.095423    0.095423  Positive
3       odometer    -0.084113    0.084113  Negative
1           year     0.075176    0.075176  Positive
10     state_lat    -0.050153    0.050153  Negative


### GridSearch on Lasso and Ridge params

In [157]:
train_mse_lassogrid, test_mse_lassogrid = joblib.load('data/mse_bestlasso.pkl')
best_lasso = joblib.load('data/bestlasso.pkl')

train_mse_ridgegrid, test_mse_ridgegrid = joblib.load('data/mse_bestridge.pkl')
best_ridge = joblib.load('data/bestridge.pkl')

In [158]:
print(f'Lasso GridCV Train MSE:         {train_mse_lassogrid:.2f}')
print(f'Baseline for training data: {mse_baseline_train:.2f}')
print(f'Lasso GridCV Test MSE:         {test_mse_lassogrid:.2f}')
print(f'Baseline for testing data: {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_lassogrid:.2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_lassogrid:.2f}')

Lasso GridCV Train MSE:         7.06
Baseline for training data: 7.41
Lasso GridCV Test MSE:         6.97
Baseline for testing data: 7.32

Train MSE diff:    0.35
Test MSE diff:     0.35


In [159]:
print(f'Ridge GridCV Train MSE:         {train_mse_ridgegrid:.2f}')
print(f'Baseline for training data: {mse_baseline_train:.2f}')
print(f'Ridge GridCV Test MSE:         {test_mse_ridgegrid:.2f}')
print(f'Baseline for testing data: {mse_baseline_test:.2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_ridgegrid:.2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_ridgegrid:.2f}')

Ridge GridCV Train MSE:         7.06
Baseline for training data: 7.41
Ridge GridCV Test MSE:         6.97
Baseline for testing data: 7.32

Train MSE diff:    0.35
Test MSE diff:     0.35


In [160]:
lasso_coefficients = best_lasso.named_steps['lasso'].coef_
ridge_coefficients = best_ridge.named_steps['ridge'].coef_

transformed_feature_names = best_lasso.named_steps['poly'].get_feature_names_out(numerical_columns)

print(f"Number of transformed features: {len(transformed_feature_names)}")
print(f"Number of coefficients: {len(lasso_coefficients)}")

if len(transformed_feature_names) == len(lasso_coefficients):
    lasso_importance_df = pd.DataFrame({
        'Feature': transformed_feature_names,
        'Coefficient': lasso_coefficients
    })

    ridge_importance_df = pd.DataFrame({
        'Feature': transformed_feature_names,
        'Coefficient': ridge_coefficients
    })

    lasso_importance_df['Importance'] = np.abs(lasso_importance_df['Coefficient'])
    lasso_importance_df['Impact'] = np.where(lasso_importance_df['Coefficient'] > 0, 'Positive', 'Negative')

    ridge_importance_df['Importance'] = np.abs(ridge_importance_df['Coefficient'])
    ridge_importance_df['Impact'] = np.where(ridge_importance_df['Coefficient'] > 0, 'Positive', 'Negative')

    # Sort by importance and display top n most important features
    n = 10
    print("Top Lasso Features:")
    print(lasso_importance_df.sort_values(by='Importance', ascending=False).head(n))

    print("\nTop Ridge Features:")
    print(ridge_importance_df.sort_values(by='Importance', ascending=False).head(n))
else:
    print("Mismatch between the number of transformed features and coefficients.")


Number of transformed features: 14
Number of coefficients: 14
Top Lasso Features:
         Feature  Coefficient  Importance    Impact
5   transmission     0.514063    0.514063  Positive
6      cylinders     0.153807    0.153807  Positive
7      condition    -0.110400    0.110400  Negative
12    region_lat     0.091137    0.091137  Positive
9           type     0.082054    0.082054  Positive
3       odometer    -0.075738    0.075738  Negative
1           year     0.063803    0.063803  Positive
4   title_status     0.029311    0.029311  Positive
13    region_lon     0.020669    0.020669  Positive
8          drive    -0.019261    0.019261  Negative

Top Ridge Features:
         Feature  Coefficient  Importance    Impact
5   transmission     0.530937    0.530937  Positive
13    region_lon     0.351926    0.351926  Positive
11     state_lon    -0.323892    0.323892  Negative
6      cylinders     0.159117    0.159117  Positive
12    region_lat     0.156745    0.156745  Positive
7      condit

### Sequential Feature Selector

In [162]:
train_mse_sfs, test_mse_sfs = joblib.load('data/mse_sfs.pkl')
sequential_pipe = joblib.load('data/sfs.pkl')

In [163]:
print(f'Baseline for training data: {mse_baseline_train:.2f}')
print(f'SFS Train MSE:         {train_mse_sfs: .2f}')
print(f'Baseline for testing data: {mse_baseline_test:.2f}')
print(f'SFS Test MSE:         {test_mse_sfs: .2f}\n')
print(f'Train MSE diff:    {mse_baseline_train-train_mse_sfs: .2f}')
print(f'Test MSE diff:     {mse_baseline_test-test_mse_sfs: .2f}')

Baseline for training data: 7.41
SFS Train MSE:          7.07
Baseline for testing data: 7.32
SFS Test MSE:          6.98

Train MSE diff:     0.34
Test MSE diff:      0.34


In [164]:
coefficients = sequential_pipe.named_steps['linreg'].coef_

poly_feature_names = sequential_pipe.named_steps['poly_features'].get_feature_names_out(Xnum_train.columns)

selected_feature_indices = sequential_pipe.named_steps['selector'].get_support(indices=True)

print(f"Selected Feature Indices: {selected_feature_indices}")
print(f"Coefficients Shape: {coefficients.shape}")
print(f"Number of coefficients: {len(coefficients)}")
print(f"Selected feature indices: {selected_feature_indices}")

if len(selected_feature_indices) != len(coefficients):
    print(f"Warning: Number of coefficients ({len(coefficients)}) does not match the number of selected features ({len(selected_feature_indices)}).")

selected_feature_names = poly_feature_names[selected_feature_indices]

selected_coefficients = coefficients[:len(selected_feature_indices)]

importance_df = pd.DataFrame({
    'Feature': selected_feature_names,
    'Coefficient': selected_coefficients
})

importance_df['Importance'] = np.abs(importance_df['Coefficient'])
importance_df['Impact'] = np.where(importance_df['Coefficient'] > 0, 'Positive', 'Negative')

importance_df_sorted = importance_df.sort_values(by='Importance', ascending=False)

# Display the top n most important features
n = 5 
print(f"Top {n} Selected Features:")
print(importance_df_sorted.head(n))
    

Selected Feature Indices: [ 2  4  5  6 11]
Coefficients Shape: (5,)
Number of coefficients: 5
Selected feature indices: [ 2  4  5  6 11]
Top 5 Selected Features:
        Feature   Coefficient    Importance    Impact
1  transmission  7.279279e-01  7.279279e-01  Positive
2     cylinders  2.094657e-01  2.094657e-01  Positive
3     condition -1.949108e-01  1.949108e-01  Negative
4    region_lat  1.776449e-02  1.776449e-02  Positive
0      odometer -4.951317e-07  4.951317e-07  Negative
