# Modeling - Given these features, what model works best?

## Here you can actually test the linear regression assumptions
1. Linearity: residuals vs. predicted plot.
2. Homoscedasticity: spread of residuals vs. predicted values.
3. Normality of errors: Q-Q plot of residuals.
4. Independence: residuals vs. time/order.
5. Multicollinearity: VIF and condition number.
6. Influential points: Cook's distance.

## These checks require predictions, residuals, and fitted model parameters, so they belong in modeling.ipynb.

In [8]:
# 3rd Party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# linear_regression_001
from linear_regression_001.data.loader import load_raw_data
from linear_regression_001.features.build_features import split_features_target, build_features, build_preprocessor, FEATURE_LIST
from linear_regression_001.utils.paths import INTERIM, PROCESSED, INFERENCE, PREDICTIONS, MODELS
from linear_regression_001.models import train_models, predict_features, evaluate_model

# Baseline Model
from sklearn.dummy import DummyRegressor

# Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score as r2

In [2]:
# Run the package's training entry point. On the recorded run it printed
# "Training complete. Models saved."
# NOTE(review): later cells load MODELS / "linear_regression.pkl" and
# MODELS / "baseline.pkl" — presumably written by this call; confirm.
train_models()

*normalized*
*missing checked*
*schema enforced*
Training complete. Models saved.


In [3]:
# Score the inference file with the saved linear-regression model.
# The call stays as the cell's last expression so its return value is displayed.
predict_features(
    MODELS / "linear_regression.pkl",
    "predict_data.csv",
    "linear_regression",
)

Inference completed. Predictions from linear_regression saved.


array([10774.2005307 , 10830.19824896, 11190.60631904, ...,
       13219.57863858, 57167.75704678, 11291.66534863], shape=(1070,))

In [4]:
# Evaluate the saved linear-regression model on the interim test file and
# compare it with the saved baseline model.
# NOTE(review): the recorded output reports improvement_percent ~ -63.3 together
# with beats_baseline=True (RMSE ~12466 -> ~4574). The sign convention is set
# inside evaluate_model — confirm a negative "improvement" is intended.
evaluate_model(MODELS / "linear_regression.pkl", INTERIM / "test.csv", MODELS / "baseline.pkl")

{'metrics': {'rmse': np.float64(4573.807686290385),
  'mae': 2756.898659517957,
  'r2': 0.8652503208873077,
  'mape': 29.24770582699016,
  'adjusted_r2': 0.8616224449111968},
 'baseline_comparison': {'baseline_rmse': np.float64(12465.610441715768),
  'model_rmse': np.float64(4573.807686290385),
  'improvement_percent': np.float64(-63.30859441119478),
  'beats_baseline': np.True_},
 'residual_analysis': {'mean_residual': np.float64(-342.37261123381757),
  'std_residual': np.float64(4560.975525723193),
  'residual_skewness': np.float64(2.643628716854398),
  'residual_kurtosis': np.float64(9.051494190997499),
  'max_overestimation': np.float64(-15418.306359906339),
  'max_underestimation': np.float64(21208.549780449517),
  'residuals_within_1std': np.float64(86.94029850746269)}}

In [6]:
# ================================================================
# STEP 1: Setup Data
# ================================================================
# Read the raw insurance file; the loader cleans it as part of loading.
df = load_raw_data("insurance.csv")

# Separate predictors from the target, then engineer the feature columns.
X, y = split_features_target(df)
X_features_built = build_features(X, FEATURE_LIST)

# Rebind `df` to the engineered features with the target appended column-wise.
df = pd.concat([X_features_built, y], axis=1)

# Preprocessor reused by the model pipelines in the cells below.
preprocessor = build_preprocessor()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_built,
    y,
    test_size=0.2,
    random_state=42,
)


*normalized*
*missing checked*
*schema enforced*


In [None]:
# ================================================================
# STEP 2: Train Multiple Models
# ================================================================

# Ridge: tune the penalty strength with 5-fold cross-validation.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Ridge()),
])

# 20 log-spaced alphas between 1e-4 and 1e3.
param_grid = {"model__alpha": np.logspace(-4, 3, 20)}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train, y_train)

# The scorer is negated MSE, so flip the sign to report plain MSE.
print("Best alpha: ", grid.best_params_)
print("Best CV score: ", -grid.best_score_)

# Keep the refit best pipeline and persist it; the evaluation cell loads this file.
best_ridge = grid.best_estimator_
joblib.dump(best_ridge, MODELS / "ridge.pkl")

Best alpha:  {'model__alpha': np.float64(0.0001)}
Best CV score:  24313461.368943825


['/home/kenchan/projects/linear_regression/linear_regression_001/linear_regression_001/models/saved_models/ridge.pkl']

In [11]:
# Lasso: tune the penalty strength with 5-fold cross-validation.
# max_iter is raised to 10000 on the estimator.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", Lasso(max_iter=10000)),
])

# 20 log-spaced alphas between 1e-4 and 1e1.
param_grid = {"model__alpha": np.logspace(-4, 1, 20)}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train, y_train)

# The scorer is negated MSE, so flip the sign to report plain MSE.
print("Best alpha: ", grid.best_params_)
print("Best CV Score: ", -grid.best_score_)

# Keep the refit best pipeline and persist it; the evaluation cell loads this file.
best_lasso = grid.best_estimator_
joblib.dump(best_lasso, MODELS / "lasso.pkl")

Best alpha:  {'model__alpha': np.float64(0.07847599703514607)}
Best CV Score:  24313459.970700447


['/home/kenchan/projects/linear_regression/linear_regression_001/linear_regression_001/models/saved_models/lasso.pkl']

In [14]:
# ElasticNet: joint 5-fold search over penalty strength and the L1/L2 mix.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", ElasticNet(max_iter=10000)),
])

param_grid = {
    # 10 log-spaced alphas between 1e-4 and 1e1
    "model__alpha": np.logspace(-4, 1, 10),
    # l1_ratio=1 is a pure L1 penalty, i.e. the Lasso case
    "model__l1_ratio": [0.1, 0.5, 0.7, 0.9, 0.95, 1],
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train, y_train)

# The scorer is negated MSE, so flip the sign to report plain MSE.
print("Best params: ", grid.best_params_)
print("Best CV score: ", -grid.best_score_)

# Keep the refit best pipeline and persist it; the evaluation cell loads this file.
best_elastic = grid.best_estimator_
joblib.dump(best_elastic, MODELS / "elastic_net.pkl")

Best params:  {'model__alpha': np.float64(0.05994842503189409), 'model__l1_ratio': 1}
Best CV score:  24313459.896045472


['/home/kenchan/projects/linear_regression/linear_regression_001/linear_regression_001/models/saved_models/elastic_net.pkl']

In [None]:
# ================================================================
# STEP 3: Evaluate Models
# ================================================================
# Ridge: score the saved tuned pipeline on the interim test file and compare
# it with the saved baseline model.
# NOTE(review): the tuned pipelines were fit on this notebook's own
# train_test_split (random_state=42); confirm INTERIM / "test.csv" contains
# none of those training rows, otherwise these metrics are optimistic.
evaluate_model(MODELS / "ridge.pkl", INTERIM / "test.csv", MODELS / "baseline.pkl")

{'metrics': {'rmse': np.float64(4573.805255024336),
  'mae': 2756.900310597436,
  'r2': 0.8652504641431048,
  'mape': 29.24770738629884,
  'adjusted_r2': 0.8616225920238807},
 'baseline_comparison': {'baseline_rmse': np.float64(12465.610441715768),
  'model_rmse': np.float64(4573.805255024336),
  'improvement_percent': np.float64(-63.308613914981315),
  'beats_baseline': np.True_},
 'residual_analysis': {'mean_residual': np.float64(-342.3715010941844),
  'std_residual': np.float64(4560.973170950115),
  'residual_skewness': np.float64(2.643628168775114),
  'residual_kurtosis': np.float64(9.051435957899253),
  'max_overestimation': np.float64(-15418.087431014479),
  'max_underestimation': np.float64(21208.538724900616),
  'residuals_within_1std': np.float64(86.94029850746269)}}

In [None]:
# Lasso: score the saved tuned pipeline on the interim test file and compare
# it with the saved baseline model.
evaluate_model(MODELS / "lasso.pkl", INTERIM / "test.csv", MODELS / "baseline.pkl")

{'metrics': {'rmse': np.float64(4573.726840474379),
  'mae': 2757.031163700371,
  'r2': 0.8652550844682152,
  'mape': 29.250847514581608,
  'adjusted_r2': 0.8616273367423595},
 'baseline_comparison': {'baseline_rmse': np.float64(12465.610441715768),
  'model_rmse': np.float64(4573.726840474379),
  'improvement_percent': np.float64(-63.30924296199287),
  'beats_baseline': np.True_},
 'residual_analysis': {'mean_residual': np.float64(-342.3560877039321),
  'std_residual': np.float64(4560.895692787526),
  'residual_skewness': np.float64(2.6435796450837303),
  'residual_kurtosis': np.float64(9.048821813858876),
  'max_overestimation': np.float64(-15409.217945194403),
  'max_underestimation': np.float64(21207.545849901842),
  'residuals_within_1std': np.float64(86.94029850746269)}}

In [None]:
# ElasticNet: score the saved tuned pipeline on the interim test file and
# compare it with the saved baseline model.
evaluate_model(MODELS / "elastic_net.pkl", INTERIM / "test.csv", MODELS / "baseline.pkl")

{'metrics': {'rmse': np.float64(4573.745881154683),
  'mae': 2756.999874980938,
  'r2': 0.8652539625646608,
  'mape': 29.250105716245667,
  'adjusted_r2': 0.8616261846337094},
 'baseline_comparison': {'baseline_rmse': np.float64(12465.610441715768),
  'model_rmse': np.float64(4573.745881154683),
  'improvement_percent': np.float64(-63.30909021632195),
  'beats_baseline': np.True_},
 'residual_analysis': {'mean_residual': np.float64(-342.3599907081927),
  'std_residual': np.float64(4560.914494061659),
  'residual_skewness': np.float64(2.6435913382990357),
  'residual_kurtosis': np.float64(9.049453016366426),
  'max_overestimation': np.float64(-15411.364056785584),
  'max_underestimation': np.float64(21207.78289489103),
  'residuals_within_1std': np.float64(86.94029850746269)}}

In [None]:
# Group models
# No visible cell defines `lr` or `baseline`, so this cell raised NameError on
# a fresh kernel (Restart & Run All). Both are now built here.
# NOTE(review): reconstructed to be consistent with the recorded CV scores
# (lin_reg ~0.83, baseline ~0.00) — confirm they match the estimators
# originally used.

# Plain linear regression behind the same preprocessor as the tuned pipelines.
lr = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

# Constant predictor: always returns the mean of the training target.
baseline = DummyRegressor(strategy="mean")

models = {
    'lin_reg': lr,
    'baseline': baseline
}

In [10]:
# Cross Validation Score for models
# Print the mean 5-fold R^2 on the training split for every grouped estimator.
for name, model in models.items():
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="r2",
        cv=5,
    )
    print(f"{name}: ", scores.mean())

lin_reg:  0.8279884988384433
baseline:  -0.004796192893767958


In [None]:
# Hyperparameter tuning
# LinearRegression has no `alpha` hyperparameter, so a grid over `alpha` fails
# at fit time with an "Invalid parameter" ValueError. Ridge is the regularised
# linear model that accepts `alpha`. It runs behind the preprocessor, so the
# grid key carries the pipeline step prefix `model__`.
tuning_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", Ridge()),
])

param_grid = {
    'model__alpha': [0.1, 1.0, 10.0, 100.0]
}

# 5-fold search scored by R^2; the best pipeline is refit on all of X_train.
grid = GridSearchCV(tuning_pipeline, param_grid, cv=5, scoring="r2")
grid.fit(X_train, y_train)

best_model = grid.best_estimator_

In [None]:
# TODO: Validation Set Eval (not implemented yet)

In [None]:
# TODO: Feature importance (not implemented yet)

In [None]:
# TODO: Final model train (not implemented yet)

In [None]:
# TODO: Save final model (not implemented yet)