# Modeling - Given these features, what model works best?

## Here you can actually test assumptions
1. Linearity: residuals vs. predicted plot.
2. Homoscedasticity: spread of residuals vs. predicted values.
3. Normality of errors: Q-Q plot of residuals.
4. Independence: residuals vs. time/order.
5. Multicollinearity: VIF and condition number.
6. Influential points: Cook's distance.

## These checks require predictions, residuals, and fitted model parameters, so they belong in modeling.ipynb.

In [None]:
# 3rd Party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# linear_regression_001
from linear_regression_001.data.loader import load_processed_data
from linear_regression_001.features.build_features import split_features_target, build_features
from linear_regression_001.utils.paths import FEATURES,PROCESSED
from linear_regression_001.models import train_models, predict, evaluate, evaluate_model

# Baseline Model
from sklearn.dummy import DummyRegressor

# Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import r2_score as r2

In [2]:
# Read the processed insurance dataset and split it into features (X) and
# target (y) using the project's split_features_target helper.
df = load_processed_data("processed_insurance.csv")
X, y = split_features_target(df)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Baseline Model -- the reference score any real model has to beat.
# Options are predicting the mean of y or an untuned simple linear regression;
# here DummyRegressor(strategy="mean") is used, which predicts the mean of
# y_train for every row.
baseline_model = DummyRegressor(strategy="mean").fit(X_train, y_train)  # fit() returns the estimator itself
baseline_preds = baseline_model.predict(X_test)

In [4]:
# Baseline Evaluation -- score the mean-predictor on the held-out test set.
# The reported MAE / RMSE / R^2 are the floor for later models. An R^2 slightly
# below zero is expected here: the baseline predicts the mean of y_train,
# which differs a little from the mean of y_test.
evaluate(
    y_test,
    baseline_preds,
    "Baseline Model",
)

Baseline Model Performance
-------------------------
MAE: 9861.7979
RMSE: 13612.4260
R^2: -0.0084


In [None]:
# Train Model

In [None]:
# Evaluation