In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

# Colab-only step: mount Google Drive at /content/drive, the root of the CSV
# paths read later in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read train_df.csv from the mounted Drive folder into `df`.
df = pd.read_csv(
    '/content/drive/MyDrive/Practicum/v1/train_df.csv',
)

Define the feature matrix (X) and the target variable (y)

In [None]:
# X keeps every column except the two TOTALPMT columns; y is TOTALPMT_ADJ,
# kept as a one-column DataFrame (double brackets) rather than a Series.
X = df.drop(columns=['TOTALPMT_ADJ', 'LOG_TOTALPMT'])
y = df.loc[:, ['TOTALPMT_ADJ']]

In [None]:
# Remove the 'index' column from the feature matrix.
X = X.drop(columns=['index'])

I. Linear Regression

Determine K-fold Splits

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# Splitter reused by the manual cross-validation loops below: 5 folds,
# KFold defaults otherwise.
n_splits = 5
cv = KFold(n_splits=n_splits)

In [None]:
# Baseline model: LinearRegression with default settings.
lm = LinearRegression()

In [None]:
# Rebuild X's row index in place.
# NOTE(review): without drop=True, reset_index() inserts the previous index as
# a new 'index' column, so X regains an 'index' feature right after the cell
# above dropped one -- confirm this is intended. Re-running this cell also adds
# a further 'level_0' column. The final-evaluation cell builds X_test without
# dropping 'index', so changing this line requires changing that cell too.
X.reset_index(inplace=True)

Calculate per-fold RMSEs

In [None]:
# Manual K-fold evaluation of the linear model: refit on each training fold
# and record the RMSE obtained on the corresponding held-out fold.
errors = []
for fit_rows, holdout_rows in cv.split(X):
    lm.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    holdout_pred = lm.predict(X.iloc[holdout_rows])
    fold_mse = mean_squared_error(y.iloc[holdout_rows], holdout_pred)
    errors.append(np.sqrt(fold_mse))

In [None]:
# Per-fold RMSE values collected for the linear model.
errors

[256419.84385517793,
 255454.911203295,
 252929.77293623812,
 253868.18861559324,
 256805.52590108235]

In [None]:
# Average RMSE across the folds for the linear model.
np.asarray(errors).mean()

255095.64850227736

II. Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [None]:
# Base Ridge estimator handed to the grid search below, which varies alpha.
rdg = Ridge()

In [None]:
# Candidate regularisation strengths for the Ridge grid search.
param = dict(alpha=[1, 5, 10, 20, 50])

In [None]:
# 5-fold grid search over `param` for Ridge, scored by negative MSE
# (scikit-learn maximises scores, hence the sign).
GS_object = GridSearchCV(
    estimator=rdg,
    param_grid=param,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Run the Ridge grid search on the training data.
GS_object.fit(X=X, y=y)

GridSearchCV(cv=5, estimator=Ridge(), param_grid={'alpha': [1, 5, 10, 20, 50]},
             scoring='neg_mean_squared_error')

In [None]:
# Parameter setting with the best mean cross-validated score.
# NOTE(review): the recorded result (alpha=50) is the largest value in the
# grid -- consider extending the grid upward.
GS_object.best_params_

{'alpha': 50}

In [None]:
# Fresh Ridge model configured with the alpha selected by the grid search.
ridge_alpha = GS_object.best_params_.get('alpha')
rdg_optim = Ridge(alpha=ridge_alpha)

In [None]:
# Manual K-fold evaluation of the tuned Ridge model: refit on each training
# fold and record the RMSE obtained on the corresponding held-out fold.
rdg_errors = []
for fit_rows, holdout_rows in cv.split(X):
    rdg_optim.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    holdout_pred = rdg_optim.predict(X.iloc[holdout_rows])
    fold_mse = mean_squared_error(y.iloc[holdout_rows], holdout_pred)
    rdg_errors.append(np.sqrt(fold_mse))

In [None]:
# Per-fold RMSE values collected for the tuned Ridge model.
rdg_errors

[256166.83590415915,
 255382.51760220042,
 252870.58033655697,
 253749.86784681273,
 256717.39026046454]

In [None]:
# Average RMSE across the folds for the tuned Ridge model.
np.asarray(rdg_errors).mean()

254977.43839003876

III. Lasso

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Base Lasso estimator for the grid search below, which varies alpha.
# The default iteration budget produced repeated convergence warnings when
# this search was run (see the recorded output), so coordinate descent is
# given more iterations. If warnings persist, standardising the features is
# the next thing to try.
lss = Lasso(max_iter=10000)

In [None]:
# Candidate regularisation strengths for the Lasso grid search.
param = dict(alpha=[1, 5, 10, 20, 50])

In [None]:
# 5-fold grid search over `param` for Lasso, scored by negative MSE
# (scikit-learn maximises scores, hence the sign).
GS_lasso = GridSearchCV(
    estimator=lss,
    param_grid=param,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Run the Lasso grid search on the training data.
GS_lasso.fit(X=X, y=y)

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rn

GridSearchCV(cv=5, estimator=Lasso(), param_grid={'alpha': [1, 5, 10, 20, 50]},
             scoring='neg_mean_squared_error')

In [None]:
# Parameter setting with the best mean cross-validated score for Lasso.
GS_lasso.best_params_

{'alpha': 10}

Calculate per-fold RMSEs

In [None]:
# Fresh Lasso model configured with the alpha selected by the grid search.
# max_iter is raised above the default because fitting with the default
# budget emitted convergence warnings on every fold and on the final fit
# (see the recorded outputs), i.e. the reported coefficients were not fully
# optimised.
lasso_alpha = GS_lasso.best_params_.get('alpha')
lss_optim = Lasso(alpha=lasso_alpha, max_iter=10000)

In [None]:
# Manual K-fold evaluation of the tuned Lasso model: refit on each training
# fold and record the RMSE obtained on the corresponding held-out fold.
lss_errors = []
for fit_rows, holdout_rows in cv.split(X):
    lss_optim.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    holdout_pred = lss_optim.predict(X.iloc[holdout_rows])
    fold_mse = mean_squared_error(y.iloc[holdout_rows], holdout_pred)
    lss_errors.append(np.sqrt(fold_mse))

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [None]:
# Per-fold RMSE values collected for the tuned Lasso model.
lss_errors

[256171.96440199565,
 255392.55999019698,
 252865.96220085118,
 253750.28987366968,
 256716.49697749183]

In [None]:
# Average RMSE across the folds for the tuned Lasso model.
np.asarray(lss_errors).mean()

254979.45468884107

Final Evaluation

In [None]:
# Read test_df.csv from the mounted Drive folder into `test_df`.
test_df = pd.read_csv(
    '/content/drive/MyDrive/Practicum/v1/test_df.csv',
)

In [None]:
# Build the held-out feature matrix and target.
# The models are fitted on X, so X_test must expose exactly the same columns
# in exactly the same order. Previously this relied on the test CSV happening
# to share the training layout; now the match is checked and enforced.
X_test = test_df.drop(columns=['TOTALPMT_ADJ', 'LOG_TOTALPMT'])
missing_features = [col for col in X.columns if col not in X_test.columns]
assert not missing_features, (
    f"test_df is missing training features: {missing_features}"
)
# Reorder to the training layout and discard any column the models never saw.
X_test = X_test[X.columns]
y_test = test_df[['TOTALPMT_ADJ']]

In [None]:
# Refit the linear model on the full training set and report test-set RMSE.
# NOTE(review): the recorded output for this cell (~1.4e14) is many orders of
# magnitude above the ~2.55e5 cross-validated RMSE -- investigate before
# relying on this model.
y_pred_lm = lm.fit(X, y).predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lm))

143484373486958.0

In [None]:
# Refit the tuned Ridge model on the full training set and report test-set RMSE.
y_pred_rdg = rdg_optim.fit(X, y).predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rdg))

255086.4104747788

In [None]:
# Refit the tuned Lasso model on the full training set and report test-set RMSE.
y_pred_lss = lss_optim.fit(X, y).predict(X_test)
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_lss))

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


255092.88153757234