In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import RidgeCV, LassoCV

import warnings

warnings.filterwarnings('ignore')

train = pd.read_csv('../datasets/train_eng.csv', index_col = 'Unnamed: 0')

In [2]:
train.head()

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Heating_Wall,Kitchen Qual_Po,Mas Vnr Type_CBlock,Exterior 1st_PreCast,Exterior 2nd_Other,Exterior 2nd_PreCast,Sale Type_VWD,Roof Matl_Metal,Roof Matl_Roll,Heating_GasA
0,109,533352170,0.0,13517,6,8,1976,2005,289.0,533.0,...,0,0,0,0,0,0,0,0,0,0
1,544,531379050,43.0,11492,7,5,1996,1997,132.0,637.0,...,0,0,0,0,0,0,0,0,0,0
2,153,535304180,68.0,7922,5,7,1953,2007,0.0,731.0,...,0,0,0,0,0,0,0,0,0,0
3,318,916386060,73.0,9802,5,5,2006,2007,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,255,906425045,82.0,14235,6,8,1900,1993,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
poly = PolynomialFeatures(degree = 2, interaction_only = False, include_bias = False)

In [4]:
features = train.drop(columns = 'SalePrice')._get_numeric_data()
X = features

In [5]:
X_overfit = poly.fit_transform(X)

In [6]:
poly.get_feature_names(features.columns)

['Id',
 'PID',
 'Lot Frontage',
 'Lot Area',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Mo Sold',
 'Yr Sold',
 'porch sf',
 'age at sale',
 'age remod',
 'total baths',
 'garage area_cars',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_Greens',
 'Neighborhood_GrnHill',
 'Neighborhood_IDOTRR',
 'Neighborhood_Landmrk',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighbo

In [7]:
X_overfit.shape

(2046, 37400)

In [8]:
pd.DataFrame(X_overfit,
             columns = poly.get_feature_names(features.columns))

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Sale Type_VWD^2,Sale Type_VWD Roof Matl_Metal,Sale Type_VWD Roof Matl_Roll,Sale Type_VWD Heating_GasA,Roof Matl_Metal^2,Roof Matl_Metal Roof Matl_Roll,Roof Matl_Metal Heating_GasA,Roof Matl_Roll^2,Roof Matl_Roll Heating_GasA,Heating_GasA^2
0,109.0,533352170.0,0.0,13517.0,6.0,8.0,1976.0,2005.0,289.0,533.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,544.0,531379050.0,43.0,11492.0,7.0,5.0,1996.0,1997.0,132.0,637.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,153.0,535304180.0,68.0,7922.0,5.0,7.0,1953.0,2007.0,0.0,731.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,318.0,916386060.0,73.0,9802.0,5.0,5.0,2006.0,2007.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,255.0,906425045.0,82.0,14235.0,6.0,8.0,1900.0,1993.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2041,1587.0,921126030.0,79.0,11449.0,8.0,5.0,2007.0,2007.0,0.0,1011.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2042,785.0,905377130.0,0.0,12342.0,4.0,5.0,1940.0,1950.0,0.0,262.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2043,916.0,909253010.0,57.0,7558.0,6.0,6.0,1928.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2044,639.0,535179160.0,80.0,10400.0,4.0,5.0,1956.0,1956.0,0.0,155.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
def evaluate_model(model, X, y): 
    
    y_train_hat = ols.predict(X)

    mse = mean_squared_error(y, y_train_hat)
    rmse = mse ** .5
    r2 = r2_score(y, y_train_hat)

    print(f'R^2: {r2}')
    print(f'MSE: {mse}')
    print(f'RMSE: {rmse}')

In [10]:
X = X_overfit
y = train['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

ols = LinearRegression()
ols.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
evaluate_model(ols, X_train, y_train)

R^2: 0.9703537452817792
MSE: 183586540.03841567
RMSE: 13549.411058729293


In [12]:
evaluate_model(ols, X_test, y_test)

R^2: -19.161491115178784
MSE: 131544221716.90463
RMSE: 362690.2558891052


In [13]:
cross_val_score(ols, X_overfit, y).mean()

-0.18833025345581564

In [14]:
# Create train/test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y,
    test_size=0.7,
    random_state=42
)

In [15]:
# Scale our data.
# Relabeling scaled data as "Z" is common.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [16]:
print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (613, 37400)
y_train shape is: (613,)
Z_test shape is: (1433, 37400)
y_test shape is: (1433,)


In [17]:
ols = LinearRegression()
ols.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
ols.score(Z_train, y_train)

1.0

In [19]:
ols.score(Z_test, y_test)

-1070.9274691803937

In [20]:
from sklearn.linear_model import Ridge

In [21]:
# Instantiate.
ridge = Ridge(alpha = 10)
# Fit.
ridge.fit(Z_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [22]:
ridge.score(Z_train, y_train)

0.999992656578096

In [23]:
ridge.score(Z_test, y_test)

0.9011071984322111

In [24]:
from sklearn.linear_model import RidgeCV

In [25]:
r_alphas = np.logspace(0, 5, 100)

In [26]:
ridge_cv = RidgeCV(
    alphas = r_alphas,
    scoring = 'r2',
    cv = 5
)

# Fit model using best ridge alpha!
ridge_cv.fit(Z_train, y_train)

RidgeCV(alphas=array([1.00000000e+00, 1.12332403e+00, 1.26185688e+00, 1.41747416e+00,
       1.59228279e+00, 1.78864953e+00, 2.00923300e+00, 2.25701972e+00,
       2.53536449e+00, 2.84803587e+00, 3.19926714e+00, 3.59381366e+00,
       4.03701726e+00, 4.53487851e+00, 5.09413801e+00, 5.72236766e+00,
       6.42807312e+00, 7.22080902e+00, 8.11130831e+00, 9.11162756e+00,
       1.02353102e+01, 1.14975700e+0...
       1.09749877e+04, 1.23284674e+04, 1.38488637e+04, 1.55567614e+04,
       1.74752840e+04, 1.96304065e+04, 2.20513074e+04, 2.47707636e+04,
       2.78255940e+04, 3.12571585e+04, 3.51119173e+04, 3.94420606e+04,
       4.43062146e+04, 4.97702356e+04, 5.59081018e+04, 6.28029144e+04,
       7.05480231e+04, 7.92482898e+04, 8.90215085e+04, 1.00000000e+05]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring='r2',
        store_cv_values=False)

In [27]:
ridge_cv.alpha_

422.9242874389499

In [28]:
ridge_cv.score(Z_train, y_train)

0.9983823996788834

In [29]:
ridge_cv.score(Z_test, y_test)

0.910860718013533

In [30]:
from sklearn.linear_model import Lasso, LassoCV

In [31]:
# Reminders
print(" OLS ".center(18, "="))
print(ols.score(Z_train, y_train))
print(ols.score(Z_test, y_test))
print()
print(" Ridge ".center(18, "="))
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))

1.0
-1070.9274691803937

0.9983823996788834
0.910860718013533


In [32]:
l_alphas = np.logspace(-10, 0, 100)

In [33]:
# Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas = l_alphas, cv = 5, max_iter = 5000)

# Fit model using best ridge alpha!
lasso_cv.fit(Z_train, y_train)

LassoCV(alphas=array([1.00000000e-10, 1.26185688e-10, 1.59228279e-10, 2.00923300e-10,
       2.53536449e-10, 3.19926714e-10, 4.03701726e-10, 5.09413801e-10,
       6.42807312e-10, 8.11130831e-10, 1.02353102e-09, 1.29154967e-09,
       1.62975083e-09, 2.05651231e-09, 2.59502421e-09, 3.27454916e-09,
       4.13201240e-09, 5.21400829e-09, 6.57933225e-09, 8.30217568e-09,
       1.04761575e-08, 1.32194115e-0...
       7.74263683e-02, 9.77009957e-02, 1.23284674e-01, 1.55567614e-01,
       1.96304065e-01, 2.47707636e-01, 3.12571585e-01, 3.94420606e-01,
       4.97702356e-01, 6.28029144e-01, 7.92482898e-01, 1.00000000e+00]),
        copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=5000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [34]:
lasso_cv.alpha_

1e-10

In [35]:
lasso_cv.score(Z_train, y_train)

0.9999986798877915

In [36]:
lasso_cv.score(Z_test, y_test)

0.6964325439514165

# TEST

In [53]:

test = pd.read_csv('../datasets/test_eng.csv', index_col = 'Unnamed: 0')

In [44]:
test.shape

(878, 274)

In [45]:
poly = PolynomialFeatures(degree = 2, interaction_only = False, include_bias = False)

In [46]:
features = test._get_numeric_data()
X = features

In [47]:
X_overfit = poly.fit_transform(X)

In [48]:
ss = StandardScaler()
Z_kaggle = ss.fit_transform(X_overfit)

In [49]:
test['SalePrice'] = ridge_cv.predict(Z_kaggle)

In [50]:
ridge_cv.score(X_overfit, test['SalePrice'])

-1.610186271558304e+29

In [51]:
evaluate_model(ridge_cv, X_overfit, test['SalePrice'])

R^2: -1.3165011653744182e+28
MSE: 2.0446741215524768e+37
RMSE: 4.5218072952664366e+18


In [58]:
pd.DataFrame(X_overfit,
             columns = poly.get_feature_names(features.columns))

Unnamed: 0,Id,PID,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,...,Condition 2_RRNn^2,Condition 2_RRNn Roof Matl_Membran,Condition 2_RRNn Heating_OthW,Condition 2_RRNn Heating_Wall,Roof Matl_Membran^2,Roof Matl_Membran Heating_OthW,Roof Matl_Membran Heating_Wall,Heating_OthW^2,Heating_OthW Heating_Wall,Heating_Wall^2
0,2658.0,902301120.0,69.0,9142.0,6.0,8.0,1910.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2718.0,905108090.0,0.0,9662.0,5.0,4.0,1977.0,1977.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2414.0,528218130.0,58.0,17104.0,7.0,5.0,2006.0,2006.0,0.0,554.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1989.0,902207150.0,60.0,8520.0,5.0,6.0,1923.0,2006.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,625.0,535105100.0,0.0,9500.0,6.0,5.0,1963.0,1963.0,247.0,609.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,1662.0,527377110.0,80.0,8000.0,6.0,6.0,1974.0,1974.0,0.0,931.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
874,1234.0,535126140.0,90.0,14670.0,6.0,7.0,1966.0,1999.0,410.0,575.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
875,1373.0,904100040.0,55.0,8250.0,5.0,5.0,1968.0,1968.0,0.0,250.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
876,1672.0,527425140.0,60.0,9000.0,4.0,6.0,1971.0,1971.0,0.0,616.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
poly = PolynomialFeatures(degree = 2, interaction_only = False, include_bias = False)

In [60]:
features = test._get_numeric_data()
X = features

In [61]:
list(features.columns)

['Id',
 'PID',
 'Lot Frontage',
 'Lot Area',
 'Overall Qual',
 'Overall Cond',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'BsmtFin SF 1',
 'BsmtFin SF 2',
 'Bsmt Unf SF',
 'Total Bsmt SF',
 '1st Flr SF',
 '2nd Flr SF',
 'Low Qual Fin SF',
 'Gr Liv Area',
 'Bsmt Full Bath',
 'Bsmt Half Bath',
 'Full Bath',
 'Half Bath',
 'Bedroom AbvGr',
 'Kitchen AbvGr',
 'TotRms AbvGrd',
 'Fireplaces',
 'Garage Cars',
 'Garage Area',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Mo Sold',
 'Yr Sold',
 'porch sf',
 'age at sale',
 'age remod',
 'total baths',
 'garage area_cars',
 'gr lv area_totrms abvgrd',
 'gr lv area_log',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_Greens',
 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighborh

In [62]:
X_kaggle = X_overfit
test['SalePrice'] = ols.predict(X_kaggle)

In [95]:
test[['Id', 'SalePrice']].to_csv('../output_csvs/7_9_1_preds.csv', index = False)

In [14]:
# Scale our data.
# Relabeling scaled data as "Z" is common.
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [63]:
from sklearn.linear_model import RidgeCV

In [64]:
r_alphas = np.logspace(0, 5, 100)

In [65]:
ridge_cv = RidgeCV(
    alphas = r_alphas,
    scoring = 'r2',
    cv = 5
)

# Fit model using best ridge alpha!
ridge_cv.fit(Z_train, y_train)

KeyboardInterrupt: 

In [26]:
ridge_cv.alpha_

599.4842503189409

In [27]:
ridge_cv.score(Z_train, y_train)

0.9978185453657265

In [28]:
ridge_cv.score(Z_test, y_test)

0.9122973701322867