## Ridge Regression with Lasso-Limited Features

In [1]:
import pandas as pd
import numpy as np
import helper
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

from sklearn.linear_model import ElasticNet, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [2]:
def net_grid(train, test, cat_feats, alpha, l1_ratio, cv_):
    
    scaler = StandardScaler(with_mean=False)
    net = ElasticNet(max_iter = 50000)

    X = train.drop(['SalePrice','PID'],axis=1)
    transformer = ColumnTransformer([("Cat", 
                                      OneHotEncoder(handle_unknown = 'ignore'), 
                                      cat_feats)], remainder='passthrough')
    X = transformer.fit_transform(X)
    X = scaler.fit_transform(X)
    y = np.log(train['SalePrice'])
    net.fit(X, y)

    X_tst = test.drop(['SalePrice','PID'],axis=1)
    X_tst = transformer.transform(X_tst)
    X_tst = scaler.transform(X_tst)
    y_tst = np.log(test['SalePrice'])
    
    tuned_parameters = [{'alpha': alpha, 'l1_ratio': l1_ratio}]
    # print(f'Performing Grid Search with alphas of: {alphas}')
    clf = GridSearchCV(net, tuned_parameters, cv=cv_)
    clf.fit(X, y)
    
    
    tst_score = clf.score(X_tst, y_tst)
    print(f"test score: {tst_score}")
    
    return clf


In [3]:
# loading and splitting data
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col = 0,
                      low_memory=False)

train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass','MoSold'],
                                             remove_PID = False
                                        )
train, test = helper.feature_engineering_wrapper(train,test)

In [5]:
# importing school feature
schools = pd.read_csv('schoolFeatures.csv',index_col = 0)
school_keep = [
    'PID',
    'closestSchool'
]
schools = schools[school_keep]

# merge school feature onto original data set.
train = train.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')
test = test.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')

train = train.dropna(subset=['closestSchool'])
train = train.reset_index(drop=True)

test = test.dropna(subset=['closestSchool'])
test = test.reset_index(drop=True)

In [6]:
cat_feats = train.select_dtypes(['object','bool']).columns.to_list()
num_feats = train.select_dtypes(['int','float']).columns.to_list()

In [7]:
print(f'train size: {train.shape}, test size: {test.shape}')

train size: (1856, 129), test size: (621, 129)


In [None]:
coef_df = pd.read_csv('lasso_coef.csv',index_col=0) #Hayden shared this on Slack
coef_df.features.to_list();

# Ridge

In [None]:
scaler = StandardScaler(with_mean=False)
net = ElasticNet(max_iter = 50000)
X = train.drop(['SalePrice'],axis=1)
cat_feats = X.select_dtypes(['object','bool']).columns.to_list()
num_feats = X.select_dtypes(['int64','float64']).columns.to_list()
transformer = ColumnTransformer([("Cat", 
                                      OneHotEncoder(handle_unknown = 'ignore'), 
                                      cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)
X = scaler.fit_transform(X)
y = np.log(train['SalePrice'])
# net.fit(X, y)

X_tst = test.drop(['SalePrice'],axis=1)
X_tst = transformer.transform(X_tst)
X_tst = scaler.transform(X_tst)
y_tst = np.log(test['SalePrice'])

# add column names to the X Dataframe
col_names = transformer.named_transformers_['Cat'].get_feature_names(input_features= cat_feats)
new_columns = list(col_names)+num_feats
X = pd.DataFrame(X, columns=new_columns)
X_tst = pd.DataFrame(X_tst, columns=new_columns)

In [None]:
X = X[coef_df['features']]
X_tst = X_tst[coef_df['features']]
print(f'train size: {X_tst.shape}, test size: {y_tst.shape}')

In [18]:
ridge = Ridge()
alpha = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.949411242949229

In [19]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9211997221838475
best params: {'alpha': 6}


In [20]:
alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9495383326380228

In [21]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')


test score: 0.9214209260645868
best params: {'alpha': 15.0}


In [22]:
alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9495383326380228

In [23]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9214209260645868
best params: {'alpha': 15.0}


In [24]:
## Round 3

alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9496177681879183

In [25]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9217873594983708
best params: {'alpha': 37.5}


In [26]:
## Round 4

alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9496189389335381

In [27]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9217390613811546
best params: {'alpha': 33.75}


In [28]:
## Round 5

alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9496180333736712

In [29]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9217159637862055
best params: {'alpha': 32.0625}


In [30]:
## Round 6

alpha = [best_alpha*0.8, best_alpha*0.85, best_alpha*0.9, best_alpha*0.95,
         best_alpha*0.1, best_alpha*1.5, best_alpha*2, best_alpha*2.5]
tuned_parameters = [{'alpha': alpha}]
clf = GridSearchCV(ridge, tuned_parameters, cv=5);
clf.fit(X, y)
clf.best_score_

0.9496162674327542

In [31]:
tst_score = clf.score(X_tst, y_tst)
print(f"test score: {tst_score}")
print(f'best params: {clf.best_params_}')
best_alpha = clf.best_params_['alpha']

test score: 0.9216931791415927
best params: {'alpha': 30.459374999999998}


## Get coefficient table

In [27]:
# Get top coefficients
coef_table = pd.DataFrame({ 'features': coef_df['features'] , 'coefficients': clf.best_estimator_.coef_})
coef_table['absCoef'] = np.abs(coef_table['coefficients'])
coef_table = coef_table.sort_values('absCoef', ascending=False)
top10coef = coef_table[['features', 'absCoef']]
coef_table.head(15)

Unnamed: 0,features,coefficients,absCoef
0,GrLivArea_square_root,0.073881,0.073881
1,year_since_built_square_root,-0.070151,0.070151
2,total_sf_square_root,0.059132,0.059132
3,overall_score_square_root,0.039542,0.039542
4,LotArea_square_root,0.038704,0.038704
6,OverallQual_cubed,0.037801,0.037801
5,total_high_qual_finished_sf,0.02561,0.02561
7,Neighborhood_log_comp,0.022867,0.022867
20,OverallCond,0.019329,0.019329
10,Neighborhood_Crawfor,0.014146,0.014146


## Graphing
### Written by Hayden

In [None]:
import matplotlib.patches as mpatches

colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]

coef_table = coef_table[coef_table['coefficients']!=0]
coef_table = coef_table.sort_values('absCoef',ascending=False).reset_index(drop=True)
coef_table.loc[:,'sign'] = np.sign(coef_table.loc[:,'coefficients'])

graph_df = coef_table.loc[0:19,:]
sign_=list(graph_df.sign)
color_map = {1:colors[0],-1:colors[1]}
sign_colors = map(color_map.get, sign_)
g = sns.barplot(data = graph_df,y='features',x='absCoef',palette=sign_colors)

pos_patch = mpatches.Patch(color=colors[0], label='Positive Coefficient')
neg_patch = mpatches.Patch(color=colors[1], label='Negative Coefficient')
plt.legend(handles=[pos_patch,neg_patch])

plt.title('Ridge Top 20 Coefficient')
plt.xlabel(r'Coefficient Value')
plt.ylabel('')

g.set_yticklabels(labels = [
    r'Above Ground $ft^{2}$ $\sqrt{~~}$',
    r'Years Since Built $\sqrt{~~}$',
    r'Total $ft^{2}$ $\sqrt{~~}$',
    r'Overall Score $\sqrt{~~}$',
    r'Lot Area $\sqrt{~~}$',
    r'Overall Quality$^{3}$',
    'Total High Quality $ft^{2}$',
    'Neighborhood Comp',
    'Overall Condition',
    'Neighborhood: Crawford',
    r'Basement Exposure$^{3}$',
    'Closest School: Abbie Sawyer',
    'Brick Face Exterior1st',
    'Garage Cars',
    'Neighborhood: Brookside',
    'Garage Area',
    'MSZoning Comp',
    'Neighborhood: Sawyer West',
    r'Kitchen Qualilty$^{3}$',
    'Basement $ft^{2}$ Finish 1',
])

plt.show()