# Regularization Notebook

### Goal: I have reached a point where I believe my model suffers from bias, and am looking for a way to add complexity without overfitting, so I am going to implement regularization to attempt to achieve that.

In [55]:
# import necessary modules
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV

sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

pd.options.display.max_columns = 1000

In [56]:
# load training data
ames = pd.read_csv("../data/train.csv")

In [57]:
# define function to clean the data
def clean_ames_data(df):
    '''Generalized function to clean a sample of Ames Housing Data'''
    
    # convert column names to useable format
    df.columns = [x.lower().replace(' ','_') for x in df.columns]
    
    # drop 'id' and 'pid' columns
    #df.drop(['id','pid'], axis=1, inplace=True)

    # Dealing with NaN values. Handling the special case of Masonry Veneer Type first
    
    df['mas_vnr_type'].fillna(value = 'None', inplace = True) # Assuming 'NaN' should be 'None' for Masonry Type
    df['mas_vnr_area'].fillna(value = 0.0, inplace = True) # Assuming masonry area is 0.0 for houses with 'NaN' type
    
    # for categorical variables, the missing values should actually be marked 'NA'
    nulls = df.columns[df.isnull().any()]
    for col in df[nulls].select_dtypes(include = 'object').columns:
        df[col].fillna(value = 'NA', inplace = True)
    
    # filtering for houses with no basement, replacing numerical columns 'NaNs' with 0.0
    no_bsmt = df['bsmt_qual'] == 'NA'    
    for col in df[no_bsmt].filter(regex = 'bsmt'):
        df[col].fillna(value = 0.0, inplace = True)
        
    # use the same procedure to handle numerical columns for houses with no garage
    no_garage = df['garage_type'] == 'NA' 
    for col in df[no_garage].filter(regex = 'garage'):
        df[col].fillna(value = 0.0, inplace = True)

In [58]:
# clean the data
clean_ames_data(ames)

In [59]:
# Split data into X an y
X = ames.loc[:,ames.columns != 'saleprice']
y = ames['saleprice']

In [60]:
# Create categorical variable for Location
X['Location'] = X['neighborhood']
X['Location'].replace({'MeadowV':'Low','IDOTRR': 'Low','BrDale': 'Low','OldTown': 'Low',
                                'Edwards':'Low','BrkSide':'Low', 'Landmrk': 'LowMed','Sawyer': 'LowMed',
                                'SWISU':'LowMed','NAmes':'LowMed','NPkVill':'LowMed','Blueste':'LowMed',
                                'Mitchel':'LowMed','Gilbert':'MedHigh','Greens':'MedHigh','SawyerW':'MedHigh',
                                'NWAmes':'MedHigh','Blmngtn':'MedHigh','CollgCr':'MedHigh','ClearCr':'MedHigh',
                                'Crawfor':'MedHigh','Somerst':'High','Timber':'High','Veenker':'High','GrnHill':'High',
                                'NoRidge':'High','NridgHt':'High','StoneBr':'High'}, inplace = True)

In [61]:
# Create the dummy variables I will need

# define a function to create the dummy variables I need
style_mask = X['house_style'] == '2.5Fin'

# Create a dummy to indicate house style is 2.5Fin
X['StyleDummy'] = np.where(X['house_style'] == '2.5Fin', 1, 0)

# Create a dummy for being adjacent to or near a positive feature
X['PosFeature'] = np.where((X['condition_2'] == 'PosN') | (X['condition_2'] == 'PosA'), 1, 0)

# Create dummies for Total Rooms Above Grade
X = pd.get_dummies(X, columns = ['ms_zoning', 'full_bath',
                                 'sale_type','central_air','Location',
                                'garage_cars','exter_qual'], drop_first = True)

In [62]:
numeric_columns = ['overall_qual', 'garage_area','gr_liv_area','year_built','totrms_abvgrd']

In [9]:
dummy_columns = ['StyleDummy','PosFeature',
                 'full_bath_1','full_bath_2','full_bath_3','full_bath_4',
                 'central_air_Y','Location_Low','Location_LowMed','Location_MedHigh',
                 'garage_cars_1.0','garage_cars_2.0','garage_cars_3.0','garage_cars_4.0','garage_cars_5.0',
                 'exter_qual_Fa','exter_qual_Gd','exter_qual_TA']

In [10]:
dummy_columns2 = ['StyleDummy','PosFeature',
                 'central_air_Y','Location_Low','Location_LowMed','Location_MedHigh',
                 'exter_qual_Fa','exter_qual_Gd','exter_qual_TA']

In [63]:
dummy_columns3 = ['central_air_Y','Location_Low','Location_LowMed','Location_MedHigh']

In [44]:
dummy_columns4 = ['PosFeature','central_air_Y','Location_Low','Location_LowMed','Location_MedHigh']

In [64]:
columns = numeric_columns + dummy_columns4

In [65]:
predictors = X[columns]

In [66]:
poly = PolynomialFeatures(include_bias = False)
X_poly = poly.fit_transform(predictors)

In [67]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y)

In [68]:
ss = StandardScaler()
kf = KFold(n_splits = 10, shuffle = True)

In [69]:
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.fit_transform(X_test)

In [70]:
lasso_model = LassoCV(cv = kf)
lasso_model = lasso_model.fit(X_train_scaled, y_train)



In [71]:
lasso_optimal_alpha = lasso_model.alpha_
lasso_optimal_alpha

67.63273182050736

In [72]:
lasso_quick = Lasso(alpha = lasso_optimal_alpha)

In [73]:
cross_val_score(lasso_quick, X_train_scaled, y_train)



array([0.81032253, 0.85569173, 0.8577275 ])