In [1]:
import pandas as pd
import numpy as np
import helper
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the raw Ames housing CSV (first CSV column becomes the index), then hand
# it to the project helper, which returns the train/test split.
housing = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    index_col=0,
    low_memory=False,
)

# MSSubClass and MoSold are listed in num_to_cat_list; PID is retained here
# (remove_PID=False) and is dropped later inside net_grid.
# NOTE(review): the exact effect of both options is defined in helper.py -- confirm there.
train, test = helper.data_processing_wrapper(
    housing,
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=False,
)

In [3]:
# Names of the object- and bool-typed columns of train; net_grid passes this
# list to its one-hot encoder.
cat_feats = train.select_dtypes(include=['object', 'bool']).columns.tolist()

In [4]:
# scaler = StandardScaler(with_mean=False)
# net = ElasticNet()

# X = train.drop(['SalePrice','PID'],axis=1)
# transformer = ColumnTransformer([("Cat", 
#                                   OneHotEncoder(handle_unknown = 'ignore'), 
#                                   cat_feats)], remainder='passthrough')
# X = transformer.fit_transform(X)
# X = scaler.fit_transform(X)
# y = np.log(train['SalePrice'])
# net.fit(X, y)

# X_tst = test.drop(['SalePrice','PID'],axis=1)
# X_tst = transformer.transform(X_tst)
# X_tst = scaler.transform(X_tst)
# y_tst = np.log(test['SalePrice'])

# net.score(X_tst,y_tst)

In [5]:
# ## Set parameters
# l1_ratio = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1]
# alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6]
# max_iter = 50000
# cv = 3

In [6]:
# tuned_parameters = [{'alpha': alphas, 'l1_ratio': l1_ratio}]
# # print(f'Performing Grid Search with alphas of: {alphas}')
# clf = GridSearchCV(net, tuned_parameters, cv=cv)
# clf.fit(X, y)

In [7]:
# clf.best_params_
# best_alpha = clf.best_params_['alpha']
# best_l1_ratio = clf.best_params_['l1_ratio']

In [8]:
# ## Set parameters - round 2
# l1_ratio = [best_l1_ratio * .85, best_l1_ratio * .9, best_l1_ratio * .95, best_l1_ratio, best_l1_ratio * 1.05, 
#             best_l1_ratio * 1.1, best_l1_ratio * 1.15]
# alphas = [best_alpha * .6, best_alpha * .7, best_alpha * .8, best_alpha * .9, 
#                                     best_alpha, best_alpha * 1.1, best_alpha * 1.2, best_alpha * 1.3, 
#                                     best_alpha * 1.4]
# max_iter = 50000
# cv = 3

# tuned_parameters = [{'alpha': alphas, 'l1_ratio': l1_ratio}]
# # print(f'Performing Grid Search with alphas of: {alphas}')
# clf = GridSearchCV(net, tuned_parameters, cv=cv)
# clf.fit(X, y)

In [9]:
def add_price_comp_log_feature(train_, test_, comp_feature):
    """Add a "comparable price" feature: median log(SalePrice) per group.

    The medians are computed from ``train_`` only and then attached to both
    frames as a new column named ``f"{comp_feature}_log_comp"``.

    Parameters
    ----------
    train_ : pandas.DataFrame
        Training frame; must contain ``comp_feature`` and ``SalePrice``.
    test_ : pandas.DataFrame
        Hold-out frame; must contain ``comp_feature``.
    comp_feature : str
        Column whose values define the groups (e.g. ``'Neighborhood'``).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the extra column appended. The inputs
        are not modified.

    Notes
    -----
    * Rows whose group has no median in ``train_`` (a category that only
      occurs in ``test_``, or a missing key) receive the overall median of
      log(SalePrice) in ``train_`` instead of NaN, so downstream estimators
      that reject NaN keep working.
    * If the feature column already exists (the function is applied twice,
      e.g. a notebook cell is re-run), it is recomputed rather than
      duplicated with ``_x`` / ``_y`` suffixes.
    """
    new_col = comp_feature + '_log_comp'

    # Drop a stale copy of the feature so the merges below cannot collide
    # with it. drop() returns a new frame, so the caller's object is untouched.
    train_ = train_.drop(columns=[new_col], errors='ignore')
    test_ = test_.drop(columns=[new_col], errors='ignore')

    # Per-group median of the log price, learned from the training rows only.
    medians = (
        train_.assign(log_SalePrice=np.log(train_['SalePrice']))
        .groupby(comp_feature)
        .agg({'log_SalePrice': 'median'})
    )
    medians.columns = [new_col]

    # Value used for rows whose group was never seen in train_.
    fallback = np.log(train_['SalePrice']).median()

    train_ = train_.merge(medians, how='left', on=comp_feature)
    test_ = test_.merge(medians, how='left', on=comp_feature)
    train_[new_col] = train_[new_col].fillna(fallback)
    test_[new_col] = test_[new_col].fillna(fallback)
    return train_, test_

In [10]:
def add_score_feature(df):
    """Append four quality-times-condition interaction columns to ``df``.

    Each new column is the element-wise product of a ``*Qual`` column and the
    matching ``*Cond`` column. The columns are written onto ``df`` itself
    (the frame is modified in place) and the same object is returned.
    """
    # (new column, quality column, condition column)
    score_spec = (
        ('overall_score', 'OverallQual', 'OverallCond'),
        ('exter_score', 'ExterQual', 'ExterCond'),
        ('bsmt_score', 'BsmtQual', 'BsmtCond'),
        ('garage_score', 'GarageQual', 'GarageCond'),
    )
    for score_col, qual_col, cond_col in score_spec:
        df[score_col] = df[qual_col] * df[cond_col]
    return df


In [11]:
def net_grid(train, test, cat_feats, alpha, l1_ratio, cv_, max_iter=50000):
    """Grid-search an ElasticNet on log(SalePrice) and print the hold-out score.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the features plus the ``SalePrice`` and ``PID``
        columns; both of those columns are dropped from the feature matrix.
    cat_feats : list of str
        Columns to one-hot encode; every other column is passed through.
    alpha, l1_ratio : list
        Candidate values for the two ElasticNet hyperparameters.
    cv_ : int or CV splitter
        Forwarded to ``GridSearchCV(cv=...)``.
    max_iter : int, default 50000
        Iteration cap for each ElasticNet fit.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object (``best_params_``, ``best_score_``, ...).

    Notes
    -----
    The encoder and scaler are fitted on the full training frame before the
    search runs, so each CV fold sees preprocessing statistics that include
    its own validation rows.
    NOTE(review): wrapping the steps in a Pipeline would avoid that, but it
    would rename the keys of ``best_params_`` that callers read.
    """
    # One-hot encode the categorical columns; categories unseen during fit
    # are encoded as all zeros instead of raising.
    transformer = ColumnTransformer(
        [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
        remainder='passthrough',
    )
    # with_mean=False: scale to unit variance without centering, which is the
    # only mode StandardScaler supports if the encoded matrix is sparse.
    scaler = StandardScaler(with_mean=False)

    # Preprocessing is fitted on train and only applied to test.
    X = train.drop(['SalePrice', 'PID'], axis=1)
    X = scaler.fit_transform(transformer.fit_transform(X))
    y = np.log(train['SalePrice'])

    X_tst = test.drop(['SalePrice', 'PID'], axis=1)
    X_tst = scaler.transform(transformer.transform(X_tst))
    y_tst = np.log(test['SalePrice'])

    # The estimator is deliberately not fitted beforehand: GridSearchCV
    # clones it for every candidate and refits the best one itself, so a
    # separate fit with default hyperparameters would be discarded.
    net = ElasticNet(max_iter=max_iter)
    tuned_parameters = [{'alpha': alpha, 'l1_ratio': l1_ratio}]
    clf = GridSearchCV(net, tuned_parameters, cv=cv_)
    clf.fit(X, y)

    # Score of the refitted best estimator on the hold-out rows.
    tst_score = clf.score(X_tst, y_tst)
    print(f"test score: {tst_score}")

    return clf


In [12]:
## Add feature engineered features

# Median log-price per neighborhood (computed on train, attached to both
# frames), followed by the quality*condition score columns.
# NOTE: this cell rebinds train/test, so re-running it feeds the
# already-augmented frames back into the same functions.
comp_feature = 'Neighborhood'
train, test = add_price_comp_log_feature(train, test, comp_feature)
train, test = add_score_feature(train), add_score_feature(test)

In [13]:
## set parameters, first try

# Coarse grid: 10 l1_ratio values x 15 alpha values, searched with 5-fold CV.
l1_ratio = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1]
alpha = [
    0.0001, 0.0003, 0.0006,
    0.001, 0.003, 0.006,
    0.01, 0.03, 0.06,
    0.1, 0.3, 0.6,
    1, 3, 6,
]
cv = 5
clf = net_grid(train, test, cat_feats, alpha=alpha, l1_ratio=l1_ratio, cv_=cv)

test score: 0.9113049605165071


In [14]:
# round 1 best score: GridSearchCV's mean cross-validated score for the best
# (alpha, l1_ratio) pair found by the coarse search
clf.best_score_

0.9396399225651535

In [15]:
# Pull the winning round-1 hyperparameters out of the fitted search object
best_alpha, best_l1_ratio = (
    clf.best_params_[param] for param in ('alpha', 'l1_ratio')
)

In [16]:
# new parameters around best from round 1
# Multiplicative factors applied to the round-1 winners to build a finer grid.
l1_ratio_factors = (0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15)
alpha_factors = (0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)

# ElasticNet only accepts 0 <= l1_ratio <= 1, and the round-1 grid goes up to 1,
# so cap the scaled candidates at 1.0 and drop the duplicates that capping
# creates (dict.fromkeys keeps the original ascending order).
l1_ratio = list(dict.fromkeys(
    min(best_l1_ratio * factor, 1.0) for factor in l1_ratio_factors
))
alpha = [best_alpha * factor for factor in alpha_factors]
# NOTE: max_iter is not passed to net_grid below, so this assignment does not
# affect the search.
max_iter = 50000
cv = 5

clf = net_grid(train, test, cat_feats, alpha, l1_ratio, cv)

test score: 0.9112368324085806


In [17]:
# round 2 best score: mean cross-validated score of the best candidate in the
# refined grid (compare with the round-1 value shown earlier)
clf.best_score_

0.9396566058585527

In [20]:
# Hyperparameters selected by the refined (round 2) search
clf.best_params_

{'alpha': 0.0027, 'l1_ratio': 0.575}