In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from scipy import stats
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
# Five-colour palette installed as the seaborn default for the plots in this notebook.
colors = [
    "#FF0B04",
    "#F1BE48",
    "#B9975B",
    "#8B5B29",
    "#524727",
]
palette = sns.color_palette(colors)
sns.set_palette(palette)

In [3]:
# Load the housing CSV; its first column is used as the frame's index.
csv_path = 'Ames_Housing_Price_Data.csv'
df = pd.read_csv(csv_path, index_col=0, low_memory=False)

In [4]:
# Produce the train/test frames from the raw data via the project-local `helper` module.
# NOTE(review): the helper's cleaning/splitting logic is not visible in this notebook;
# presumably it keeps the PID column because remove_PID=False — confirm in helper.py.
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
# Columns that are one-hot encoded before modelling (order matters for the
# position lookups displayed later in the notebook).
categorical = [
    'MSZoning', 'MSSubClass', 'Street', 'Alley', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Foundation', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'GarageType', 'GarageFinish', 'PavedDrive',
    'MiscFeature', 'SaleType', 'SaleCondition', 'BedroomAbvGr', 'CentralAir',
    'Utilities', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Electrical', 'Functional', 'Fence', 'KitchenAbvGr', 'MoSold', 'YrSold',
]

In [6]:
# Add the natural-log sale price to both splits.
for frame in (train, test):
    frame['LogSalePrice'] = np.log(frame['SalePrice'])

In [7]:
# Median log price per neighbourhood, then standardised across neighbourhoods.
# The column is selected *before* aggregating: calling .median() on the whole
# grouped frame raises TypeError on pandas >= 2.0 when string columns are present.
nhds = train.groupby('Neighborhood')[['LogSalePrice']].median()
nhds['LogSalePrice'] = stats.zscore(nhds['LogSalePrice'])

In [8]:
def segment(y):
    """Bucket a score into segment 0, 1 or 2.

    The value is rounded to two decimals first; rounded values below -0.75
    map to 0, rounded values from -0.75 up to (but excluding) 0.75 map to 1,
    and everything else maps to 2.
    """
    rounded = round(y, 2)
    if rounded < -0.75:
        return 0
    if rounded < 0.75:
        return 1
    return 2

In [9]:
# Assign each neighbourhood a segment from its standardised median log price.
# Series.map replaces the row-wise DataFrame.apply, and the former bare
# `nhds.sort_values(...)` call was dropped: its result was discarded, so it had no effect.
nhds['Segment'] = nhds['LogSalePrice'].map(segment)

# Neighbourhood name -> segment label lookup.
seg_dict = nhds['Segment'].to_dict()

In [10]:
# Tag every row of both splits with its neighbourhood's segment.
# A neighbourhood missing from seg_dict raises KeyError.
for frame in (train, test):
    frame['Segment'] = frame['Neighborhood'].map(lambda nhd: seg_dict[nhd])

In [None]:
# Features: everything except the targets, the PID column and two area columns.
non_feature_cols = ['SalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'LogSalePrice']
X_train = train.drop(columns=non_feature_cols)

# Target frame keeps 'Segment' so rows can be filtered per segment later.
y_train = train[['LogSalePrice', 'Segment']]


In [None]:
def stacked_selector(X, y, selector_params):
    """Run Lasso-based feature selection separately for segments 0, 1 and 2.

    For each segment label ``j`` the rows of ``X`` and ``y`` whose ``'Segment'``
    column equals ``j`` are one-hot encoded (columns listed in the module-level
    ``categorical``), scaled without centring, filtered through
    ``SelectFromModel(Lasso(alpha=selector_params[j]))`` and finally fit with
    ``LinearRegression`` on the surviving columns.

    Parameters
    ----------
    X : DataFrame with a ``'Segment'`` column and every column in ``categorical``.
    y : DataFrame with a ``'Segment'`` column; the remaining column(s) are the target.
    selector_params : sequence indexable by 0, 1, 2 giving the Lasso ``alpha`` per segment.

    Returns
    -------
    (score_dict, select_dict)
        ``score_dict['score_j']`` is ``LinearRegression.score`` evaluated on the
        same rows used for fitting (left at 0 if that score is not above 0).
        ``select_dict['select_j']`` is the list of transformed feature names kept
        by the selector (left empty in the same case).
    """
    score_dict = {'score_0': 0, 'score_1': 0, 'score_2': 0}
    select_dict = {'select_0': [], 'select_1': [], 'select_2': []}

    for j in range(3):
        X_levj = X.loc[X['Segment'] == j, :].drop('Segment', axis=1)
        y_levj = y.loc[y['Segment'] == j, :].drop('Segment', axis=1)

        transformer = ColumnTransformer(
            [("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)],
            remainder='passthrough')
        X_levj = transformer.fit_transform(X_levj)
        # with_mean=False so the scaler also accepts sparse one-hot output.
        scaler = StandardScaler(with_mean=False)
        X_levj = scaler.fit_transform(X_levj)

        lasso = Lasso(alpha=selector_params[j])
        selector = SelectFromModel(estimator=lasso)
        X_levj = selector.fit_transform(X_levj, y_levj)

        ols = LinearRegression()
        ols.fit(X_levj, y_levj)
        score = ols.score(X_levj, y_levj)
        if score > score_dict[f'score_{j}']:
            score_dict[f'score_{j}'] = score
            mask = selector.get_support()
            # get_feature_names() was removed in scikit-learn 1.2; prefer the
            # replacement and fall back only on releases that predate it.
            if hasattr(transformer, 'get_feature_names_out'):
                feat_names = transformer.get_feature_names_out()
            else:
                feat_names = transformer.get_feature_names()
            select_dict[f'select_{j}'] = [str(name) for name, boo in zip(feat_names, mask) if boo]

    return score_dict, select_dict

In [None]:
# One Lasso alpha per segment (0, 1, 2); display the (scores, selected features) pair.
segment_alphas = [0.01, 0.01, 0.01]
selections = stacked_selector(X_train, y_train, segment_alphas)
selections

In [None]:
# Recompute the log target from SalePrice; this overwrites any existing
# 'LogSalePrice' column on `train` with the same formula.
train['LogSalePrice'] = np.log(train['SalePrice'])

In [None]:
# Mean log price for each combination of the seven "comparable home" keys.
# 'LogSalePrice' is selected *before* .mean(): averaging the whole grouped
# frame raises TypeError on pandas >= 2.0 when string columns are present.
comp_dict = (
    train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
                   'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars'])['LogSalePrice']
    .mean()
    .to_dict()
)

In [None]:
# Attach to each training row the mean log price of its comparable-home group.
# The key order must match the groupby that built comp_dict.
comp_columns = ['Neighborhood', 'BedroomAbvGr', 'BldgType',
                'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']
train['Comp'] = train[comp_columns].apply(lambda row: comp_dict[tuple(row)], axis=1)

In [None]:
# Rebuild the design matrix now that 'Comp' has been added to `train`.
# NOTE(review): 'NhdCluster' is not created by any cell shown in this notebook —
# presumably it comes from helper.data_processing_wrapper or an earlier session; confirm.
excluded_cols = ['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea']
X_train = train.drop(columns=excluded_cols)
y_train = train[['LogSalePrice', 'NhdCluster']].copy()

In [None]:
# Per-cluster feature selection with one Lasso alpha per cluster.
# NOTE(review): `multilev_selector` is not defined in the cells shown here (only
# `stacked_selector` is, and that one filters on 'Segment', not 'NhdCluster').
# This cell will raise NameError on a fresh kernel unless the function is defined elsewhere — confirm.
selections = multilev_selector(X_train, y_train, [0.01, 0.01, 0.02])

In [None]:
# Display the 'select_0' entry of the selector's second return value
# (presumably the feature names kept for cluster 0 — confirm against the selector).
selections[1]['select_0']

In [None]:
# Position -> column name lookup for the `categorical` list.
{position: column for position, column in enumerate(categorical)}

In [None]:
# Hand-written list of raw column names used for the cluster-0 model.
select_0 = [
    'MSSubClass', 'SaleType', 'OverallQual', 'YearRemodAdd',
    'ExterQual', 'BsmtFinSF1', 'HeatingQC', '1stFlrSF',
    '2ndFlrSF', 'BsmtFullBath', 'HalfBath', 'TotRmsAbvGrd',
    'Fireplaces', 'FireplaceQu', 'OpenPorchSF', 'Comp',
]

In [None]:
# Members of select_0 that need one-hot encoding, in select_0 order.
cats_0 = list(filter(lambda col: col in categorical, select_0))

In [None]:
# Ridge regression for neighbourhood cluster 0 on the `select_0` columns,
# with alpha tuned by repeated 5-fold cross-validation (scored by R^2).
X = X_train.loc[X_train['NhdCluster'] == 0].drop(columns='NhdCluster')[select_0]
y = y_train.loc[y_train['NhdCluster'] == 0].drop(columns='NhdCluster')

one_hot = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cats_0)],
    remainder='passthrough',
)
pipe = Pipeline([
    ('transformer', one_hot),
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

param_grid = {'ridge__alpha': [0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)
grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X, y)

In [None]:
# Mean CV score per grid candidate, then the winning parameters and their score.
print(
    grid.cv_results_['mean_test_score'],
    grid.best_params_,
    grid.best_score_,
    sep='\n',
)

In [None]:
# Display the 'select_1' entry of the selector's second return value
# (presumably the feature names kept for cluster 1 — confirm against the selector).
selections[1]['select_1']

In [None]:
# Position -> column name lookup for the `categorical` list.
{position: column for position, column in enumerate(categorical)}

In [None]:
# Hand-written list of raw column names used for the cluster-1 model.
select_1 = [
    'MSSubClass', 'BsmtFinType1', 'OverallCond', 'YearRemodAdd',
    'BsmtQual', 'BsmtExposure', 'BsmtFinSF1', '1stFlrSF',
    'HalfBath', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF',
    'ScreenPorch', 'Comp',
]

In [None]:
# Members of select_1 that need one-hot encoding, in select_1 order.
cats_1 = list(filter(lambda col: col in categorical, select_1))

In [None]:
# Ridge regression for neighbourhood cluster 1 on the `select_1` columns,
# with alpha tuned by repeated 5-fold cross-validation (scored by R^2).
X = X_train.loc[X_train['NhdCluster'] == 1].drop(columns='NhdCluster')[select_1]
y = y_train.loc[y_train['NhdCluster'] == 1].drop(columns='NhdCluster')

one_hot = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cats_1)],
    remainder='passthrough',
)
pipe = Pipeline([
    ('transformer', one_hot),
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

param_grid = {'ridge__alpha': [0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)
grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X, y)

In [None]:
# Mean CV score per grid candidate, then the winning parameters and their score.
print(
    grid.cv_results_['mean_test_score'],
    grid.best_params_,
    grid.best_score_,
    sep='\n',
)

In [None]:
# Display the 'select_2' entry of the selector's second return value
# (presumably the feature names kept for cluster 2 — confirm against the selector).
selections[1]['select_2']

In [None]:
# Position -> column name lookup for the `categorical` list.
{position: column for position, column in enumerate(categorical)}

In [None]:
# Hand-written list of raw column names used for the cluster-2 model.
select_2 = [
    'BsmtFinType1', 'LotArea', 'MasVnrArea',
    'BsmtFinSF1', '1stFlrSF', '2ndFlrSF',
    'TotRmsAbvGrd', 'GarageArea', 'Comp',
]

In [None]:
# Members of select_2 that need one-hot encoding, in select_2 order.
cats_2 = list(filter(lambda col: col in categorical, select_2))

In [None]:
# Ridge regression for neighbourhood cluster 2, with alpha tuned by
# repeated 5-fold cross-validation (scored by R^2).
X = X_train
X = X.loc[X['NhdCluster']==2, :].drop('NhdCluster', axis=1)
# Use this cluster's own feature list. The cell previously indexed with
# `select_1`, so the cluster-2 model was trained on cluster 1's columns while
# `cats_2` (derived from `select_2`) drove the one-hot encoding.
X = X[select_2]

y = y_train
y = y.loc[y['NhdCluster']==2, :].drop('NhdCluster', axis=1)


pipe = Pipeline([('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), cats_2)], 
                                            remainder='passthrough')),
                 ('scaler', StandardScaler()),
                 ('ridge', Ridge())])

param_grid = {'ridge__alpha':[0.001, 0.1, 1, 10]}
cv = RepeatedKFold(n_splits=5)

grid = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv, n_jobs=-1)

grid.fit(X,y)

In [None]:
# Mean CV score per grid candidate, then the winning parameters and their score.
print(
    grid.cv_results_['mean_test_score'],
    grid.best_params_,
    grid.best_score_,
    sep='\n',
)