In [15]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from scipy import stats
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [3]:
df = pd.read_csv('Ames_Housing_Price_Data.csv', 
                             index_col=0,low_memory = False)

In [4]:
train, test = helper.data_processing_wrapper(df, num_to_cat_list=[], remove_PID=False)

In [5]:
train['LogSalePrice'] = np.log(train['SalePrice'])
test['LogSalePrice'] = np.log(test['SalePrice'])

In [6]:
categorical = train.select_dtypes(['object','bool']).columns.to_list()

In [7]:
nhds = train.groupby('Neighborhood').median()[['LogSalePrice']]
nhds['LogSalePrice'] = stats.zscore(nhds['LogSalePrice'])

In [8]:
def segment(y):
    if round(y,2) < -0.75:
        return 0
    elif (round(y,2) >= -0.75) and (round(y,2) < 0.75):
        return 1
    else:
        return 2

In [9]:
nhds['Segment'] = nhds.apply(lambda x: segment(x['LogSalePrice']),axis = 1)
nhds.sort_values('LogSalePrice')
seg_dict = nhds.drop('LogSalePrice', axis=1).to_dict()['Segment']

In [10]:
train['Segment'] = train.apply(lambda x: seg_dict[x['Neighborhood']], axis=1)
test['Segment'] = test.apply(lambda x: seg_dict[x['Neighborhood']], axis=1)

In [11]:
X0_train = train.loc[train['Segment']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y0_train = train.loc[train['Segment']==0, 'LogSalePrice']
X0_test = test.loc[test['Segment']==0,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y0_test = test.loc[test['Segment']==0, 'LogSalePrice']
X1_train = train.loc[train['Segment']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y1_train = train.loc[train['Segment']==1, 'LogSalePrice']
X1_test = test.loc[test['Segment']==1,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y1_test = test.loc[test['Segment']==1, 'LogSalePrice']
X2_train = train.loc[train['Segment']==2,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y2_train = train.loc[train['Segment']==2, 'LogSalePrice']
X2_test = test.loc[test['Segment']==2,:].drop(['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'Segment'], axis=1)
y2_test = test.loc[test['Segment']==2, 'LogSalePrice']

In [12]:
print(X0_train.shape)
print(X1_train.shape)
print(X2_train.shape)
print('\n')
print(y0_train.shape)
print(y1_train.shape)
print(y2_train.shape)
print('\n')
print(X0_test.shape)
print(X1_test.shape)
print(X2_test.shape)
print('\n')
print(y0_test.shape)
print(y1_test.shape)
print(y2_test.shape)

(448, 77)
(1058, 77)
(365, 77)


(448,)
(1058,)
(365,)


(151, 77)
(352, 77)
(121, 77)


(151,)
(352,)
(121,)


In [16]:
def Lasso_select(X, y, alpha):

    pipe = Pipeline(
        [
            ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), categorical)], 
                                                remainder='passthrough')),
            ('scaler', StandardScaler(with_mean=False))
        ]
    )
    
    X = pipe.fit_transform(X)

    cv = KFold(n_splits=4, shuffle=True, random_state=42)

    cross = cross_val_score(Lasso(alpha=alpha, max_iter=10000), X, y, scoring='r2', cv=cv, n_jobs=-1)
    
    selector = SelectFromModel(Lasso(alpha=alpha, max_iter=10000))
    selector.fit(X,y)
    num_features = np.sum(selector.get_support())
    
    return cross, num_features

In [17]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X0_train,y0_train,alpha))

(array([0.78941879, 0.76863669, 0.72941127, 0.66847486]), 171)
(array([0.79817129, 0.78525107, 0.77375949, 0.6763447 ]), 169)
(array([0.84104472, 0.82330012, 0.82245825, 0.73584817]), 137)
(array([0.82073526, 0.85426339, 0.82676473, 0.77259892]), 45)
(array([0.45329195, 0.50554314, 0.46004183, 0.39552772]), 3)


In [18]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X1_train,y1_train,alpha))

(array([0.85240851, 0.89211622, 0.89601581, 0.8804302 ]), 183)
(array([0.85880552, 0.89333105, 0.89892948, 0.89224062]), 176)
(array([0.87665655, 0.90040676, 0.91032468, 0.91019296]), 131)
(array([0.87556795, 0.87537563, 0.89184603, 0.87375896]), 31)
(array([0.41619543, 0.42964148, 0.44899612, 0.38193065]), 3)


In [19]:
for alpha in np.logspace(-5, -1, 5):
    print(Lasso_select(X2_train,y2_train,alpha))

(array([0.81788142, 0.62022047, 0.63117031, 0.68286282]), 140)
(array([0.89087294, 0.84811896, 0.65904893, 0.76879268]), 135)
(array([0.92610475, 0.8820161 , 0.86910954, 0.88193346]), 96)
(array([0.90474649, 0.86021809, 0.91197627, 0.88873893]), 37)
(array([0.6154174 , 0.56001479, 0.54235256, 0.5568941 ]), 4)
