In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
# Five-colour palette applied to every seaborn plot in this notebook.
colors = [
    "#FF0B04",
    "#F1BE48",
    "#B9975B",
    "#8B5B29",
    "#524727",
]
sns.set_palette(sns.color_palette(colors))

In [3]:
# Load the raw housing data; the first CSV column becomes the index.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
df = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0, low_memory=False)

In [4]:
# Build the train/test frames with the project helper.
# NOTE(review): num_to_cat_list presumably names numeric columns to treat as
# categorical -- confirm against helper.data_processing_wrapper.
# PID is kept (remove_PID=False); a later cell counts rows per neighborhood with it.
train, test = helper.data_processing_wrapper(
    df,
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=False,
)

In [5]:
# Add the natural log of the sale price to both splits; later cells use it as the target.
for frame in (train, test):
    frame['LogSalePrice'] = np.log(frame['SalePrice'])

In [6]:
# Per-neighborhood medians of log price and above-ground living area.
# Select the two columns *before* aggregating: calling .median() on the whole
# grouped frame also tries to aggregate the object-typed columns, which raises
# TypeError on pandas >= 2.0 and is wasted work on older versions.
nhds = train.groupby('Neighborhood')[['LogSalePrice', 'GrLivArea']].median()

In [7]:
# Share of training rows in each neighborhood, ordered by the groupby keys
# (the same ordering as the `nhds` index). Counting only the PID column and
# dividing vectorially replaces a count over every column followed by a
# row-wise apply.
weights = train.groupby('Neighborhood')['PID'].count().div(len(train)).to_list()

In [8]:
# Cluster neighborhoods into two groups on their standardized median log price
# and median living area, weighting each neighborhood by its share of rows.
scaler = StandardScaler()
# Scale only the two median columns so that re-running this cell (after
# 'Cluster' has been appended below) does not feed the old labels back in.
# A descriptive name is used instead of `_`, which IPython reserves for the
# last cell output.
nhds_scaled = scaler.fit_transform(nhds[['LogSalePrice', 'GrLivArea']])
# n_init is pinned to the historical default of 10; scikit-learn 1.4 changed
# the default to 'auto', which would alter the fitted clusters.
clusterer = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = clusterer.fit_predict(nhds_scaled, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [9]:
# Neighborhood name -> cluster label, used to tag individual houses.
cluster_dict = nhds.loc[:, 'Cluster'].to_dict()

In [10]:
# Tag every house with the cluster of its neighborhood.
# Series.map over the single column replaces a row-wise DataFrame.apply, which
# built a full row Series per house just to read one field. The explicit
# dict lookup is kept so a neighborhood missing from cluster_dict still raises
# KeyError instead of silently producing NaN.
train['NhdCluster'] = train['Neighborhood'].map(lambda nhd: cluster_dict[nhd])
test['NhdCluster'] = test['Neighborhood'].map(lambda nhd: cluster_dict[nhd])

In [11]:
# Names of the object- and bool-typed columns; these are the ones one-hot encoded later.
categorical = list(train.select_dtypes(include=['object', 'bool']).columns)

In [12]:
# Columns removed from the feature matrix: the target in both forms, the PID
# identifier and the cluster label used only for splitting.
# NOTE(review): TotalBsmtSF and GrLivArea are dropped as well -- presumably
# because they are redundant with other area columns; confirm.
NON_FEATURE_COLS = ['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster']


def split_by_cluster(frame, cluster):
    """Return ``(X, y)`` for the rows of ``frame`` assigned to ``cluster``.

    Parameters
    ----------
    frame : DataFrame containing 'NhdCluster', 'LogSalePrice' and every
        column listed in NON_FEATURE_COLS.
    cluster : cluster label to select.

    Returns
    -------
    X : the selected rows without the NON_FEATURE_COLS columns.
    y : the 'LogSalePrice' values of the selected rows.
    """
    in_cluster = frame['NhdCluster'] == cluster
    features = frame.loc[in_cluster, :].drop(NON_FEATURE_COLS, axis=1)
    target = frame.loc[in_cluster, 'LogSalePrice']
    return features, target


X0_train, y0_train = split_by_cluster(train, 0)
X0_test, y0_test = split_by_cluster(test, 0)
X1_train, y1_train = split_by_cluster(train, 1)
X1_test, y1_test = split_by_cluster(test, 1)

In [13]:
# Lasso-based feature selection for cluster 0.
# One-hot encode the categorical columns, pass the remaining columns through
# unchanged, then standardize every resulting column.
pipe = Pipeline(
    [
        ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)],
                                          remainder='passthrough')),
        ('scaler', StandardScaler()),
    ]
)

X = pipe.fit_transform(X0_train)
y = y0_train

# Keep the features that SelectFromModel retains from the fitted Lasso.
selector = SelectFromModel(Lasso(alpha=0.04, max_iter=10000))
selector.fit(X, y)
mask0 = selector.get_support()

# ColumnTransformer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer it while it exists so the names keep the format seen in earlier runs;
# otherwise fall back to get_feature_names_out(), which formats names
# differently.
col_transformer = pipe.named_steps['transformer']
if hasattr(col_transformer, 'get_feature_names'):
    feat_names = col_transformer.get_feature_names()
else:
    feat_names = col_transformer.get_feature_names_out()
names0 = [name for name, boo in zip(feat_names, mask0) if boo]
print(len(names0))
names0

12


['OverallQual',
 'MasVnrArea',
 'BsmtQual',
 'BsmtFinSF1',
 '1stFlrSF',
 '2ndFlrSF',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'FireplaceQu',
 'GarageCars',
 'GarageArea']

In [14]:
# Lasso-based feature selection for cluster 1.
# One-hot encode the categorical columns, pass the remaining columns through
# unchanged, then standardize every resulting column.
pipe = Pipeline(
    [
        ('transformer', ColumnTransformer([("Cat", OneHotEncoder(handle_unknown='ignore'), categorical)],
                                          remainder='passthrough')),
        ('scaler', StandardScaler()),
    ]
)

X = pipe.fit_transform(X1_train)
y = y1_train

# Keep the features that SelectFromModel retains from the fitted Lasso.
selector = SelectFromModel(Lasso(alpha=0.0465, max_iter=10000))
selector.fit(X, y)
mask1 = selector.get_support()

# ColumnTransformer.get_feature_names() was removed in scikit-learn 1.2.
# Prefer it while it exists so the names keep the format seen in earlier runs;
# otherwise fall back to get_feature_names_out(), which formats names
# differently.
col_transformer = pipe.named_steps['transformer']
if hasattr(col_transformer, 'get_feature_names'):
    feat_names = col_transformer.get_feature_names()
else:
    feat_names = col_transformer.get_feature_names_out()
names1 = [name for name, boo in zip(feat_names, mask1) if boo]
print(len(names1))
names1

12


['Cat__x0_30',
 'Cat__x22_N',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearRemodAdd',
 'BsmtFinSF1',
 '1stFlrSF',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageArea']

In [15]:
# Features selected for cluster 1 but not for cluster 0 (order of names1 preserved).
list(filter(lambda feature: feature not in names0, names1))

['Cat__x0_30', 'Cat__x22_N', 'LotArea', 'OverallCond', 'YearRemodAdd']

In [16]:
# Features selected for cluster 0 but not for cluster 1 (order of names0 preserved).
list(filter(lambda feature: feature not in names1, names0))

['MasVnrArea', 'BsmtQual', '2ndFlrSF', 'FireplaceQu', 'GarageCars']

In [17]:
# Position -> column name for the categorical columns, for decoding the
# 'x<N>' indices in encoded feature names such as 'Cat__x0_30'.
{position: column for position, column in enumerate(categorical)}

{0: 'MSSubClass',
 1: 'MSZoning',
 2: 'Street',
 3: 'LotShape',
 4: 'LandContour',
 5: 'Utilities',
 6: 'LotConfig',
 7: 'LandSlope',
 8: 'Neighborhood',
 9: 'Condition1',
 10: 'Condition2',
 11: 'BldgType',
 12: 'HouseStyle',
 13: 'RoofStyle',
 14: 'RoofMatl',
 15: 'Exterior1st',
 16: 'Exterior2nd',
 17: 'MasVnrType',
 18: 'Foundation',
 19: 'BsmtFinType1',
 20: 'BsmtFinType2',
 21: 'Heating',
 22: 'CentralAir',
 23: 'Electrical',
 24: 'Functional',
 25: 'GarageType',
 26: 'Fence',
 27: 'MiscFeature',
 28: 'MoSold',
 29: 'SaleType',
 30: 'SaleCondition'}