In [3]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper
import missingno as msno

In [4]:
# Project colour palette (five hex colours), installed as the seaborn default.
colors = ["#FF0B04", "#F1BE48", "#B9975B", "#8B5B29", "#524727"]
sns.set_palette(sns.color_palette(palette=colors))

In [5]:
# Load the training data; the first CSV column becomes the row index.
train = pd.read_csv(filepath_or_buffer='train.csv', index_col=0)

In [6]:
# ---- Column roles ---------------------------------------------------------
target = ['SalePrice']
key = ['PID']

# Measurements and years treated as plain numbers.
numeric = [
    'GrLivArea', 'LotFrontage', 'LotArea',
    'YearBuilt', 'YearRemodAdd',
    'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GarageYrBlt', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'YrSold',
]

# Open question: if 'YearBuilt' (for example) were classified as ordinal
# instead, would it make any mathematical difference?
# Numerically coded columns that are really categories.
numeric_to_categorical = ['MSSubClass', 'MoSold']

# Nominal categoricals; this list is fed to OneHotEncoder.
# 'MasVnrType', 'Electrical', 'RoofMatl', 'Exterior1st', 'Exterior2nd' and
# 'Utilities' were removed from it because those columns get dropped later.
# 'BedroomAbvGr' was moved here so that it is one-hot encoded, and
# 'CentralAir' was moved here because, although binary, it is encoded as Y/N.
categorical = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Foundation',
    'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'GarageType', 'GarageFinish', 'PavedDrive', 'MiscFeature',
    'SaleType', 'SaleCondition',
    'BedroomAbvGr', 'CentralAir',
]

# Ordered categoricals. "Q5" below is shorthand for the five-level quality
# scale {'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}; "numeric?" marks columns that
# might be better treated as plain numbers.
cat_ordinal = [
    'OverallQual', 'OverallCond',    # numeric?
    'ExterQual', 'ExterCond',        # Q5
    'BsmtQual', 'BsmtCond',          # Q5
    'BsmtExposure',                  # {'No':1,'Mn':2,'Av':3,'Gd':4}
    'HeatingQC',                     # Q5
    'BsmtFullBath', 'BsmtHalfBath',  # numeric?
    'FullBath', 'HalfBath',          # numeric?
    'KitchenAbvGr',                  # numeric?
    'KitchenQual',                   # Q5
    'TotRmsAbvGrd',                  # numeric?
    'Functional',                    # {'Sal':1,'Sev':2,'Maj2':3,'Maj1':4,
                                     #  'Mod':5,'Min2':6,'Min1':7,'Typ':8}
    'Fireplaces',                    # numeric?
    'FireplaceQu',                   # Q5
    'GarageCars',                    # numeric?
    'GarageQual',                    # Q5
    'GarageCond',                    # Q5
    'PoolQC',                        # {'Fa':1,'TA':2,'Gd':3,'Ex':4}
    'Fence',                         # {'MnWw':1,'GdWo':2,'MnPrv':3,'GdPrv':4}
]

## train cleaning

In [7]:
# Encode the ordinal categoricals as integers. Columns that share a mapping
# are converted together; the groups are processed in the order listed.
#
# Note: helper.convert_cat_ordinal_vars_to_num also does some imputing
# internally, so this step is not a pure relabelling.
ordinal_conversions = [
    # Most common scale: five-level quality rating.
    (
        ['GarageQual', 'GarageCond',
         'FireplaceQu',
         'KitchenQual',
         'ExterQual', 'ExterCond',
         'BsmtQual', 'BsmtCond',
         'HeatingQC'],
        {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    ),
    # Columns with their own scale.
    (['BsmtExposure'], {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}),
    (['Functional'], {'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
                      'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8}),
    (['PoolQC'], {'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}),
    (['Fence'], {'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}),
]

for cat_ordinal_features, cat_ordinal_dict in ordinal_conversions:
    train = helper.convert_cat_ordinal_vars_to_num(train,
                                                   cat_ordinal_features,
                                                   cat_ordinal_dict)

In [8]:
# Coefficient per building type; a missing LotFrontage is estimated as
# coefficient * sqrt(LotArea).
LotFrontage_dict = {
    '1Fam': 0.7139,
    'TwnhsE': 0.5849,
    'Twnhs': 0.5227,
    'Duplex': 0.7725,
    '2fmCon': 0.6922,
}

In [9]:
# Impute missing LotFrontage as coefficient(BldgType) * sqrt(LotArea).
# Done with vectorised map/fillna rather than a row-wise apply: it is faster
# and it is a clean no-op when nothing is missing (e.g. when the cell is re-run),
# whereas DataFrame.apply on an empty selection returns an empty DataFrame.
lot_frontage_estimate = train['BldgType'].map(LotFrontage_dict) * np.sqrt(train['LotArea'])
train['LotFrontage'] = train['LotFrontage'].fillna(lot_frontage_estimate)
# Fail loudly if a row could not be imputed (for instance a BldgType that has
# no entry in LotFrontage_dict), instead of letting NaN flow into the models.
assert train['LotFrontage'].notna().all(), "LotFrontage still has missing values after imputation"

In [75]:
# Columns dropped for expedience rather than handled properly; revisit later.
#   - 'LotFrontage' is no longer in this list: it is imputed in an earlier cell.
#   - 'Electrical' was added because the test set contains an NA.
#   - 'RoofMatl' was added because the test set contains the level 'Roll'.
#   - 'Exterior1st' and 'Exterior2nd' were added as well.
drop_now_but_look_at_later = ['MasVnrArea', 'GarageYrBlt', 'MasVnrType', 'Electrical', 'RoofMatl',
                              'Exterior1st', 'Exterior2nd', 'Utilities']
# Non-inplace drop with errors='ignore' so that re-running the cell after the
# columns are already gone does not raise KeyError.
train = train.drop(columns=drop_now_but_look_at_later, errors='ignore')

In [10]:
# Categorical columns whose missing values are recoded as the explicit level
# 'None' (presumably NA means the feature is absent -- confirm against the
# data dictionary).
na_none_features = [
    'MiscFeature', 'Alley',
    'BsmtFinType1', 'BsmtFinType2',
    'GarageFinish', 'GarageType',
]
train[na_none_features] = train[na_none_features].fillna(value='None')

In [11]:
# Basement counts and areas whose missing values are replaced by 0.
# (Author's note: an earlier version of this list is thought to have had a typo.)
na_zero_features = [
    'BsmtFullBath', 'BsmtHalfBath',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
]
train[na_zero_features] = train[na_zero_features].fillna(value=0)

In [12]:
# Names of the columns that still contain at least one missing value.
has_missing = train.isna().any(axis=0)
cols_na = train.columns[has_missing].tolist()
cols_na

['MasVnrType', 'MasVnrArea', 'GarageYrBlt']

In [37]:
# Per-property inputs for the neighbourhood summary: neighbourhood label and living area.
nhd_input_columns = ['Neighborhood', 'GrLivArea']
nhds = train.loc[:, nhd_input_columns]

In [38]:
# Add the natural log of the sale price alongside the living area.
log_sale_price = np.log(train['SalePrice'])
nhds['LogSalePrice'] = log_sale_price

In [41]:
# Summarise each neighbourhood by the mean and standard deviation of living
# area and of log sale price.
# The aggregations are given as string aliases rather than np.mean / np.std:
# pandas deprecates passing the NumPy callables here, and 'std' pins the
# sample standard deviation (ddof=1) that the groupby currently computes.
# With ddof=1 a neighbourhood containing a single property has an undefined
# (NaN) standard deviation, which is why the result is filled with 0.
nhds = (
    nhds.groupby('Neighborhood')
    .agg(
        Sqft_mean=('GrLivArea', 'mean'),
        Sqft_sd=('GrLivArea', 'std'),
        LogPrice_mean=('LogSalePrice', 'mean'),
        LogPrice_sd=('LogSalePrice', 'std'),
    )
    .fillna(0)
)

In [42]:
from sklearn.cluster import KMeans

In [50]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [51]:
# Feature matrix for clustering: the four neighbourhood summary statistics.
# Taken as an explicit copy instead of aliasing `nhds`, so that the 'Cluster'
# column later written onto `nhds` cannot end up in the clustering input when
# cells are re-run.
X = nhds[['Sqft_mean', 'Sqft_sd', 'LogPrice_mean', 'LogPrice_sd']].copy()

In [52]:
# Compare candidate cluster counts by their average silhouette score.
# We choose 3 clusters based on this score.
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:

    # n_init is pinned to 10 (the historical default): scikit-learn 1.4 changed
    # the default to 'auto', which would otherwise change the fitted clusters
    # and therefore the scores printed below.
    clusterer = KMeans(n_clusters=n_clusters, n_init=10, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

For n_clusters = 2 The average silhouette_score is : 0.4976499122299036
For n_clusters = 3 The average silhouette_score is : 0.5021221675777995
For n_clusters = 4 The average silhouette_score is : 0.46937052279661323
For n_clusters = 5 The average silhouette_score is : 0.390305250984152
For n_clusters = 6 The average silhouette_score is : 0.4164877860743004


In [60]:
# We choose 3 clusters based on silhouette score. It is important
# to recognize this creates a cluster with a single neighborhood,
# 'NoRidge'. A look at the data suggests this is because of the
# large mean square footage. There are 50 properties in 'NoRidge',
# which is not too small for a sensible regression; therefore it
# seems appropriate (even desirable) to allow what might at first
# glance have seemed anomalous.
#
# NOTE(review): the features are clustered unscaled. The square-footage
# columns are in the hundreds to thousands while the log-price columns are
# around 12 or below 1, so Euclidean k-means is probably driven almost
# entirely by square footage -- consider standardising; TODO confirm.

# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto', which could change the cluster assignment.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=10)
# Guard against re-runs: if X shares data with nhds, a previously assigned
# 'Cluster' label must not be fed back in as a clustering feature.
cluster_features = X.drop(columns='Cluster', errors='ignore')
cluster_labels = clusterer.fit_predict(cluster_features)
nhds['Cluster'] = cluster_labels
nhds

Unnamed: 0_level_0,Sqft_mean,Sqft_sd,LogPrice_mean,LogPrice_sd,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Blmngtn,1447.588235,141.282544,12.206503,0.138488,1
Blueste,1127.625,99.087173,11.803928,0.17205,1
BrDale,1104.368421,150.796482,11.570003,0.120983,1
BrkSide,1195.315068,353.090006,11.696983,0.295337,1
ClearCr,1767.035714,466.461739,12.274598,0.230419,0
CollgCr,1485.8,397.787877,12.166988,0.240871,0
Crawfor,1668.746269,448.328287,12.145134,0.29729,0
Edwards,1281.669492,406.126292,11.735905,0.325592,1
Gilbert,1635.854369,310.003618,12.143588,0.145145,0
Greens,1207.333333,88.149116,12.223814,0.059277,1


In [69]:
# Lookup table: neighbourhood name -> cluster label.
cluster_dict = nhds['Cluster'].to_dict()

In [73]:
# Attach each property's neighbourhood cluster. Series.map performs the
# dictionary lookup column-wise instead of a Python-level apply over every row.
train['NhdCluster'] = train['Neighborhood'].map(cluster_dict)
# map() yields NaN for a neighbourhood missing from cluster_dict (the row-wise
# lookup used to raise KeyError), so keep the failure explicit.
assert train['NhdCluster'].notna().all(), "Some neighbourhoods have no cluster assignment"

In [74]:
from sklearn.linear_model import LogisticRegression

In [79]:
# `cats` is the `categorical` list with 'Neighborhood' removed. 'Neighborhood'
# is dropped from the classifier's feature frame, and the ColumnTransformer
# would fail if asked to encode a column that is no longer there.
# Derived from `categorical` rather than retyped, so the two lists cannot drift apart.
cats = [column for column in categorical if column != 'Neighborhood']

In [81]:
# Design matrix for predicting NhdCluster: remove the target, the cluster label
# itself, and the columns the clusters were built from ('GrLivArea' and
# 'Neighborhood'; 'SalePrice' also fed the clustering via its log).
# Columns in `cats` are one-hot encoded (first level dropped); everything else
# is passed through unchanged.
# NOTE(review): the passed-through columns are not scaled, and scikit-learn
# documents that the 'saga' solver only converges quickly on similarly scaled
# features -- a scaler may be worth adding; TODO confirm.
nhd_excluded_columns = ['SalePrice', 'GrLivArea', 'Neighborhood', 'NhdCluster']
transformer = ColumnTransformer(
    transformers=[("Cat", OneHotEncoder(drop='first'), cats)],
    remainder='passthrough',
)
X = transformer.fit_transform(train.drop(columns=nhd_excluded_columns))

In [84]:
# L1-penalised logistic regression; 'saga' is used as the solver for the L1
# penalty, and max_iter is raised well above the default.
clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    max_iter=10000,
    random_state=10,
)

In [85]:
# This takes a while. max_iter is set to 10,000 because the fit did not
# converge with the default of 100. The resulting score of about .54 is a bit
# disappointing -- and note it is accuracy on the training data itself, not on
# held-out data. Some columns that are currently excluded, such as
# 'Exterior1st' and 'Exterior2nd', might play a role here.
y = train['NhdCluster']
clf.fit(X, y).score(X, y)

0.5403527525387494

In [89]:
# The L1 penalty (Lasso) does not identify any useful features: the selector
# keeps none of them. Together with the poor score above, this suggests that
# (unless the missing features have something else to say) it is very hard to
# predict NhdCluster from the features that do not appear in the clustering,
# and harder still to single out particular features as useful. Nothing pops
# out as a differentiator.
#
# Attempting to use the clusters still makes sense, but features should be
# chosen solely for their usefulness in predicting price. On reflection,
# simply allowing the submodels to use different sets of features seems a more
# effective way of achieving the aim: even if a feature were useful for
# telling neighborhoods apart, it might not be useful for predicting price,
# and the model will identify the best price predictors in any case, so there
# is little reason to prescribe some of them.
from sklearn.feature_selection import SelectFromModel
clf = LogisticRegression(
    penalty='l1',
    C=1,
    solver='saga',
    max_iter=10000,
    random_state=10,
)
selector = SelectFromModel(estimator=clf)
selector.fit(X, y)
selector.transform(X)

  warn("No features were selected: either the data is"


array([], shape=(1871, 0), dtype=float64)

In [12]:
# Design matrix for the price regression: all columns except the target, with
# the `categorical` columns one-hot encoded (first level dropped) and the
# remaining columns passed through unchanged.
# 'NhdCluster' is excluded as well: it is derived from SalePrice through the
# clustering and is a function of 'Neighborhood', which is already encoded
# here. Without this, a top-to-bottom run would silently add it as a feature.
# errors='ignore' keeps the cell working when the clustering cells were not run.
price_features = train.drop(columns='SalePrice').drop(columns='NhdCluster', errors='ignore')
transformer = ColumnTransformer([("Cat", OneHotEncoder(drop='first'), categorical)], remainder='passthrough')
X = transformer.fit_transform(price_features)

In [13]:
# Regression target: natural log of the sale price.
sale_price = train['SalePrice']
y = np.log(sale_price)

In [14]:
# Ordinary least squares on the encoded features (intercept fitted, the default).
ols = linear_model.LinearRegression(fit_intercept=True)

In [15]:
# Fit on the full training set and report R^2 on that same data (in-sample fit).
ols.fit(X, y).score(X, y)

0.9531732835572826