In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

import helper


In [2]:
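# NOTE: disabled palette setup. seaborn (`sns`) is not imported in this notebook,
# so add `import seaborn as sns` to the import cell before re-enabling it.
# colors = ["#FF0B04", "#F1BE48",
#            "#B9975B", "#8B5B29",
#            "#524727",
#          ]
# sns.set_palette(sns.color_palette(colors))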

In [3]:
# Load the raw Ames housing data; the first CSV column is used as the index.
df = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    index_col=0,
    low_memory=False,
)

In [4]:
# Build the train/test frames with the project helper.
# remove_PID=False: PID is needed below as the join key for the school features.
# NOTE(review): num_to_cat_list presumably marks numeric columns to be treated as
# categorical -- confirm against helper.data_processing_wrapper.
train, test = helper.data_processing_wrapper(
    df,
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=False,
)

In [5]:
# Only the join key and the closest-school label are kept from the school features.
school_keep = ['PID', 'closestSchool']
schools = pd.read_csv('schoolFeatures.csv', index_col=0).loc[:, school_keep]

In [6]:
# Left-join the school label onto each split by PID, discard rows that found no
# school, and renumber the index from 0.
train = (
    train
    .merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)

test = (
    test
    .merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)

In [7]:
# Add the natural log of the sale price to both splits (written in place).
for split in (train, test):
    split['LogSalePrice'] = np.log(split['SalePrice'])

In [8]:
# Per-neighborhood medians of log sale price and above-ground living area.
# The two columns are selected *before* aggregating: calling .median() on the whole
# grouped frame tries to aggregate every column and raises TypeError on pandas >= 2.0
# when non-numeric columns (e.g. the categorical features) are present.
nhds = train.groupby('Neighborhood')[['LogSalePrice', 'GrLivArea']].median()

In [9]:
# Fraction of training rows (non-null PID count) in each neighborhood, listed in
# sorted neighborhood order; passed to KMeans as sample weights.
nhd_counts = train.groupby('Neighborhood')['PID'].count()
weights = (nhd_counts / len(train)).to_list()

In [10]:
# Cluster neighborhoods into two groups on standardized median log price and
# living area, weighting each neighborhood by its share of the training rows.
#
# - The feature columns are selected explicitly so that re-running this cell does
#   not also scale the 'Cluster' column it adds at the end.
# - The scaled array gets a real name instead of `_`, which IPython uses for the
#   last cell output.
# - n_init is pinned because scikit-learn's default changed from 10 to 'auto';
#   pinning keeps the clustering reproducible across versions.
cluster_features = ['LogSalePrice', 'GrLivArea']
scaler = StandardScaler()
nhds_scaled = scaler.fit_transform(nhds[cluster_features])
clusterer = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = clusterer.fit_predict(nhds_scaled, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [11]:
# Display the per-neighborhood medians together with the assigned cluster label.
nhds

Unnamed: 0_level_0,LogSalePrice,GrLivArea,Cluster
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Blmngtn,12.180755,1504.0,0
Blueste,11.727633,1118.0,1
BrDale,11.566466,1030.0,1
BrkSide,11.747998,1203.0,1
ClearCr,12.345835,1743.0,0
CollgCr,12.200557,1536.0,0
Crawfor,12.170445,1640.0,0
Edwards,11.739263,1188.5,1
Gilbert,12.11997,1594.0,0
Greens,12.248806,1226.0,1


In [12]:
# Lookup table: neighborhood name -> cluster label.
cluster_dict = nhds.loc[:, 'Cluster'].to_dict()

In [13]:
# Attach each row's neighborhood cluster using a vectorized Series.map instead of a
# row-wise DataFrame.apply. A neighborhood with no entry in cluster_dict still
# raises KeyError (as the plain dict lookup did), but the message now names the
# split and the offending neighborhoods.
for split_name, split in (('train', train), ('test', test)):
    nhd_cluster = split['Neighborhood'].map(cluster_dict)
    unmapped = split.loc[nhd_cluster.isna(), 'Neighborhood'].unique().tolist()
    if unmapped:
        raise KeyError(
            f"{split_name}: no cluster assigned for neighborhood(s) {unmapped}"
        )
    # No NaNs remain at this point, so the labels can be stored as integers.
    split['NhdCluster'] = nhd_cluster.astype('int64')

In [14]:
# Per the original note, the downstream helper takes the log of SalePrice itself,
# so the temporary LogSalePrice column is dropped from both splits here.
train, test = (split.drop(columns=['LogSalePrice']) for split in (train, test))

In [15]:
# Add price comps: one helper call per grouping column, applied in this order.
for comp_col in ('Neighborhood', 'GarageCars', 'BldgType', 'MSZoning', 'Condition1'):
    train, test = helper.add_price_comp_log_feature(train, test, comp_col)

In [16]:
# Columns handed to helper.add_non_linear_transformed_features for both splits.
# NOTE(review): overall_score, total_sf, year_since_built and year_since_remod are not
# raw columns seen so far -- presumably they are created by the helper.add_* feature
# functions that run before the transform; confirm against helper.
num_features_to_transform = [
    'OverallQual',
    'overall_score',
    'total_sf',
    'GrLivArea',
    'year_since_built',
    'LotArea',
    'GarageArea',
    'year_since_remod',
    'BsmtExposure',
    'KitchenQual'
]

In [17]:
# Feature-engineering pipeline for the training split; each helper takes the frame
# and returns the augmented frame, applied in this order.
train = (
    train
    .pipe(helper.add_year_since_feature)
    .pipe(helper.add_score_feature)
    .pipe(helper.add_combined_related_num_features)
    .pipe(helper.add_non_linear_transformed_features, num_features_to_transform)
)

In [18]:
# Same feature-engineering pipeline as the training split, applied to the test split.
test = (
    test
    .pipe(helper.add_year_since_feature)
    .pipe(helper.add_score_feature)
    .pipe(helper.add_combined_related_num_features)
    .pipe(helper.add_non_linear_transformed_features, num_features_to_transform)
)

In [19]:
# Names of the object- and bool-typed columns, passed to the lasso helper as the
# categorical features.
cat_feats = list(train.select_dtypes(include=['object', 'bool']).columns)

In [20]:
# Names of the float64/int64 columns, minus the target (SalePrice) and the PID key.
num_cols = list(train.select_dtypes(include=['float64', 'int64']).columns)
for excluded in ('SalePrice', 'PID'):
    num_cols.remove(excluded)

In [21]:
# Split both frames by neighborhood cluster so each cluster can be modeled separately.
train_0 = train[train['NhdCluster'] == 0]
test_0 = test[test['NhdCluster'] == 0]
train_1 = train[train['NhdCluster'] == 1]
test_1 = test[test['NhdCluster'] == 1]

### Cluster used only as a feature

In [31]:
# Lasso grid search on the full training split. The helper returns the fitted search
# object plus the transformer and scaler that are applied to test data before scoring.
clf, transformer, scaler = helper.lasso_grid_cv(
    train,
    cat_feats,
    n_jobs_=-1,
)

Performing Grid Search with alphas of: [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
Current best alpha: 0.001
Current best CV R2: 0.9473111529537663
Performing Grid Search with alphas of: [0.0003 0.0004 0.0005 0.0006 0.0007 0.0008 0.0009 0.001  0.0011 0.0012
 0.0013 0.0014 0.0015 0.0016 0.0017 0.0018 0.0019]
Current best alpha: 0.0013000000000000002
Current best CV R2: 0.9474134044270726
Modeling complete :)


In [32]:
# Best cross-validated score from the grid search (the "CV R2" reported in the log above).
clf.best_score_

0.9474134044270726

In [33]:
# Hold-out evaluation on the full test split: apply the fitted transformer and scaler
# to the features, then score against the log sale price.
X_tst = scaler.transform(
    transformer.transform(test.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test['SalePrice'])

clf.score(X_tst, y_tst)

0.9209695499003754

### Cluster 0 modeled separately; cluster 1 scored with a model trained on the whole data set

In [22]:
# Lasso grid search fitted on cluster-0 training rows only.
clf, transformer, scaler = helper.lasso_grid_cv(
    train_0,
    cat_feats,
    n_jobs_=-1,
)

Performing Grid Search with alphas of: [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
Current best alpha: 0.003
Current best CV R2: 0.9317064729739271
Performing Grid Search with alphas of: [0.0009 0.0012 0.0015 0.0018 0.0021 0.0024 0.0027 0.003  0.0033 0.0036
 0.0039 0.0042 0.0045 0.0048 0.0051 0.0054 0.0057]
Current best alpha: 0.0021
Current best CV R2: 0.9328977982767975
Performing Grid Search with alphas of: [0.00063 0.00084 0.00105 0.00126 0.00147 0.00168 0.00189 0.0021  0.00231
 0.00252 0.00273 0.00294 0.00315 0.00336 0.00357 0.00378 0.00399]
Current best alpha: 0.0021
Current best CV R2: 0.9328977982767975
Modeling complete :)


In [23]:
# Best cross-validated score from the cluster-0 grid search (the "CV R2" in the log above).
clf.best_score_

0.9328977982767975

In [24]:
# Hold-out evaluation of the cluster-0 model on cluster-0 test rows.
X_tst = scaler.transform(
    transformer.transform(test_0.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_0['SalePrice'])

clf.score(X_tst, y_tst)

0.9366054507869461

In [25]:
# Lasso grid search fitted on the full training split; the cell that follows scores
# it on cluster-1 test rows only.
clf, transformer, scaler = helper.lasso_grid_cv(
    train,
    cat_feats,
    n_jobs_=-1,
)

Performing Grid Search with alphas of: [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
Current best alpha: 0.001
Current best CV R2: 0.9473111529537663
Performing Grid Search with alphas of: [0.0003 0.0004 0.0005 0.0006 0.0007 0.0008 0.0009 0.001  0.0011 0.0012
 0.0013 0.0014 0.0015 0.0016 0.0017 0.0018 0.0019]
Current best alpha: 0.0013000000000000002
Current best CV R2: 0.9474134044270726
Modeling complete :)


In [26]:
# Best cross-validated score from the full-data grid search (the "CV R2" in the log above).
clf.best_score_

0.9474134044270726

In [27]:
# Hold-out evaluation of the current model on cluster-1 test rows.
X_tst = scaler.transform(
    transformer.transform(test_1.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_1['SalePrice'])

clf.score(X_tst, y_tst)

0.7672036038751455

### Cluster 1 trained and tested only on itself

In [28]:
# Lasso grid search fitted on cluster-1 training rows only.
clf, transformer, scaler = helper.lasso_grid_cv(
    train_1,
    cat_feats,
    n_jobs_=-1,
)

Performing Grid Search with alphas of: [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1]
Current best alpha: 0.003
Current best CV R2: 0.8714889105186439
Performing Grid Search with alphas of: [0.0009 0.0012 0.0015 0.0018 0.0021 0.0024 0.0027 0.003  0.0033 0.0036
 0.0039 0.0042 0.0045 0.0048 0.0051 0.0054 0.0057]
Current best alpha: 0.0027
Current best CV R2: 0.8715606275139433
Modeling complete :)


In [29]:
# Best cross-validated score from the cluster-1 grid search (the "CV R2" in the log above).
clf.best_score_

0.8715606275139433

In [30]:
# Hold-out evaluation of the cluster-1 model on cluster-1 test rows.
X_tst = scaler.transform(
    transformer.transform(test_1.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_1['SalePrice'])

clf.score(X_tst, y_tst)

0.782570071594103