In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import matplotlib.cm as cm
import helper

In [2]:
# Five custom hex colours, installed as the default seaborn palette for the plots
# drawn after this cell.
colors = ["#FF0B04", "#F1BE48", "#B9975B", "#8B5B29", "#524727"]
custom_palette = sns.color_palette(colors)
sns.set_palette(custom_palette)

In [3]:
# Load the raw housing data from the working directory; the first CSV column
# becomes the index. low_memory=False makes pandas infer each column's dtype from
# the whole file instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    low_memory=False,
    index_col=0,
)

In [4]:
# Project helper that turns the raw frame into a train frame and a test frame.
# NOTE(review): the cleaning and split logic live in helper.py and are not visible
# here -- confirm what num_to_cat_list=[] and remove_PID=False do there. The PID
# column is read later for the neighborhood weights and dropped before modelling.
train, test = helper.data_processing_wrapper(
    df,
    num_to_cat_list=[],
    remove_PID=False,
)

In [5]:
# Natural log of the sale price; this is the regression target for the training
# rows. The raw SalePrice column is kept alongside it.
train['LogSalePrice'] = train['SalePrice'].pipe(np.log)

In [6]:
# Same natural-log target transform, applied to the held-out rows.
test['LogSalePrice'] = test['SalePrice'].pipe(np.log)

In [7]:
# Keep only the columns needed to profile neighborhoods: the neighborhood label,
# the log sale price and the above-ground living area.
nhd_profile_cols = ['Neighborhood', 'LogSalePrice', 'GrLivArea']
nhds = train[nhd_profile_cols]

In [8]:
# One row per neighborhood with the median living area and median log sale price
# of its training homes; these two columns are the features the neighborhoods are
# clustered on. Groups with no median are filled with 0.
#
# The aggregation is named by string ('median') instead of passing np.median:
# pandas already routes np.median to its own groupby median, and from pandas 2.1
# passing the NumPy callable emits a FutureWarning because that aliasing is being
# removed.
#
# Aggregating straight from `train` (rather than overwriting `nhds` with a
# function of itself) keeps the cell re-runnable: the result no longer depends on
# what `nhds` held before the cell ran.
nhds = train.groupby('Neighborhood').agg(
    Sqft_med=pd.NamedAgg(column='GrLivArea', aggfunc='median'),
    LogPrice_med=pd.NamedAgg(column='LogSalePrice', aggfunc='median'),
).fillna(0)

In [9]:
# Fraction of training rows that fall in each neighborhood, as a plain list in
# groupby order (sorted by neighborhood). It is passed to KMeans as sample_weight.
# Only the PID column is counted: counting every column and then walking the
# result row by row to read PID gave the same numbers with far more work.
weights = (train.groupby('Neighborhood')['PID'].count() / len(train)).to_list()

In [10]:
# Split the neighborhoods into two clusters on their standardised median living
# area and median log price, weighting each neighborhood by its share of the
# training rows.
#
# The feature columns are selected by name so that re-running this cell does not
# feed the 'Cluster' column written below back into the scaler.
# The scaled matrix gets its own name instead of `_`, which IPython rebinds to the
# most recent cell output.
# NOTE(review): `weights` is a positional list; it lines up with the rows of `nhds`
# only because both come from train.groupby('Neighborhood') in the same order.
cluster_features = nhds[['Sqft_med', 'LogPrice_med']]
scaler = StandardScaler()
nhds_scaled = scaler.fit_transform(cluster_features)
clusterer = KMeans(n_clusters=2, random_state=42)
cluster_labels = clusterer.fit_predict(nhds_scaled, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [11]:
# Lookup table: neighborhood (the index of nhds) -> its cluster label.
cluster_dict = dict(zip(nhds.index, nhds['Cluster'].tolist()))

In [12]:
# Tag every training row with the cluster of its neighborhood. A neighborhood
# missing from cluster_dict raises KeyError.
train['NhdCluster'] = [cluster_dict[nhd] for nhd in train['Neighborhood']]

In [13]:
# Tag every held-out row with the cluster of its neighborhood, using the lookup
# built from the training data. A neighborhood missing from cluster_dict raises
# KeyError.
test['NhdCluster'] = [cluster_dict[nhd] for nhd in test['Neighborhood']]

In [14]:
# "Comparable sales" lookup: mean training LogSalePrice for each observed
# combination of neighborhood, bedrooms, building type, overall quality, full
# baths, kitchen quality and garage capacity. Keys are 7-tuples in that order.
#
# LogSalePrice is selected *before* aggregating. Averaging the whole frame and
# picking the column afterwards computes a mean for every column and, on
# pandas >= 2.0 (where numeric_only defaults to False), raises TypeError as soon
# as the frame contains a text column.
comp_dict = (
    train.groupby(['Neighborhood', 'BedroomAbvGr', 'BldgType',
                   'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars'])['LogSalePrice']
    .mean()
    .to_dict()
)

In [15]:
# Comp for a training row = mean LogSalePrice of the *other* training homes that
# share its neighborhood, bedroom count, building type, overall quality, full
# baths, kitchen quality and garage capacity.
#
# The row's own price is deliberately left out. Looking the row up in a group mean
# that includes itself leaks the target into the feature: for a home that is alone
# in its group, Comp would equal LogSalePrice exactly. Held-out rows never get
# that advantage, so the leak inflates training fit only.
#
# Fallbacks, in order: the mean of the other homes in the same neighborhood, then
# the overall training mean (only reached when a home is alone in its
# neighborhood).
comp_keys = ['Neighborhood', 'BedroomAbvGr', 'BldgType',
             'OverallQual', 'FullBath', 'KitchenQual', 'GarageCars']


def _mean_of_others(frame, keys, target='LogSalePrice'):
    """Per-row mean of `target` over the row's group, excluding the row itself.

    Returns a Series aligned to `frame`; rows that are alone in their group get NaN.
    """
    grouped = frame.groupby(keys)[target]
    n_others = grouped.transform('count') - 1
    # Mask zero counts so singleton groups yield NaN rather than a 0/0 or x/0 result.
    return (grouped.transform('sum') - frame[target]) / n_others.where(n_others > 0)


train['Comp'] = (
    _mean_of_others(train, comp_keys)
    .fillna(_mean_of_others(train, 'Neighborhood'))
    .fillna(train['LogSalePrice'].mean())
)

In [16]:
# Fallback lookup: mean training LogSalePrice per neighborhood, used when a home
# has no exact comparable-sales match.
# The column is selected before .mean() so that only LogSalePrice is averaged;
# averaging the whole frame first raises TypeError on pandas >= 2.0 when text
# columns are present.
alt_dict = train.groupby('Neighborhood')['LogSalePrice'].mean().to_dict()

In [17]:
def test_comp(x):
    if (x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars']) in comp_dict.keys():
        return comp_dict[(x['Neighborhood'], x['BedroomAbvGr'], x['BldgType'],
               x['OverallQual'], x['FullBath'], x['KitchenQual'], x['GarageCars'])]
    else:
        return alt_dict[x['Neighborhood']]    

In [18]:
# Comparable-sales feature for the held-out rows, looked up row by row in the
# training-derived tables via test_comp.
test['Comp'] = test.apply(test_comp, axis=1)

In [19]:
# Columns removed from every feature matrix: both forms of the target, the PID
# identifier, two size columns and the cluster label the data is split on.
# Kept in one place so the four train/test x cluster splits cannot drift apart.
NON_FEATURE_COLS = ['SalePrice', 'LogSalePrice', 'PID', 'TotalBsmtSF', 'GrLivArea', 'NhdCluster']


def split_cluster(frame, cluster_id):
    """Return (X, y) for the rows of `frame` whose NhdCluster equals `cluster_id`.

    X is the frame restricted to those rows with NON_FEATURE_COLS dropped;
    y is the LogSalePrice of the same rows.
    """
    in_cluster = frame['NhdCluster'] == cluster_id
    features = frame.loc[in_cluster, :].drop(NON_FEATURE_COLS, axis=1)
    target = frame.loc[in_cluster, 'LogSalePrice']
    return features, target


X0_train, y0_train = split_cluster(train, 0)
X0_test, y0_test = split_cluster(test, 0)
X1_train, y1_train = split_cluster(train, 1)
X1_test, y1_test = split_cluster(test, 1)

In [20]:
# Sanity check: print the cluster-0 and cluster-1 shapes side by side for each of
# the train features, train target, test features and test target, with a blank
# gap between the four groups.
shape_pairs = [
    (X0_train, X1_train),
    (y0_train, y1_train),
    (X0_test, X1_test),
    (y0_test, y1_test),
]
for position, (cluster0, cluster1) in enumerate(shape_pairs):
    if position:
        print('\n')
    print(cluster0.shape)
    print(cluster1.shape)

(896, 78)
(975, 78)


(896,)
(975,)


(298, 78)
(326, 78)


(298,)
(326,)
