# lasso model tested on clusters

Testing whether splitting the dataset on the clusters made by Alex can improve the score. The result was that cluster 0 consistently does well, but cluster 1 performs very poorly with a lasso model.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

import helper


## data prep

In [2]:
# Load the raw Ames housing data; the first CSV column becomes the index.
df = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    index_col=0,
    low_memory=False,
)

In [3]:
# Project helper that returns the train/test frames used for the rest of the
# notebook. PID is kept (remove_PID=False) because it is the merge key for the
# school features below.
# NOTE(review): num_to_cat_list presumably converts these numeric codes to
# categoricals -- confirm against helper.data_processing_wrapper.
train, test = helper.data_processing_wrapper(
    df,
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=False,
)

In [4]:
# Only the merge key and the closest-school column are needed from the
# school feature file.
school_keep = ['PID', 'closestSchool']
schools = pd.read_csv('schoolFeatures.csv', index_col=0).loc[:, school_keep]

In [5]:
# Attach the school feature to each split by PID, then discard rows that
# found no match (closestSchool is NaN after the left join) and renumber the
# index from 0.
train = (
    train.merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)

test = (
    test.merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)

In [6]:
# Temporary log-price column; it is only needed to build the neighbourhood
# clusters below.
train = train.assign(LogSalePrice=np.log(train['SalePrice']))
test = test.assign(LogSalePrice=np.log(test['SalePrice']))

##  Alex's Clustering 

In [7]:
# Per-neighbourhood medians of LogSalePrice and GrLivArea: the two features
# the neighbourhoods are clustered on.
# The columns are selected *before* aggregating so median() is never asked to
# reduce the non-numeric columns of `train`; on pandas >= 2.0 that raises a
# TypeError because numeric_only now defaults to False.
nhds = train.groupby('Neighborhood')[['LogSalePrice', 'GrLivArea']].median()

In [8]:
# Fraction of training rows that fall in each neighbourhood, passed to KMeans
# as sample weights. Only the PID count is needed, so count that single
# column instead of counting every column and then walking the result row by
# row. Both this groupby and the one that builds `nhds` use the default key
# sorting, so the weights line up with the rows of `nhds`.
weights = (train.groupby('Neighborhood')['PID'].count() / len(train)).to_list()

In [9]:
# Cluster the neighbourhoods into two groups on their standardised median
# log price and median living area, weighting each neighbourhood by its share
# of the training rows.
scaler = StandardScaler()
# Select the two feature columns explicitly so that re-running this cell
# (after 'Cluster' has been added to nhds) does not feed the old labels back
# into the clustering. A descriptive name is used instead of `_`, which in
# IPython/Jupyter is the last-output variable and should not be overwritten.
nhds_scaled = scaler.fit_transform(nhds[['LogSalePrice', 'GrLivArea']])
# n_init is pinned to 10 (the historical default) so the result does not
# depend on the installed scikit-learn version: from 1.4 the default is
# 'auto', which runs a single initialisation for k-means++.
clusterer = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = clusterer.fit_predict(nhds_scaled, sample_weight=weights)
nhds['Cluster'] = cluster_labels

In [10]:
# Lookup table: neighbourhood name (the index of nhds) -> cluster label.
cluster_dict = nhds.loc[:, 'Cluster'].to_dict()

In [11]:
# Tag every house with the cluster of its neighbourhood.
# Iterating the Neighborhood column directly avoids DataFrame.apply(axis=1),
# which builds a full row Series per house just to read one value. A direct
# dict lookup is kept (rather than Series.map) so that a test neighbourhood
# that never appeared in train still fails loudly with a KeyError instead of
# silently becoming NaN.
train['NhdCluster'] = [cluster_dict[nhd] for nhd in train['Neighborhood']]
test['NhdCluster'] = [cluster_dict[nhd] for nhd in test['Neighborhood']]

In [12]:
# The modelling helper takes the log of SalePrice itself (per the original
# author's note), so the temporary LogSalePrice column is dropped here to keep
# it out of the feature set later on.

train = train.drop('LogSalePrice', axis=1)
test = test.drop('LogSalePrice', axis=1)

## feature engineering

In [13]:
# add price comps: one helper call per grouping column, applied in this order.
# NOTE(review): the output recorded for this cell is
# "AttributeError: module 'helper' has no attribute 'add_price_comp_log_feature'"
# -- confirm the installed helper module actually provides this function.
for comp_feature in ('Neighborhood', 'GarageCars', 'BldgType', 'MSZoning', 'Condition1'):
    train, test = helper.add_price_comp_log_feature(train, test, comp_feature)

AttributeError: module 'helper' has no attribute 'add_price_comp_log_feature'

In [None]:
# Columns handed to helper.add_non_linear_transformed_features below.
# Some of these (overall_score, total_sf, year_since_*) are not raw columns;
# presumably they are created by the other helper.add_* calls that run first
# -- confirm against helper.
num_features_to_transform = [
    'OverallQual', 'overall_score',
    'total_sf', 'GrLivArea',
    'year_since_built', 'LotArea',
    'GarageArea', 'year_since_remod',
    'BsmtExposure', 'KitchenQual',
]

In [None]:
# Run the training frame through the project's feature-engineering helpers,
# in the same order as the test frame, so both end up with the same columns.
train = (
    train
    .pipe(helper.add_year_since_feature)
    .pipe(helper.add_score_feature)
    .pipe(helper.add_combined_related_num_features)
    .pipe(helper.add_non_linear_transformed_features, num_features_to_transform)
)

In [None]:
# Run the test frame through the same feature-engineering helpers, in the
# same order, as the training frame.
test = (
    test
    .pipe(helper.add_year_since_feature)
    .pipe(helper.add_score_feature)
    .pipe(helper.add_combined_related_num_features)
    .pipe(helper.add_non_linear_transformed_features, num_features_to_transform)
)

In [None]:
# Names of the object- and bool-typed columns; passed to helper.lasso_grid_cv
# as the categorical feature list.
cat_feats = train.select_dtypes(include=['object', 'bool']).columns.tolist()

In [None]:
# Names of the float64/int64 columns, minus the target and the identifier.
# list.remove is kept so a missing column still raises ValueError.
num_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
for non_feature in ('SalePrice', 'PID'):
    num_cols.remove(non_feature)

In [None]:
# Split both the training and the hold-out frame by neighbourhood cluster.
train_0 = train[train['NhdCluster'] == 0]
train_1 = train[train['NhdCluster'] == 1]
test_0 = test[test['NhdCluster'] == 0]
test_1 = test[test['NhdCluster'] == 1]

### cluster only as a feature

In [None]:
# Fit the lasso grid search on the full training frame (NhdCluster is just
# another column here). The helper returns the fitted search object together
# with the transformer and scaler that must be applied to new data.
# Note: this rebinds `scaler`, replacing the one used for clustering.
clf, transformer, scaler = helper.lasso_grid_cv(
    train,
    cat_feats,
    n_jobs_=-1,
)

In [None]:
# Best cross-validation score found by the search fitted on the full
# training frame (presumably a GridSearchCV-style attribute -- confirm in helper).
clf.best_score_

In [None]:
# Score the fitted model on the whole hold-out frame: drop the target and the
# identifier, then push the features through the transformer and scaler that
# helper.lasso_grid_cv returned.
X_tst = scaler.transform(
    transformer.transform(test.drop(columns=['SalePrice', 'PID']))
)
# Log target, presumably matching the target used inside helper.lasso_grid_cv
# -- confirm.
y_tst = np.log(test['SalePrice'])

clf.score(X_tst, y_tst)

## trained on the entire data set, tested on each cluster.

In [None]:
# Per-cluster slices of the training and hold-out frames, used by the
# per-cluster evaluations that follow.
train_0 = train[train['NhdCluster'] == 0]
train_1 = train[train['NhdCluster'] == 1]
test_0 = test[test['NhdCluster'] == 0]
test_1 = test[test['NhdCluster'] == 1]

In [None]:
# Score the current model on the cluster-0 part of the hold-out frame.
# clf / transformer / scaler are whatever the most recently executed
# helper.lasso_grid_cv cell returned.
X_tst = scaler.transform(
    transformer.transform(test_0.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_0['SalePrice'])

clf.score(X_tst, y_tst)

In [None]:
# Score the current model on the cluster-1 part of the hold-out frame.
# clf / transformer / scaler are whatever the most recently executed
# helper.lasso_grid_cv cell returned.
X_tst = scaler.transform(
    transformer.transform(test_1.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_1['SalePrice'])

clf.score(X_tst, y_tst)

### cluster 0 separated, but cluster 1 trained on the whole data set.

In [None]:
# Refit the lasso grid search using only the cluster-0 training rows.
# This rebinds clf, transformer and scaler, so the cells below evaluate the
# cluster-0 model.
clf, transformer, scaler = helper.lasso_grid_cv(
    train_0,
    cat_feats,
    n_jobs_=-1,
)

In [None]:
# Best cross-validation score of the search fitted on the cluster-0 training
# rows (presumably a GridSearchCV-style attribute -- confirm in helper).
clf.best_score_

In [None]:
# Evaluate on the cluster-0 hold-out rows, using the transformer and scaler
# returned alongside the current model.
X_tst = scaler.transform(
    transformer.transform(test_0.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_0['SalePrice'])

clf.score(X_tst, y_tst)

### cluster 1 trained and tested only on itself

In [None]:
# Refit the lasso grid search using only the cluster-1 training rows.
# This rebinds clf, transformer and scaler, so the cells below evaluate the
# cluster-1 model.
clf, transformer, scaler = helper.lasso_grid_cv(
    train_1,
    cat_feats,
    n_jobs_=-1,
)

In [None]:
# Best cross-validation score of the search fitted on the cluster-1 training
# rows (presumably a GridSearchCV-style attribute -- confirm in helper).
clf.best_score_

In [None]:
# Evaluate on the cluster-1 hold-out rows, using the transformer and scaler
# returned alongside the current model.
X_tst = scaler.transform(
    transformer.transform(test_1.drop(columns=['SalePrice', 'PID']))
)
y_tst = np.log(test_1['SalePrice'])

clf.score(X_tst, y_tst)