In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pickle

In [27]:
data = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv', header = [0])

pd.set_option("display.max_columns", None)

data.drop(['PID', 'lot_bucket', 'mean_LotFrontage', 'Prop_Addr', 'GarageYrBlt', 'lat', 'long'], axis = 1, inplace = True)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [4]:
#data['GrLivArea_log'] = data['GrLivArea'].apply(lambda x: np.log(x))
#data.drop('GrLivArea', axis = 1, inplace = True)

In [29]:
y = data['SalePrice'].apply(lambda x: np.log(x))

In [30]:
x = data.drop(['SalePrice'], axis = 1)

In [31]:
x = pd.get_dummies(x, drop_first = True)

## Lasso with all features

In [None]:
coefs = grid_search.best_estimator_.coef_
coef_list = list(zip(coefs, x.columns))
sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [None]:
x_scaled = MinMaxScaler()
x_train = x_scaled.fit_transform(x_train)

In [None]:
lasso_model = linear_model.Lasso()

params = {'alpha': np.linspace(1e-6, 1e-3, 1000)}

grid_search = GridSearchCV(linear_model.Lasso(max_iter = 1000000),
                          params,
                          cv = 5)

In [None]:
grid_search.fit(x_train, y_train)

In [None]:
grid_search.score(x_train, y_train)

In [None]:
x_test = x_scaled.fit_transform(x_test)

In [None]:
grid_search.score(x_test, y_test)

In [None]:
coefs = grid_search.best_estimator_.coef_

In [None]:
grid_search.best_estimator_.alpha

In [None]:
coef_list = list(zip(coefs, x.columns))

In [None]:
sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

## Lasso with feature selection

In [8]:
data.drop(['YearRemodAdd', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2',
           'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
           'TotRmsAbvGrd', 'GarageCars'], axis = 1, inplace = True)
data['Total_Porch'] = data['OpenPorchSF'] + data['EnclosedPorch'] + \
                      data['3SsnPorch'] + data['ScreenPorch']
data.drop(['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'],
           axis = 1, inplace = True)

In [45]:
y = data['SalePrice'].apply(lambda x: np.log(x))

In [46]:
x = data.drop(['SalePrice'], axis = 1)

In [47]:
x = pd.get_dummies(x, drop_first = True)

In [48]:
train_scores = []
test_scores = []
alpha_list = []

In [49]:
for i in range(10):
    np.random.seed(i)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_scaled = MinMaxScaler()
    x_train = x_scaled.fit_transform(x_train)
    lasso_model = linear_model.Lasso()
    params = {'alpha': np.linspace(1e-6, 1e-3, 1000)}
    grid_search = GridSearchCV(linear_model.Lasso(max_iter = 1000000), params, cv = 5)
    grid_search.fit(x_train, y_train)
    train_scores.append(grid_search.score(x_train, y_train))
    x_test = x_scaled.fit_transform(x_test)
    test_scores.append(grid_search.score(x_test, y_test))
    alpha_list.append(grid_search.best_estimator_.alpha)
    pickle.dump(grid_search, open('lasso{num}.pickle'.format(num = i), 'wb'))

In [50]:
train_scores

[0.9299583002552687,
 0.9378698103302403,
 0.9425603208321138,
 0.917653815911975,
 0.9460390754450917,
 0.9427712068034613,
 0.9378009884880388,
 0.9291332663888276,
 0.9438321000649128,
 0.9301489669479803]

In [51]:
test_scores

[0.9103035926344657,
 0.9119986523509891,
 0.8364559115064465,
 0.9053757599298827,
 0.8762514895937815,
 0.8396936203149413,
 0.9081879984295078,
 0.857637382574845,
 0.8782579600583194,
 0.9127110144706014]

In [52]:
alpha_list

[0.0002690000000000001,
 0.00021000000000000004,
 0.00016500000000000003,
 0.0007500000000000001,
 0.000142,
 0.00017100000000000004,
 0.00024400000000000005,
 0.0004090000000000001,
 0.00025200000000000005,
 0.0003940000000000001]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [None]:
x_scaled = MinMaxScaler()
x_train = x_scaled.fit_transform(x_train)

In [None]:
lasso_model = linear_model.Lasso()

params = {'alpha': np.linspace(1e-6, 1e-3, 1000)}

grid_search = GridSearchCV(linear_model.Lasso(max_iter = 1000000),
                          params,
                          cv = 5)

In [None]:
grid_search.fit(x_train, y_train)

In [None]:
grid_search.score(x_train, y_train)

In [None]:
x_test = x_scaled.fit_transform(x_test)

In [None]:
grid_search.score(x_test, y_test)

In [None]:
coefs = grid_search.best_estimator_.coef_

In [None]:
grid_search.best_estimator_.alpha

In [None]:
coef_list = list(zip(coefs, x.columns))

In [None]:
sorted(coef_list, key = lambda x: abs(x[0]), reverse = True)

In [None]:
best_coefs = pd.Series(lasso.coef_, index = x.columns)

In [None]:
best_alpha

In [None]:
max(trainR2)

In [None]:
max(testR2)

## Tree models

In [10]:
from sklearn import tree
from sklearn import ensemble
from sklearn import ensemble
randomForest = ensemble.RandomForestRegressor()

In [11]:
x = pd.get_dummies(x, drop_first = True)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [13]:
grid_para_forest = [{
    "max_depth": range(1,40),
    "n_estimators": range(100,2000,100),
#    "min_samples_leaf": range(1, 10),
#    "min_samples_split": np.linspace(start=2, stop=30, num=15, dtype=int),
    "random_state": [42]}]
grid_search_forest = GridSearchCV(randomForest, grid_para_forest, cv=5, n_jobs=-1)
grid_search_forest.fit(x_train, y_train)
pickle.dump(grid_search_forest, open('random_forest.pickle', 'wb'))



In [14]:
grid_search_forest.score(x_train, y_train)

0.9847671540111933

In [15]:
grid_search_forest.score(x_test, y_test)

0.8789236483558582

In [53]:
grid_search_forest.best_estimator_.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_depth=14, n_estimators=500, random_state=42)>

## Gradient Boosting

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [34]:
gbm = GradientBoostingRegressor()

In [41]:
grid_para_boosting = [{
    "max_depth": range(1,10),
    "n_estimators": range(100,1000,100),
#    "min_samples_leaf": range(1, 10),
#    "min_samples_split": np.linspace(start=2, stop=30, num=15, dtype=int),
    "learning_rate": np.linspace(1e-3, 1, 10),
    "random_state": [42]}]
grid_search_boosting = GridSearchCV(gbm, grid_para_boosting, cv=5, n_jobs=-1)
grid_search_boosting.fit(x_train, y_train)
pickle.dump(grid_search_boosting, open('gradient_boosting.pickle', 'wb'))



In [42]:
grid_search_boosting.score(x_train, y_train)

0.9952901638576778

In [43]:
grid_search_boosting.score(x_test, y_test)

0.9130658497991263

In [44]:
grid_search_boosting.best_estimator_.get_params

<bound method BaseEstimator.get_params of GradientBoostingRegressor(learning_rate=0.112, max_depth=4, n_estimators=400,
                          random_state=42)>