In [1]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [16]:
from sklearn.model_selection import StratifiedKFold
import xgboost

In [3]:
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv("E:\\Data Science\\Datasets\\HousePrices\\train_data0909.csv")
test = pd.read_csv("E:\\Data Science\\Datasets\\HousePrices\\test_data0909.csv")

In [3]:
train_lab = train.pop('SalePrice')

In [4]:
train_feat = np.array(train)

In [5]:
train_feat

array([[-0.29333824, -0.08625604,  1.16564572, ..., -0.81172989,
        -0.36003761,  0.35849803],
       [-0.08102604, -0.08625604, -0.79437517, ...,  0.28215182,
        -0.36003761, -0.05582134],
       [ 0.22359582, -0.08625604,  1.193187  , ..., -0.63929533,
        -0.36003761,  0.64098851],
       ...,
       [-0.18404361,  4.94698484,  1.84958744, ...,  0.08277437,
        -0.36003761, -1.03512166],
       [-0.05942558, -0.08625604, -0.79437517, ..., -0.21359752,
         1.46916417, -1.09161976],
       [-0.01880933, -0.08625604, -0.79437517, ...,  0.26598608,
        -0.36003761, -0.92212547]])

In [6]:
results = []

def forest(X_train, y_train, X_test, y_test, n_est, depth, leaf_samples, features, res_list):
    model = RandomForestRegressor(n_estimators=n_est, max_depth=depth, max_features=features, min_samples_leaf=leaf_samples)
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    score = np.sqrt(mean_squared_error(predictions, y_test))
    
    params = {}
    params["n_est"] = n_est
    params["depth"] = depth
    params["score"] = score
    
    res_list.append(params)

def tree(X_train, y_train, X_test, y_test, depth, res_list):
    model = DecisionTreeRegressor(max_depth=depth)
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    score = np.sqrt(mean_squared_error(predictions, y_test))
    
    params = {}
    params["n_est"] = 1
    params["depth"] = depth
    params["score"] = score
    
    res_list.append(params)

In [23]:
def xgb_func(X, y, X_test, y_test, k, est, depth, lr=[0.0015]):
    model = xgboost.XGBRegressor()
    
    parameters = {'n_estimators':est, 
                  'learning_rate': lr,
                  'max_depth':depth}
    
    xgb = GridSearchCV(model, parameters, 
                      scoring = 'neg_mean_squared_error',
                      cv = k)
    
    xgb.fit(X, y)
    
    predictions = model.predict(X_test)
    score = np.sqrt(mean_squared_error(predictions, y_test))
    
    params = {}
    params["flag"] = "xgb"
    params["n_est"] = est
    params["depth"] = depth
    params["score"] = score
    
    res_list.append(params)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_feat, train_lab, test_size=0.33)

In [8]:
test.min()

LotArea         -1.684821
MiscVal         -0.092212
2ndFlrSF        -0.774988
PoolArea        -0.057207
MasVnrArea      -0.563123
LowQualFinSF     0.000000
GrLivArea        6.008813
TotalBsmtSF     -2.356667
3SsnPorch        0.000000
ScreenPorch      0.000000
WoodDeckSF      -0.729382
1stFlrSF        -1.882468
EnclosedPorch   -0.360615
GarageArea      -2.173890
dtype: float64

In [9]:
tree_depths = [2,3,4,5,6,7,8,10,12,15,20]

for depth in tree_depths:
    tree(X_train, y_train, X_test, y_test, depth=depth, res_list=results)

In [10]:
results

[{'n_est': 1, 'depth': 2, 'score': 0.26112671595102105},
 {'n_est': 1, 'depth': 3, 'score': 0.24517237402024727},
 {'n_est': 1, 'depth': 4, 'score': 0.22639271402185765},
 {'n_est': 1, 'depth': 5, 'score': 0.22585539153243228},
 {'n_est': 1, 'depth': 6, 'score': 0.21749408183494134},
 {'n_est': 1, 'depth': 7, 'score': 0.2207615613168952},
 {'n_est': 1, 'depth': 8, 'score': 0.23469550652338414},
 {'n_est': 1, 'depth': 10, 'score': 0.23450491670010123},
 {'n_est': 1, 'depth': 12, 'score': 0.2525763454027633},
 {'n_est': 1, 'depth': 15, 'score': 0.2527956601330888},
 {'n_est': 1, 'depth': 20, 'score': 0.24537795384450542}]

In [11]:
estimators = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 100, 150, 200, 250, 300, 400, 500, 1000]

for est in estimators:
    for depth in tree_depths:
        forest(X_train, y_train, X_test, y_test, n_est=est, depth=depth, leaf_samples=2, features=5, res_list=results)

In [12]:
results

[{'n_est': 1, 'depth': 2, 'score': 0.26112671595102105},
 {'n_est': 1, 'depth': 3, 'score': 0.24517237402024727},
 {'n_est': 1, 'depth': 4, 'score': 0.22639271402185765},
 {'n_est': 1, 'depth': 5, 'score': 0.22585539153243228},
 {'n_est': 1, 'depth': 6, 'score': 0.21749408183494134},
 {'n_est': 1, 'depth': 7, 'score': 0.2207615613168952},
 {'n_est': 1, 'depth': 8, 'score': 0.23469550652338414},
 {'n_est': 1, 'depth': 10, 'score': 0.23450491670010123},
 {'n_est': 1, 'depth': 12, 'score': 0.2525763454027633},
 {'n_est': 1, 'depth': 15, 'score': 0.2527956601330888},
 {'n_est': 1, 'depth': 20, 'score': 0.24537795384450542},
 {'n_est': 5, 'depth': 2, 'score': 0.24045832703071937},
 {'n_est': 5, 'depth': 3, 'score': 0.21434531020845674},
 {'n_est': 5, 'depth': 4, 'score': 0.2094287162504363},
 {'n_est': 5, 'depth': 5, 'score': 0.20285941151756073},
 {'n_est': 5, 'depth': 6, 'score': 0.20181734469031676},
 {'n_est': 5, 'depth': 7, 'score': 0.19883939846400167},
 {'n_est': 5, 'depth': 8, 'scor

In [13]:
min_score = results[0]["score"]
line = results[0]

for res in results:
    if res["score"] < min_score:
        min_score = res["score"]
        line = res

In [14]:
line

{'n_est': 70, 'depth': 15, 'score': 0.18090162786202665}

In [63]:
model = RandomForestRegressor(n_estimators=line["n_est"], max_depth=line["depth"])
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=15,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=70,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [39]:
gar_mode = test["GarageCars"].mode()[0]
bsmt_mode = test["TotalBsmtSF"].mode().max()

0.0

In [40]:
test["GarageCars"].fillna(gar_mode, inplace=True)
test["TotalBsmtSF"].fillna(bsmt_mode, inplace=True)

In [64]:
test = np.array(test)
test

array([[5.00000000e+00, 6.79794041e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.96100000e+03, 3.00000000e+00],
       [6.00000000e+00, 7.19218206e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.95800000e+03, 3.00000000e+00],
       [5.00000000e+00, 7.39572161e+00, 2.00000000e+00, ...,
        2.00000000e+00, 1.99700000e+03, 3.00000000e+00],
       ...,
       [5.00000000e+00, 7.10987946e+00, 2.00000000e+00, ...,
        1.00000000e+00, 1.96000000e+03, 3.00000000e+00],
       [5.00000000e+00, 6.87729607e+00, 0.00000000e+00, ...,
        1.00000000e+00, 1.99200000e+03, 3.00000000e+00],
       [7.00000000e+00, 7.60090246e+00, 3.00000000e+00, ...,
        2.00000000e+00, 1.99300000e+03, 3.00000000e+00]])

In [65]:
preds = model.predict(test)

In [66]:
preds

array([11.69754571, 11.80525059, 11.95937253, ..., 11.81633696,
       11.54015541, 12.32028239])

In [67]:
df = pd.read_csv("E:\\Data Science\\Datasets\\HousePrices\\data_test.csv")

In [68]:
df.index.values + 1461

array([1461, 1462, 1463, ..., 2917, 2918, 2919], dtype=int64)

In [69]:
norm_preds = np.exp(preds)
norm_preds

array([120276.16010342, 133953.84695053, 156274.99582014, ...,
       135447.17084127, 102760.4063257 , 224197.44349856])

In [70]:
df_predictions = pd.DataFrame({"Id" : df.index.values + 1461, "SalePrice" : norm_preds})
df_predictions

Unnamed: 0,Id,SalePrice
0,1461,120276.160103
1,1462,133953.846951
2,1463,156274.995820
3,1464,183996.831618
4,1465,190162.529973
...,...,...
1454,2915,84266.651819
1455,2916,91639.539749
1456,2917,135447.170841
1457,2918,102760.406326


In [21]:
df_predictions = pd.DataFrame({"Id" : df.index.values + 1461, "SalePrice" : preds})
df_predictions

Unnamed: 0,Id,SalePrice
0,1461,122060.122846
1,1462,151333.260164
2,1463,162918.184327
3,1464,178862.249679
4,1465,208060.450855
...,...,...
1454,2915,87247.946193
1455,2916,85299.478415
1456,2917,146772.422136
1457,2918,126321.372358


In [71]:
df_predictions.to_csv("E:\\Data Science\\Datasets\\HousePrices\\predictions06.09.2143.csv", index=False)

In [72]:
q = pd.read_csv("E:\\Data Science\\Datasets\\HousePrices\\predictions06.09.2143.csv")
q

Unnamed: 0,Id,SalePrice
0,1461,120276.160103
1,1462,133953.846951
2,1463,156274.995820
3,1464,183996.831618
4,1465,190162.529973
...,...,...
1454,2915,84266.651819
1455,2916,91639.539749
1456,2917,135447.170841
1457,2918,102760.406326
