In [1]:
import pandas as pd
# Load the cleaned dataset that the models below are fitted on, then preview
# its first rows.
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,...,MiscFeature_2,MiscFeature_4,SaleType_COD,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Normal,SaleCondition_Partial,Id
0,60,65.0,8450,7,5,2003,2003,196.0,706,150,...,0,1,0,0,1,0,0,1,0,1
1,20,80.0,9600,6,8,1976,1976,0.0,978,284,...,0,1,0,0,1,0,0,1,0,2
2,60,68.0,11250,7,5,2001,2002,162.0,486,434,...,0,1,0,0,1,0,0,1,0,3
3,70,60.0,9550,7,5,1915,1970,0.0,216,540,...,0,1,0,0,1,1,0,0,0,4
4,60,84.0,14260,8,5,2000,2000,350.0,655,490,...,0,1,0,0,1,0,0,1,0,5


In [2]:
from matplotlib import pyplot as plt

# Histogram of the sale price, drawn on an explicitly created 5x5 inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
df['SalePrice'].hist(ax=ax, bins=50, facecolor='green', alpha=0.5)
ax.set_xlabel('House Price')
ax.set_ylabel('Freq.')
plt.show()

<Figure size 500x500 with 1 Axes>

In [3]:
# Discard rows priced above 450000 or below 50000, and remove the Id column
# so it is not used as a feature.
price = df['SalePrice']
is_outlier = (price > 450000) | (price < 50000)
df = df.loc[~is_outlier].drop(columns=['Id'])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# The target is the natural log of SalePrice; every other column is a feature.
target = df['SalePrice']
features = df.drop(columns=['SalePrice'])
y = np.log(target.values)
x = features.values

from sklearn.decomposition import PCA

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(x_train)
x_scaled, x_test_scaled = map(scaler.transform, (x_train, x_test))

# Fit PCA on the scaled training split. A fractional n_components asks
# scikit-learn to keep enough components to explain that share of the variance.
pca = PCA(n_components=0.97).fit(x_scaled)
x_pca, x_test_pca = map(pca.transform, (x_scaled, x_test_scaled))

# Optional switches: uncomment one pair to train on the transformed features
# instead of the raw ones.
# x_train = x_pca
# x_test = x_test_pca

# x_train = x_scaled
# x_test = x_test_scaled

In [6]:
# Helper functions for evaluating many different regression models

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import numpy as np

def get_houses_eval_metric(y_actual, y_pred):
    """Return the root-mean-squared error between the logs of two series.

    Computes sqrt(mean((log(y_actual) - log(y_pred)) ** 2)) over all elements.

    Args:
        y_actual: Array-like of observed values.
        y_pred: Array-like of predicted values, one per observed value.

    Returns:
        The metric as a NumPy float.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    actual = np.asarray(y_actual, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # A length mismatch would otherwise pair up only a prefix of the data
    # while still averaging over the full length, giving a wrong score.
    if actual.size != predicted.size:
        raise ValueError(
            'y_actual and y_pred must have the same length, got {} and {}'.format(
                actual.size, predicted.size
            )
        )
    if actual.size == 0:
        raise ValueError('y_actual and y_pred must not be empty')

    log_error = np.log(actual) - np.log(predicted)
    return np.sqrt(np.mean(log_error ** 2))

# Print out machine learning model metrics
def evaluation(y_actual, y_pred):
    """Print and return regression metrics for log-scale targets and predictions.

    Both inputs are exponentiated first, so R2, MSE and RMSE are computed on
    the exponentiated values; the house metric is then computed from those
    same exponentiated values.

    Args:
        y_actual: Array-like of true values, before exponentiation.
        y_pred: Array-like of predicted values, before exponentiation.

    Returns:
        The list ``[r2, mse, rmse, houses_metric]``.
    """
    actual_prices = np.exp(y_actual)
    predicted_prices = np.exp(y_pred)

    r2 = r2_score(actual_prices, predicted_prices)
    mse = mean_squared_error(actual_prices, predicted_prices)
    rmse = mse ** 0.5
    houses_metric = get_houses_eval_metric(actual_prices, predicted_prices)

    # (label, value, decimals) rows for the printed report
    report_rows = (
        ('R2          : ', r2, 4),
        ('RMSE        : ', rmse, 2),
        ('HOUSE METRIC: ', houses_metric, 4),
    )
    for label, value, decimals in report_rows:
        print(label, round(value, decimals))

    return [r2, mse, rmse, houses_metric]
      
def try_regressor(regressor, name):
    """Fit a regressor on the training split and report its test-set metrics.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``
    arrays; they are only read here, never rebound.

    Args:
        regressor: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Label printed in the report and appended to the returned row.

    Returns:
        The list returned by ``evaluation`` with ``name`` appended, i.e.
        ``[r2, mse, rmse, houses_metric, name]``.
    """
    regressor.fit(x_train, y_train)

    # Predict once and reuse the result for both the report and the summary line.
    y_pred = regressor.predict(x_test)

    print('Evaluation for', name, '\n')
    metrics = evaluation(y_test, y_pred)

    # This RMSE uses y_test and the predictions exactly as stored, whereas
    # evaluation() exponentiates both before scoring.
    test_mse = mean_squared_error(y_test, y_pred)
    print("Best rmse: ", test_mse**(1/2))
    return metrics + [name]

# Collects one metrics row per evaluated regressor
regressor_results = []

In [7]:
# Random forest baseline. The seed makes the fitted forest, and therefore the
# reported scores, repeatable across runs.
rf_regr = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, random_state=42)

regressor_results.append(try_regressor(rf_regr, 'Random Forest'))

Evaluation for Random Forest 

R2          :  0.8893
RMSE        :  21257.53
HOUSE METRIC:  0.1156
Best rmse:  0.11563592272988517


In [8]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with default hyper-parameters. The seed makes the reported scores
# repeatable across runs.
ada_regr = AdaBoostRegressor(random_state=42)

regressor_results.append(try_regressor(ada_regr, "Ada Boost"))

Evaluation for Ada Boost 

R2          :  0.8141
RMSE        :  27550.32
HOUSE METRIC:  0.1554
Best rmse:  0.15543821660855733


In [9]:
# Import from the public package: the private module path
# sklearn.ensemble.gradient_boosting is not available in current scikit-learn.
from sklearn.ensemble import GradientBoostingRegressor

# Many shallow boosting steps with a small learning rate. The seed makes the
# fitted model, which is also used for the submission, repeatable.
gb_regr = GradientBoostingRegressor(
    min_samples_leaf=3,
    n_estimators=3500,
    learning_rate=0.01,
    random_state=42,
)

regressor_results.append(try_regressor(gb_regr, "Gradient Boosting"))

Evaluation for Gradient Boosting 

R2          :  0.9105
RMSE        :  19118.38
HOUSE METRIC:  0.1044
Best rmse:  0.10441140539948468


In [10]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Ridge regression with scikit-learn's default settings.
lr_regr = Ridge()

ridge_row = try_regressor(lr_regr, "Ridge Regression")
regressor_results.append(ridge_row)

Evaluation for Ridge Regression 

R2          :  0.8969
RMSE        :  20518.88
HOUSE METRIC:  0.1141
Best rmse:  0.11405025906758758


In [11]:
from sklearn.kernel_ridge import KernelRidge

# Kernel ridge regression with a linear kernel.
# NOTE(review): per the scikit-learn docs, gamma, degree and coef0 are ignored
# by the linear kernel; confirm before tuning them.
kr_regr = KernelRidge(
    kernel='linear',
    alpha=1.0,
    gamma=None,
    degree=3,
    coef0=1,
)

kernel_ridge_row = try_regressor(kr_regr, "Kernal Ridge Regression")
regressor_results.append(kernel_ridge_row)

Evaluation for Kernal Ridge Regression 

R2          :  0.8996
RMSE        :  20252.23
HOUSE METRIC:  0.1144
Best rmse:  0.11437226568465196


In [14]:
# Re-read the CSV from disk. This rebinds df, replacing the frame that earlier
# cells filtered and stripped of its Id column.
df = pd.read_csv('clean_data.csv')

In [15]:
# Preview the first rows of the reloaded frame.
df.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtUnfSF,...,MiscFeature_2,MiscFeature_4,SaleType_COD,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Normal,SaleCondition_Partial,Id
0,60,65.0,8450,7,5,2003,2003,196.0,706,150,...,0,1,0,0,1,0,0,1,0,1
1,20,80.0,9600,6,8,1976,1976,0.0,978,284,...,0,1,0,0,1,0,0,1,0,2
2,60,68.0,11250,7,5,2001,2002,162.0,486,434,...,0,1,0,0,1,0,0,1,0,3
3,70,60.0,9550,7,5,1915,1970,0.0,216,540,...,0,1,0,0,1,1,0,0,0,4
4,60,84.0,14260,8,5,2000,2000,350.0,655,490,...,0,1,0,0,1,0,0,1,0,5


In [16]:
# Load the rows that submission predictions are computed from further below.
df_test = pd.read_csv('clean_test.csv')

In [17]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,PoolQC_3,MiscFeature_2,MiscFeature_4,SaleType_COD,SaleType_New,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,1,0,1,0,0,1,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,1,0,1,0,0,1,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,1,0,1,0,0,1,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,1,0,1,0,0,1,1,0,0,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,1,0,1,0,0,1,0,0,1,0


In [18]:
# Load the sample submission file as a template; its SalePrice column is
# overwritten with model predictions in a later cell.
df_sub = pd.read_csv('sample_submission.csv')
df_sub.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [19]:
# The models were fitted on a bare array (df without Id and SalePrice), so the
# submission features must have the same columns in the same order. Select them
# by name rather than trusting the column order of clean_test.csv.
feature_cols = df.drop(columns=['Id', 'SalePrice'], errors='ignore').columns
missing_cols = feature_cols.difference(df_test.columns)
if len(missing_cols) > 0:
    raise ValueError(
        'clean_test.csv lacks training feature columns: {}'.format(list(missing_cols))
    )
x_submit = df_test[feature_cols].values

# The target was np.log(SalePrice), so predictions are exponentiated to
# recover prices.
df_sub['SalePrice'] = np.exp(gb_regr.predict(x_submit))

In [20]:
# Write the submission frame to disk without the DataFrame index column.
df_sub.to_csv('submission.csv', index=False)