In [1]:
import pandas as pd
df = pd.read_csv('clean_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,MoSold_8,MoSold_9,MoSold_10,MoSold_11,MoSold_12,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010
0,0,60,65.0,8450,7,5,196.0,706,0,150,...,0,0,0,0,0,0,0,1,0,0
1,1,20,80.0,9600,6,8,0.0,978,0,284,...,0,0,0,0,0,0,1,0,0,0
2,2,60,68.0,11250,7,5,162.0,486,0,434,...,0,1,0,0,0,0,0,1,0,0
3,3,70,60.0,9550,7,5,0.0,216,0,540,...,0,0,0,0,0,1,0,0,0,0
4,4,60,84.0,14260,8,5,350.0,655,0,490,...,0,0,0,0,1,0,0,1,0,0


In [2]:
from matplotlib import pyplot as plt

df['SalePrice'].hist(bins=50, facecolor='green', alpha=0.5)
plt.xlabel('House Price')
plt.ylabel('Freq.')
fig = plt.gcf()
fig.set_size_inches(5,5)
plt.show()

<Figure size 500x500 with 1 Axes>

In [3]:
df = df.drop(df[df['SalePrice'] > 580000].index)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

y = df['SalePrice'].values
x = df.drop(columns=['SalePrice']).values

from sklearn.decomposition import PCA

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [5]:
scaler = StandardScaler()
scaler.fit(x_train)
x_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

pca = PCA(n_components=0.99)
pca.fit(x_scaled)

x_pca = pca.transform(x_scaled)
x_test_pca = pca.transform(x_test_scaled)

x_pca = x_train
x_test_pca = x_test

In [6]:
# Here we make functions in order to evaluate many different classification models

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Print out machine learning model metrics
def evaluation(y_actual, y_pred):
    
    r2 = r2_score(y_actual, y_pred)
    mse = mean_squared_error(y_actual, y_pred)
    rmse = mse**(1/2)
    print('R2          : ', round(r2, 4))
    print('RMSE        : ', round(rmse, 0))
    return [r2, mse, rmse]
      
def try_regressor(regressor, name):
    # Specify that these are global variables
    global x_train, y_train
    global x_test, y_test
    regressor.fit(x_train, y_train)
    print('Evaluation for', name, '\n')
    # Evaluate the test set
    metrics = evaluation(y_test, regressor.predict(x_test))
    return (metrics + [name])

# Make a list to store model evaluation metrics
regressor_results = list()

In [8]:
rf_regr = RandomForestRegressor(n_estimators=100, min_samples_leaf=3)

regressor_results.append(try_regressor(rf_regr, 'Random Forest'))

Evaluation for Random Forest 

R2          :  0.9042
RMSE        :  23131.0


In [11]:
from sklearn.ensemble import AdaBoostRegressor

ada_regr = AdaBoostRegressor()

regressor_results.append(try_regressor(ada_regr, "Ada Boost"))

Evaluation for Ada Boost 

R2          :  0.8406
RMSE        :  29840.0


In [13]:
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor

gb_regr = GradientBoostingRegressor()

regressor_results.append(try_regressor(gb_regr, "Gradient Boosting"))

Evaluation for Gradient Boosting 

R2          :  0.9148
RMSE        :  21810.0
