In [476]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.base import TransformerMixin
import mlxtend
from scipy import stats
from sklearn import ensemble

In [477]:
# Importing and Inspecting Data:
housing_data = pd.read_excel("training.xls")
test = pd.read_csv('randTestHousing-NoSale.csv')
print(housing_data)

# Removing NaNs
housing_data = housing_data.fillna(housing_data.mean()) # Removes NaNs for numerical data (mean imputation)
test = test.fillna(test.mean())
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

housing_data = DataFrameImputer().fit_transform(housing_data) # Removes NaNs for categorical columns (mode imputation)
test = DataFrameImputer().fit_transform(test)
print(housing_data)
# Isolating Numerical Columns:
numerical = housing_data.select_dtypes(include=[np.number])
# Isolating Categorical Columns:
categorical = ['MS Zoning', 'Street', 'Alley', 'Lot Shape',

                                'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',

                                'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',

                               'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'Bsmt Qual',

                               'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',

                               'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type', 'Sale Condition']
            
'''
# Normalize:
#scaler = StandardScaler()
#filled[numerical.columns] = scaler.fit_transform(filled[numerical.columns])
# One hot encoding:
housing_data = pd.get_dummies(housing_data)
'''

# Hashing
label_encoder = LabelEncoder()
for i in categorical:
    housing_data[i] = label_encoder.fit_transform(housing_data[i])
    test[i] = label_encoder.fit_transform(test[i])
'''           
One hot encoding:
housing_data = pd.get_dummies(housing_data)  
'''
nan_check = []
for i in housing_data.isna().sum(): # Checks that there are no NaN values in the data anymore
    nan_check.append(i)
print(nan_check)
print(housing_data['SalePrice'].isna().sum())


      MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley Lot Shape  \
0              60        RL          71.0      8220   Pave   NaN       IR1   
1              20        RL          87.0     10037   Pave   NaN       Reg   
2              60        RL          46.0     20544   Pave   NaN       IR1   
3              20        RL         129.0     16870   Pave   NaN       IR1   
4             120        RM           NaN      4224   Pave   NaN       Reg   
...           ...       ...           ...       ...    ...   ...       ...   
1695           20        RL           NaN     16635   Pave   NaN       IR1   
1696           20        RL          60.0      7200   Pave   NaN       Reg   
1697           20        RL          74.0      7785   Pave   NaN       IR1   
1698           80        RL          65.0      8385   Pave   NaN       Reg   
1699           80        RL          74.0      9620   Pave   NaN       Reg   

     Land Contour Utilities Lot Config  ... Pool Area Pool QC  

In [478]:
# Outlier Removal:
'''
z = np.abs(stats.zscore(housing_data))
outlier_rows = set(np.where(z > 4)[0])
housing_data['Id'] = range(0, len(data))
housing_data = housing_data.set_index('Id')
housing_data = housing_data.drop(outlier_rows)
print("DROPPED ", len(outlier_rows), " OUTLIERS")
'''
# Probably removes too many rows


'\nz = np.abs(stats.zscore(housing_data))\noutlier_rows = set(np.where(z > 4)[0])\nhousing_data[\'Id\'] = range(0, len(data))\nhousing_data = housing_data.set_index(\'Id\')\nhousing_data = housing_data.drop(outlier_rows)\nprint("DROPPED ", len(outlier_rows), " OUTLIERS")\n'

In [479]:
# Train Test Split:
housing_data = housing_data[['SalePrice', 'Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Bldg Type', 'Bsmt Qual', 'Bsmt Unf SF', 'BsmtFin Type 1', 'Exter Qual', 'Foundation', 'Heating QC', 'Kitchen Qual', 'Lot Shape', 'Total Bsmt SF']]
test = test[['Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Bldg Type', 'Bsmt Qual', 'Bsmt Unf SF', 'BsmtFin Type 1', 'Exter Qual', 'Foundation', 'Heating QC', 'Kitchen Qual', 'Lot Shape', 'Total Bsmt SF']]
#housing_data = housing_data[['SalePrice', 'Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd']]
#test = test[['Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd']]
y = housing_data['SalePrice']
X = housing_data.loc[:, housing_data.columns != 'SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3) 
# Gradient Boosting: (I just experiment with different methods here)
params = {'n_estimators': 500, 'max_depth': 3, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
gdboost = ensemble.GradientBoostingRegressor(**params)
#reg = BaggingRegressor(n_estimators=10,max_samples=0.5, max_features=0.5)
#voting = VotingRegressor(estimators=[('gb', gdboost), ('bg', reg), ('las', lasso)])
gdboost.fit(X_train, y_train)
# Linear Regression:
linreg = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elastic = ElasticNet()
#lasso.fit(X_train,y_train)
pred = gdboost.predict(X_test)
y_test = np.array(y_test)
#print(y_test.isna().sum()
print(pred)
print(y_test)
mse = mean_squared_error(pred, y_test)
rmse = sqrt(mse)
print(rmse)

[328629.04390449 210799.43038007 172956.31341053 163460.9798267
 308857.42193262 271579.63489163 122544.94224844 106664.31803587
 120162.45761618 116768.54379466 338594.02440444 106371.33533459
 122142.39869351 227905.86531653 156500.33578025 154563.54404997
 117039.58929366 186459.32036626 111571.37188045 327480.39293213
 198405.27380348 165475.67796469 188940.95817876 375833.55084514
 102869.97179975 192231.60951332 181572.54066793 211394.39758306
 146164.95752576 108180.22937785 181734.75969626 402453.57423667
 126887.41904435 161780.73161994 337927.28035804 121998.44731452
 279201.3465107  361913.1452305  164918.30547079 134604.60662338
  80903.06203131 147862.0199018  315331.19969729 125693.01088646
 247818.6414708  177094.58621628 186202.08332221 324205.70678857
 146377.97681754 150011.09546349 189140.63571442 177536.58504844
 160758.50001964 124237.28836146 226592.11709241 168319.69254461
 163603.86992437 126767.07235093 259078.14627854 145787.52632398
 181235.59159704 105802.69

In [480]:
# Test on the testing data:
test_pred = gdboost.predict(test)
print(test_pred)
print(len(test_pred))

[168067.03878179  72106.97025538 186918.39602963 ... 201795.33703592
 127410.27989841 225490.85826284]
1230


In [481]:
with open('patrick_predictions.csv', 'w') as csv:
    csv.write('Id,SalePrice' + '\n')
    for i, pred in enumerate(test_pred):
        csv.write(str(i+1) + ',' + str(pred) + '\n')