# Import basic libraries and load the data

Import basic required libraries

In [1]:
import pandas as pd
import numpy as np
import math

Load training data

In [2]:
train_csv = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Housing Prices/train_housing.csv")

Set Id column to be index of dataframe

In [None]:
train_csv.set_index("Id")

# Visualize the data (optional)

Take a quick glance at data

In [None]:
train_csv.head()

In [None]:
train_csv.info()

In [None]:
train_csv.describe()

Plot numerical data

In [None]:
import matplotlib.pyplot as plt


plt.rc("font", size = 14)
plt.rc("axes", labelsize = 12, titlesize = 12)

train_csv.hist(bins = 50, figsize = (24,16))
plt.show()

# Look for correlations between the features and the sale price (optional)

Look for which attributes correlate with target variable

In [None]:
corr_matrix = train_csv.corr()
corr_matrix.SalePrice.sort_values(ascending=False)

In [None]:
train_csv.plot(kind="scatter", x="OverallQual", y="SalePrice", alpha=0.1)

Create copy of data to test new features and test their correlation to the target value

In [None]:
corr_train_csv = train_csv
corr_train_csv["LotAreaPerOvrCond"] = corr_train_csv["LotArea"] / corr_train_csv["OverallCond"]
corr_train_csv["1stFlrVs2ndFlr"] = corr_train_csv["1stFlrSF"] - corr_train_csv["2ndFlrSF"]
corr_train_csv["TotalGrade"] = corr_train_csv["OverallCond"] + corr_train_csv["OverallQual"]
corr_train_csv["FullAndHalfBsmtBath"] = corr_train_csv["BsmtFullBath"] + corr_train_csv["BsmtHalfBath"] 

Create new correaltion matrix and check correlation with new features

In [None]:
new_corr_matrix = corr_train_csv.corr()
new_corr_matrix.SalePrice.sort_values(ascending=False)

# Prepare training data by creating features and labels and numerical and string features

Seperate data from target values

In [4]:
housingFeatures = train_csv.drop("SalePrice", axis = 1)
housingLabels = train_csv.SalePrice

Seperate numerical features from string features

In [7]:
numFeats = housingFeatures.select_dtypes(exclude="object").columns
strFeats = housingFeatures.select_dtypes(include="object").columns

# Create custom class and methods to deal with numerical features

Define a custom transformer for numerical features

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin


listOfNumFeats = numFeats.to_list()

class New_Attributes(BaseEstimator, TransformerMixin):
  def __init__(self, add_new_features = True):
    self.add_new_features = add_new_features
  def fit(self, X, y=None):
    return self
  def transform(self, X, y=None):
    if self.add_new_features == True:
      LotAreaPerOvrCond = X[:, listOfNumFeats.index("LotArea")] / X[:, listOfNumFeats.index("OverallCond")]
      FirstFlrVsSecondFlr = X[:, listOfNumFeats.index("1stFlrSF")] - X[:, listOfNumFeats.index("2ndFlrSF")]
      TotalGrade = X[:, listOfNumFeats.index("OverallCond")] + X[:, listOfNumFeats.index("OverallQual")]
      FullAndHalfBsmtBath = X[:, listOfNumFeats.index("BsmtFullBath")] + X[:, listOfNumFeats.index("BsmtHalfBath")]
      return np.c_[X, LotAreaPerOvrCond, FirstFlrVsSecondFlr, TotalGrade, FullAndHalfBsmtBath]

Define method to handle numerical columns with some string values

In [9]:
def string_to_numerical(column):
  column = pd.to_numeric(column, errors="coerce")
  median = np.nanmedian(column)
  return column.fillna(median)

# Create data pipeline

Create data pipelines for numerical features and string features

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


num_pipeline = Pipeline([("imputer", SimpleImputer(strategy="median")), 
                         ("newAttr", New_Attributes()),
                         ("stdScaler", StandardScaler())])

str_pipeline = Pipeline([("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
                         ("oneHot", OneHotEncoder(handle_unknown="ignore"))])

Create full uniform data pipeline

In [11]:
from sklearn.compose import ColumnTransformer


data_pipeline = ColumnTransformer([("num", num_pipeline, numFeats),
                                   ("str", str_pipeline, strFeats)])
trainTransformed = data_pipeline.fit_transform(train_csv)

# Train multiple models and evaulate with the training set

Train a linear regression model and evaluate on the training set

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


linRegModel = LinearRegression()
linRegModel.fit(trainTransformed, housingLabels)

housingPredictionsLinReg = linRegModel.predict(trainTransformed)
linRegMSE = mean_squared_error(housingLabels, housingPredictionsLinReg)
linRegRMSE = math.sqrt(linRegMSE)

print("The RMSE based on a sample of the training data is: ")
print(linRegRMSE)

The RMSE based on a sample of the training data is: 
20509.962973173017


Train a decision tree regression model and evaluate on the training set

In [13]:
from sklearn.tree import DecisionTreeRegressor

decTreeModel = DecisionTreeRegressor()
decTreeModel.fit(trainTransformed, housingLabels)

housingPredictionsDecTree = decTreeModel.predict(trainTransformed)
decTreeMSE = mean_squared_error(housingLabels, housingPredictionsDecTree)
decTreeRMSE = math.sqrt(decTreeMSE)

print("The RMSE based on a sample of the training data is: ")
print(decTreeRMSE)

The RMSE based on a sample of the training data is: 
0.0


Train a random forest regression model and evaluate on the training set

In [14]:
from sklearn.ensemble import RandomForestRegressor

randomForestModel = RandomForestRegressor()
randomForestModel.fit(trainTransformed, housingLabels)

housingPredictionsRandomForest = randomForestModel.predict(trainTransformed)
randomForestMSE = mean_squared_error(housingLabels, housingPredictionsDecTree)
randomForestRMSE = math.sqrt(randomForestMSE)

print("The RMSE based on a sample of the training data is: ")
print(randomForestRMSE)

The RMSE based on a sample of the training data is: 
0.0


# Test the models with better methods

Test the models using cross-validation

In [15]:
from sklearn.model_selection import cross_val_score

linRegKFoldScores = cross_val_score(linRegModel, trainTransformed, housingLabels, scoring="neg_mean_squared_error", cv=10)
linRegRMSE = np.sqrt(-linRegKFoldScores)
decTreeKFoldScores = cross_val_score(decTreeModel, trainTransformed, housingLabels, scoring="neg_mean_squared_error", cv=10)
decTreeRMSE = np.sqrt(-decTreeKFoldScores)
randomForestKFoldScores = cross_val_score(randomForestModel, trainTransformed, housingLabels, scoring="neg_mean_squared_error", cv=10)
randomForestRMSE = np.sqrt(-randomForestKFoldScores)

def print_kfold_scores(modelScores):
  print("Scores: ", modelScores)
  print("Mean: ", modelScores.mean())
  print("Standard Deviation: ", modelScores.std())

print("Linear Regression Model:")
print_kfold_scores(linRegRMSE)
print("")
print("Decision Tree Model:")
print_kfold_scores(decTreeRMSE)
print("")
print("Random Forest Model:")
print_kfold_scores(randomForestRMSE)

Linear Regression Model:
Scores:  [ 23889.3737259   34682.04022781  23887.48863947  41793.03086404
  30277.88733018  44431.84890643  24271.42117851  22721.21812131
  67921.30743632 106646.79415449]
Mean:  42052.24105844713
Standard Deviation:  25278.536069081307

Decision Tree Model:
Scores:  [34717.76865413 39990.15930458 31259.13745381 41653.81533512
 41365.09545093 36972.30090466 31941.95646644 31882.47149576
 48691.5559371  37568.31789353]
Mean:  37604.25788960654
Standard Deviation:  5224.52916473791

Random Forest Model:
Scores:  [25760.68597705 26269.62257968 22526.11695281 38990.9670051
 33097.26979665 26181.38327914 24276.99588896 24036.4149626
 43144.99613047 26797.46549972]
Mean:  29108.19180721786
Standard Deviation:  6604.315883943053


Evaluate the best model on the test set

In [21]:
test_csv = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Housing Prices/test_housing.csv")

testFeaturesTransformed = data_pipeline.transform(test_csv)
testPredictions = randomForestModel.predict(testFeaturesTransformed)

# Create the csv file for submission

Create a function to transform an array of predictions into a dataframe

In [49]:
def predictions_to_df(predictions):
  testId = test_csv.Id

  idSeries = pd.Series(testId)
  predictionsSeries = pd.Series(predictions)
  df = pd.DataFrame({"Id":idSeries, "SalePrice":predictionsSeries})

  df.set_index("Id")

  return df

Create csv of predictions

In [53]:
finalDf = predictions_to_df(testPredictions)
finalDf.to_csv("submission.csv", index=False)