In [1]:
# Dependencies and setup
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.preprocessing import Imputer
%matplotlib inline

In [2]:
# Allow up to 100 rows to be shown before pandas truncates displayed output
pd.options.display.max_rows = 100

In [3]:
# Load datasets
# Both CSVs are read with their first column as the DataFrame index.
training_data = pd.read_csv("raw_data/train.csv", index_col=0)
# The testing frame must come from its own file: reading train.csv here would
# make every later "testing data" step operate on a second copy of the
# training rows.
testing_data = pd.read_csv("raw_data/test.csv", index_col=0)

## Cleaning

In [4]:
# Replace unusual values in DataFrame columns
#
# The MSZoning label "C (all)" is normalised to plain "C" in both frames.
# The replaced Series is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `df["MSZoning"]`: an inplace method on a
# selected column is chained assignment, which pandas warns about and which
# stops modifying the parent frame once copy-on-write is enabled.
for housing_df in (training_data, testing_data):
    housing_df["MSZoning"] = housing_df["MSZoning"].replace("C (all)", "C")

In [5]:
# Impute missing LotFrontage values with the median LotFrontage of the
# house's own Neighborhood. Medians are computed within each frame
# separately (training medians for training rows, testing medians for
# testing rows).
for housing_df in (training_data, testing_data):
    frontage_by_neighborhood = housing_df.groupby("Neighborhood")["LotFrontage"]
    housing_df["LotFrontage"] = frontage_by_neighborhood.transform(
        lambda frontage: frontage.fillna(frontage.median())
    )

In [6]:
# For some string variables, N/A means that the house does not have that
# feature at all. For example, N/A in "BsmtCond" (Basement Condition) means
# there is no basement. Those N/A values are replaced with the literal
# string "None".
#
# The column list is defined once and applied to both frames so the training
# and testing data cannot drift apart.
#
# NOTE(review): GarageYrBlt is treated as a number-like column by the later
# string-conversion step, and MiscVal looks like a numeric amount - filling
# either with the string "None" gives a mixed-type column wherever values are
# actually missing. Kept as in the original; confirm this is intended.
none_fill_columns = [
    "Alley",
    "MasVnrType",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "FireplaceQu",
    "GarageType",
    "GarageYrBlt",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
    "MiscVal",
]

for housing_df in (training_data, testing_data):
    # The filled subset already holds every value these columns should have,
    # so it is assigned straight back instead of being merged in through
    # DataFrame.update().
    housing_df[none_fill_columns] = housing_df[none_fill_columns].fillna("None")

In [7]:
# Some N/A values are better replaced with 0. For example, an N/A in
# BsmtFinSF1 (finished basement square feet) means the house has no basement,
# so the missing entries in the columns below are set to 0.
zero_fill_columns = [
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "TotalBsmtSF",
    "BsmtUnfSF",
]

# Same treatment for the training frame first, then the testing frame.
for housing_df in (training_data, testing_data):
    zero_filled = housing_df[zero_fill_columns].fillna(0)
    housing_df.update(zero_filled)

In [8]:
# In the "Electrical" column only a couple of values are missing and most
# homes share the same electrical type, so each N/A is replaced with the most
# common Electrical value within the same Neighborhood.
def _fill_with_group_mode(values):
    """Fill missing entries of one group's values with that group's mode.

    When several values tie for most common, the first one returned by
    ``mode()`` is used. If the group has no non-missing value at all,
    ``mode()`` is empty and there is nothing to fill with, so the group is
    returned unchanged instead of raising on the empty lookup.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


for housing_df in (training_data, testing_data):
    housing_df["Electrical"] = (
        housing_df.groupby("Neighborhood")["Electrical"].transform(_fill_with_group_mode)
    )

In [14]:
# Many variables such as housing classes, years, months, and ratings are
# stored as numbers even though they function as categories. Convert those
# columns to strings.
#
# The conversion is applied to the testing frame as well as the training
# frame, like every other cleaning step, so the two frames keep matching
# column types.
category_like_columns = [
    "MSSubClass",
    "YearBuilt",
    "YearRemodAdd",
    "GarageYrBlt",
    "YrSold",
    "MoSold",
    "OverallCond",
    "OverallQual",
]

for housing_df in (training_data, testing_data):
    housing_df[category_like_columns] = housing_df[category_like_columns].astype(str)

In [11]:
# Number of missing values left in each column of the training frame
training_data.isna().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinSF1       0
BsmtFinType2     0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual 