## Surprise Housing Price Prediction - Assignment Solution

### Data Understanding and Exploration


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
%matplotlib inline

In [None]:
import warnings
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this also hides deprecation/future warnings from the libraries
# used below — consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [None]:
# Reading the dataset from train.csv in the working directory.
# na_values="NAN" asks pandas to treat the literal string "NAN" as missing,
# in addition to the markers it recognises by default.
dfHousing = pd.read_csv("train.csv", na_values="NAN")

In [None]:
# Let's take a look at the first few rows (head() shows the first five by default)
dfHousing.head()

### EDA on Housing Data Set

In [None]:
# Column names, non-null counts and dtypes of the raw data.
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" line to the output.
dfHousing.info()

In [None]:
# Plot the distribution of the target variable.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases —
# confirm against the installed version before switching to histplot/displot.
sale_price = dfHousing['SalePrice']
sns.distplot(sale_price)

In [None]:
# Skewness and kurtosis of the target, printed one after the other.
sale_price = dfHousing['SalePrice']
print("Skewness:", sale_price.skew())
print("Kurtosis:", sale_price.kurt())

### Data Corrections

In [None]:
def assessMissingData(df=None):
    """Print the percentage of missing values for each column that has any.

    Columns without missing values are omitted; the remaining ones are
    printed in descending order of their missing percentage.

    Args:
        df: DataFrame to inspect. When omitted, the notebook-level
            ``dfHousing`` frame is used, so existing zero-argument calls
            behave as before.

    Returns:
        None. The report is printed rather than returned, so using the call
        as the last statement of a cell does not display it twice.
    """
    if df is None:
        df = dfHousing
    # The mean of the boolean null mask is the fraction of missing rows.
    missing_pct = df.isnull().mean() * 100
    print(missing_pct[missing_pct != 0].sort_values(ascending=False))

# Report the percentage of missing values per column for the raw data
assessMissingData()


#### Special treatment of NaN values:

Some columns contain NaN values that are not truly missing data: according to the data definition, NaN carries its own meaning in these columns. We therefore fill NaN with "None" wherever it has such a meaning.

In [None]:
# Columns in which NaN is meaningful according to the data definition, so it
# is recoded as the explicit category "None" instead of being treated as
# missing.
columns_with_na = [
    "Alley",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
]

for col in columns_with_na:
    print('replacing NA with None for column:', col)
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on dfHousing[col] is chained assignment: it can
    # operate on a temporary object and leave dfHousing unchanged (this is
    # always the case under pandas Copy-on-Write).
    dfHousing[col] = dfHousing[col].fillna("None")

# Spot-check ten random rows after the replacement
dfHousing.sample(10)

In [None]:
# NaN values in the categorical columns listed above have been replaced with
# "None"; re-run the report to see which columns still have missing values.

assessMissingData()

In [None]:
# LotFrontage is missing in almost 17% of the rows, so the whole column is
# removed instead of being imputed.
dfHousing.drop(columns=['LotFrontage'], inplace=True)

# Show the remaining column labels
dfHousing.columns

In [None]:
# Mark null values as 'None' for MasVnrType.
# The filled column is assigned back explicitly: fillna(inplace=True) on
# dfHousing['MasVnrType'] is chained assignment and may modify only a
# temporary object, leaving dfHousing untouched (always so under pandas
# Copy-on-Write).
dfHousing['MasVnrType'] = dfHousing['MasVnrType'].fillna('None')

In [None]:
# Impute the remaining numeric gaps with each column's median.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selected from dfHousing is chained assignment and may leave
# dfHousing unchanged (always so under pandas Copy-on-Write).
for numeric_col in ("GarageYrBlt", "MasVnrArea"):
    dfHousing[numeric_col] = dfHousing[numeric_col].fillna(dfHousing[numeric_col].median())

In [None]:
# Re-check which columns still contain missing values
assessMissingData()

In [None]:
# Show the rows in which Electrical is missing
dfHousing[dfHousing['Electrical'].isnull()]

In [None]:
# Drop the single row with Electrical as NaN.
# The drop is restricted to the Electrical column so that it removes exactly
# the row(s) described here; an unrestricted dropna() would silently discard
# every row that has a NaN in any column.
dfHousing.dropna(subset=['Electrical'], inplace=True)

#### Data Types

In [None]:
# Review column dtypes and non-null counts after the data corrections
dfHousing.info()

In [70]:
# Two-column lookup table: one row per dfHousing column, holding the column
# name ('name') and its dtype ('datatype').
dfColTypes = (
    pd.DataFrame(dfHousing.dtypes)
    .reset_index()
    .rename(columns={'index': 'name', 0: 'datatype'})
)


In [73]:
dfColTypes

# Names of the columns whose dtype is 'object'; these are treated as
# categorical variables below.
object_dtype_mask = dfColTypes['datatype'] == 'object'
categorical_types = dfColTypes.loc[object_dtype_mask, 'name']

categorical_types

2          MSZoning
4            Street
5             Alley
6          LotShape
7       LandContour
8         Utilities
9         LotConfig
10        LandSlope
11     Neighborhood
12       Condition1
13       Condition2
14         BldgType
15       HouseStyle
20        RoofStyle
21         RoofMatl
22      Exterior1st
23      Exterior2nd
24       MasVnrType
26        ExterQual
27        ExterCond
28       Foundation
29         BsmtQual
30         BsmtCond
31     BsmtExposure
32     BsmtFinType1
34     BsmtFinType2
38          Heating
39        HeatingQC
40       CentralAir
41       Electrical
52      KitchenQual
54       Functional
56      FireplaceQu
57       GarageType
59     GarageFinish
62       GarageQual
63       GarageCond
64       PavedDrive
71           PoolQC
72            Fence
73      MiscFeature
77         SaleType
78    SaleCondition
Name: name, dtype: object

In [None]:
# Convert every object-typed column to the pandas 'category' dtype in one
# multi-column assignment (each column is converted independently).
category_columns = list(categorical_types)
dfHousing[category_columns] = dfHousing[category_columns].astype('category')

# Confirm the new dtypes
dfHousing.info()