In [2]:
# Core Modules
import pandas as pd
import numpy as np

# Basic modules for data visualization
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [3]:
# Read the training CSV from its relative path into a pandas DataFrame
df_train = pd.read_csv(
    filepath_or_buffer='../../Datasets/house-prices-advanced-regression-techniques/train.csv'
)

### Null Values

In [4]:
# Number of missing values per column, largest count first
(
    df_train
    .isna()
    .sum()
    .sort_values(ascending=False)
)

PoolQC           1453
MiscFeature      1406
Alley            1369
Fence            1179
FireplaceQu       690
                 ... 
CentralAir          0
SaleCondition       0
Heating             0
TotalBsmtSF         0
Id                  0
Length: 81, dtype: int64

In [5]:
# Imputing Missing Values

# Work on a copy so the raw frame stays untouched; a plain assignment would
# only alias df_train and every fill below would mutate it in place.
df_train_processed = df_train.copy()

# Categorical columns whose missing entries are replaced by the literal
# string "None"
cat_cols_fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                      'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
                      'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'BsmtCond',
                      'MasVnrType']

df_train_processed[cat_cols_fill_none] = df_train_processed[cat_cols_fill_none].fillna("None")

# LotFrontage: fill each missing value with the median LotFrontage of the
# rows that share the same Neighborhood
df_train_processed['LotFrontage'] = (
    df_train_processed
    .groupby("Neighborhood")["LotFrontage"]
    .transform(lambda frontage: frontage.fillna(frontage.median()))
)

# Numerical garage columns and MasVnrArea: replace missing values with zero
zero_fill_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']
df_train_processed[zero_fill_cols] = df_train_processed[zero_fill_cols].fillna(0)

# Electrical: fill only the missing entries with the most frequent value.
# The mode must be computed first and passed to fillna(); calling .mode()[0]
# on the result of fillna() would assign that single value to every row.
electrical_mode = df_train_processed['Electrical'].mode()[0]
df_train_processed['Electrical'] = df_train_processed['Electrical'].fillna(electrical_mode)

# Utilities is considered unnecessary by this analysis, so the column is dropped
df_train_processed = df_train_processed.drop(columns=['Utilities'])

In [6]:
# Largest per-column count of missing values; 0 means no column has any left
df_train_processed.isna().sum().max()

0

### Outliers

In [7]:
# Restrict outlier detection to the non-object columns
num_attributes = df_train_processed.select_dtypes(exclude='object')

# Per-column 99.9th percentile, computed once before any row is removed.
# It is taken on the numeric subset: calling quantile() on the full frame
# raises a TypeError on pandas >= 2.0 because object columns are no longer
# skipped by default.
high_quant = num_attributes.quantile(.999)

# Drop every row that exceeds the threshold in at least one numeric column.
# The thresholds are fixed up front, so a single mask removes the same rows
# as dropping column by column.
# NOTE(review): 'Id' is an int64 column and is therefore filtered as well,
# which removes the rows with the largest Ids — confirm this is intended.
is_outlier = num_attributes.gt(high_quant, axis='columns').any(axis='columns')
df_train_processed = df_train_processed.loc[~is_outlier]

df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1457
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1421 non-null   int64  
 1   MSSubClass     1421 non-null   int64  
 2   MSZoning       1421 non-null   object 
 3   LotFrontage    1421 non-null   float64
 4   LotArea        1421 non-null   int64  
 5   Street         1421 non-null   object 
 6   Alley          1421 non-null   object 
 7   LotShape       1421 non-null   object 
 8   LandContour    1421 non-null   object 
 9   LotConfig      1421 non-null   object 
 10  LandSlope      1421 non-null   object 
 11  Neighborhood   1421 non-null   object 
 12  Condition1     1421 non-null   object 
 13  Condition2     1421 non-null   object 
 14  BldgType       1421 non-null   object 
 15  HouseStyle     1421 non-null   object 
 16  OverallQual    1421 non-null   int64  
 17  OverallCond    1421 non-null   int64  
 18  YearBuil

### Correlated Attributes

In [8]:
# Columns removed from the frame in this step
attributes_drop = [
    'MiscVal', 'MoSold', 'YrSold',
    'BsmtFinSF2', 'BsmtHalfBath', 'MSSubClass',
    'GarageArea', 'GarageYrBlt', '3SsnPorch',
]

# drop() returns a new frame without the listed columns; rebind the name to it
df_train_processed = df_train_processed.drop(columns=attributes_drop)

In [9]:
# Print the frame summary (row count, columns, non-null counts, dtypes)
# to inspect the frame after the preceding column drop
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1457
Data columns (total 71 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1421 non-null   int64  
 1   MSZoning       1421 non-null   object 
 2   LotFrontage    1421 non-null   float64
 3   LotArea        1421 non-null   int64  
 4   Street         1421 non-null   object 
 5   Alley          1421 non-null   object 
 6   LotShape       1421 non-null   object 
 7   LandContour    1421 non-null   object 
 8   LotConfig      1421 non-null   object 
 9   LandSlope      1421 non-null   object 
 10  Neighborhood   1421 non-null   object 
 11  Condition1     1421 non-null   object 
 12  Condition2     1421 non-null   object 
 13  BldgType       1421 non-null   object 
 14  HouseStyle     1421 non-null   object 
 15  OverallQual    1421 non-null   int64  
 16  OverallCond    1421 non-null   int64  
 17  YearBuilt      1421 non-null   int64  
 18  YearRemo

In [10]:
# Write the processed training frame to CSV, leaving out the index column
df_train_processed.to_csv(path_or_buf='clean_train.csv', index=False)