In [1]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing datasets
train_df = pd.read_csv("dataset/train.csv")
train_df.drop(['Id'], axis=1, inplace=True)
print(train_df.shape)
train_df.head()

(1460, 80)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Data Preprocessing

In [3]:
# finding missing values..
train_df.isnull().sum().sort_values(ascending=False).head(10)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageCond        81
GarageFinish      81
GarageQual        81
dtype: int64

In [4]:
# Dropping top 4 columns as more than 50% of the data are missing...
train_df.drop(labels=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)

In [5]:
# getting information of each columns to handle missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 no

In [6]:
# separating categorical and quantitative dtypes..
# and fixxing missing categorical features with mode value and quantitative features with mean value
quantitative_features = []
categorical_features = []

for col in train_df.columns:
    if train_df[col].dtype == 'object':
        categorical_features.append(col)
        train_df[col].fillna(value=train_df[col].mode()[0], inplace=True)
    else:
        quantitative_features.append(col)
        train_df[col].fillna(value=train_df[col].mean(), inplace=True)
        
# print('C Feature: \n', categorical_features)
# print('Q Feature: \n', quantitative_features)

In [7]:
train_df.shape

(1460, 76)

## Handeling categorical features..

In [8]:
train_copy = train_df.copy()

In [15]:
# imorting precessed test dataset inorder to implement OneHotEncoder

test_df=pd.read_csv("2_formulated_test.csv")
test_df.drop('Id', inplace=True, axis=1)
test_df.shape

(1459, 75)

In [16]:
# concating train and test dataset as there may be some categorical features in test dataset that are missing in train dataset
temp_df = pd.concat([train_df, test_df], axis=0, join='inner')


In [17]:
temp_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


In [12]:
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [13]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

for feature in categorical_features:
    print(feature)
    
    # creating dummy of each categorical feature
    dummy_df = pd.DataFrame(onehotencoder.fit_transform(temp_df[[feature]]).toarray(), columns=onehotencoder.categories_)

    # droping original column 
    temp_df.drop(feature, axis=1, inplace=True)
    
    # concatinating dummy with original dataframe
    temp_df = pd.concat([temp_df, dummy_df], axis=1)

MSZoning


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [18]:
feature = 'Street'

print(feature)
    
# creating dummy of each categorical feature
dummy_df = pd.DataFrame(onehotencoder.fit_transform(temp_df[[feature]]).toarray(), columns=onehotencoder.categories_)
dummy_df

Street


Unnamed: 0,Grvl,Pave
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
2914,0.0,1.0
2915,0.0,1.0
2916,0.0,1.0
2917,0.0,1.0


In [19]:
# droping original column 
temp_df.drop(feature, axis=1, inplace=True)
temp_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,0,0,0,0,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,24,0,0,0,0,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,0,0,0,0,0,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,32,0,0,0,0,700,7,2006,WD,Normal


In [22]:
# concatinating dummy with original dataframe
#temp_df = pd.concat([temp_df, dummy_df], axis=1, )
dummy_df.shape

(2919, 2)

In [None]:
# feature = 'SalePrice'

# dummy_df = pd.DataFrame(onehotencoder.fit_transform(temp_df[[feature]]).toarray(), columns=onehotencoder.categories_)
