In [240]:
import pandas as pd

In [241]:
trainData=pd.read_csv('train.csv')

In [242]:
trainData.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [243]:
trainData.shape

(1460, 81)

# Taking care of Missing Data

In [244]:
#print columns with missing value (nan)
trainData.columns[trainData.isnull().any()]

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

In [245]:
#Create copy of the Train data
trainData_copy1=trainData.copy()

In [246]:
# Create Function---drop missing rows with missing NA
def drop_missing_row (dataFrame, List_of_ColumnName):
    dataFrame=dataFrame.dropna(subset=List_of_ColumnName)
    return dataFrame

In [247]:
# Create Function---Change missing values to 'NA'
def fill_NA_missing_value (dataFrame, list_of_ColumnName):
    dataFrame[list_of_ColumnName]=dataFrame[list_of_ColumnName].fillna(value='None')
    return dataFrame

In [248]:
# Create Function---Change missing values to '0'
def fill_0_missing_value (dataFrame, list_of_ColumnName):
    dataFrame[list_of_ColumnName]=dataFrame[list_of_ColumnName].fillna(value=0)
    return dataFrame

In [249]:
#drop rows with missing value based on column
trainData_copy1=drop_missing_row(trainData_copy1, ['Electrical'])


In [250]:
#Fill missing values with NA
trainData_copy1=fill_NA_missing_value(trainData_copy1, ['LotFrontage', 'Alley', 'MasVnrType', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'])


In [251]:
#Fill missing values with 0
trainData_copy1=fill_0_missing_value(trainData_copy1, ['MasVnrArea'])

In [252]:
trainData_copy1.shape

(1459, 81)

In [253]:
#Check if there are any missing values
trainData_copy1.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
MasVnrArea       0
ExterQual        0
ExterCond        0
Foundation       0
                ..
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
FireplaceQu      0
GarageType       0
GarageYrBlt      0
GarageFinish     0
GarageCars       0
GarageArea       0
GarageQual       0
GarageCond       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
PoolQC      

# Data Cleaning

### Obtain List of Categorical/Oridnal Data Column Names from Excel Spread Sheet

In [254]:
Variable_df=pd.read_excel('Variable_Analysis.xlsx')
Variable_df.head()

Unnamed: 0,Column Number,Name,Individual_dataType,Team_dataType,Missing Value?,Replace Missing Value with?,Status-Huimin,Notes-Huimin
0,1,ID,,,,,,
1,2,MSSubClass,categorical,,,,,
2,3,MSZoning,categorical,,,,,
3,4,LotFrontage,numerical,,Yes,,not sure,na is not missing. It means not connected
4,5,LotArea,numerical,,,,,


In [255]:
List_colName_Categorical=Variable_df[Variable_df['Individual_dataType']=='categorical']['Name'].tolist()
len(List_colName_Categorical)

36

In [256]:
List_colName_Ordinal=Variable_df[Variable_df['Individual_dataType']=='ordinal']['Name'].tolist()
len(List_colName_Ordinal)

9

In [257]:
List_colName_Numerical=Variable_df[Variable_df['Individual_dataType']=='numerical']['Name'].tolist()
len(List_colName_Ordinal)

9

### OneHotEncoder for Categorical Data

In [258]:
def OneHotEncode_categorical_data (dataFrame, List_colName_Categorical):
    data=dataFrame.copy()
    for column_name in List_colName_Categorical:
        dummied_data=pd.get_dummies(data[column_name], prefix=column_name,prefix_sep='__')
        dummied_data=dummied_data.drop(dummied_data.columns[0], axis=1)#drop 1st column
        data=pd.concat([data.drop(column_name,axis=1), dummied_data ], axis=1)
    return data
        

In [259]:
trainData_copy1=OneHotEncode_categorical_data(trainData_copy1, List_colName_Categorical)

In [260]:
trainData_copy1.columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond',
       ...
       'SaleType__ConLI', 'SaleType__ConLw', 'SaleType__New', 'SaleType__Oth',
       'SaleType__WD', 'SaleCondition__AdjLand', 'SaleCondition__Alloca',
       'SaleCondition__Family', 'SaleCondition__Normal',
       'SaleCondition__Partial'],
      dtype='object', length=251)

In [261]:
train_y=trainData_copy1.SalePrice
train_x=trainData_copy1.drop(['SalePrice'],axis=1)

### Ordinal Encoder for Ordinal Data

In [239]:
def process_ordinal_columns (dataFrame, List_colName_Ordinal):
    data=dataFrame.copy()
    for column_name in List_colName_Ordinal:
        if data.column_name.dtype in ['int32', 'int64']:
            continue
        elif:
            
            

SyntaxError: unexpected EOF while parsing (<ipython-input-239-8d806b922952>, line 2)

In [262]:
trainData_copy1.OverallQual.unique()


array([ 7,  6,  8,  5,  9,  4, 10,  3,  1,  2], dtype=int64)

In [268]:
trainData_copy1.OverallQual.dtype in ['int32', 'int64']

True