In [1]:
import pandas as pd
import numpy as np

# Raw training data, indexed by the 'Id' column; pandas' default NaN parsing applies.
train = pd.read_csv('./data/train.csv', index_col='Id')
# Raw test data, indexed the same way. keep_default_na=False (with no na_values given)
# means no strings are converted to NaN on read, so markers such as 'NA' stay as text.
test = pd.read_csv(
    './data/test.csv',
    index_col='Id',
    keep_default_na=False,
)

In [2]:
# Work on independent copies so the raw `train` / `test` frames stay untouched:
# two copies of the training data (one per LotFrontage imputation variant) and one of the test data.
df_train1, df_train2 = train.copy(), train.copy()
df_test = test.copy()

# Training Dataset Preprocessing

In [3]:
# Training preprocessing, variant 1: LotFrontage is imputed from LotArea.
# Columns are filled via explicit assignment (df[col] = df[col].fillna(...)) rather than
# chained `inplace=True`, which newer pandas versions no longer propagate to the frame.

# Hyelee
## Alley : convert NaN => NA
df_train1['Alley'] = df_train1['Alley'].fillna('NA')
## LotFrontage : fill NaN with a fixed fraction of LotArea
# <CASE1> Using LotArea
# NOTE(review): presumably an empirically estimated LotFrontage/LotArea ratio - confirm its origin.
lot_frontage_per_area = 0.007206024910841549
df_train1['LotFrontage'] = df_train1['LotFrontage'].fillna(
    df_train1['LotArea'] * lot_frontage_per_area
)

# Alyssa
# fill in missing masonry veneer type with 'None' and missing veneer area with 0
df_train1['MasVnrType'] = df_train1['MasVnrType'].fillna('None')
df_train1['MasVnrArea'] = df_train1['MasVnrArea'].fillna(0)
# Id 949: set BsmtExposure to 'No' (per the original author, the mode of 'BsmtExposure')
df_train1.loc[949, 'BsmtExposure'] = 'No'

# Kisoo
# without a fireplace there is no fireplace quality, so missing FireplaceQu becomes 'NA'
df_train1['FireplaceQu'] = df_train1['FireplaceQu'].fillna('NA')
# missing Electrical is filled with 'SBrkr' (per the original author, the most common type)
df_train1['Electrical'] = df_train1['Electrical'].fillna('SBrkr')

# Wonchan
# Feature Engineering for Time Series Columns
# Flag rows whose GarageYrBlt is missing BEFORE imputing it (1 = imputed, 0 = original value).
df_train1['GarageYrBltImputed'] = np.where(df_train1['GarageYrBlt'].isnull(), 1, 0)

# impute the missing years with the year built plus the mean difference between
# GarageYrBlt and YearBuilt (the mean skips rows where GarageYrBlt is missing)
aveDiff = np.mean(df_train1['GarageYrBlt'] - df_train1['YearBuilt'])
# The fill value must come from YearBuilt: GarageYrBlt + aveDiff is NaN exactly where a fill is needed.
df_train1['GarageYrBlt'] = df_train1['GarageYrBlt'].fillna(df_train1['YearBuilt'] + aveDiff)

# fill every remaining NaN with the string 'NA'
df_train1 = df_train1.fillna('NA')

df_train1.to_csv('./data/processed_training1.csv')

In [4]:
# Training preprocessing, variant 2: LotFrontage is imputed from the Neighborhood median.
# Columns are filled via explicit assignment (df[col] = df[col].fillna(...)) rather than
# chained `inplace=True`, which newer pandas versions no longer propagate to the frame.

# Hyelee
## Alley : convert NaN => NA
df_train2['Alley'] = df_train2['Alley'].fillna('NA')
## LotFrontage : fill NaN with the median LotFrontage of the same Neighborhood
# <CASE2> Using Neighborhood
# Per-neighborhood median lookup table; median() ignores NaN values, so only
# known LotFrontage values contribute.
df_md_lotfrontage = (
    df_train2.groupby('Neighborhood')['LotFrontage']
    .median()
    .rename('mdLotFrontage')
    .to_frame()
)
# Look the median up per row instead of joining frames on a non-unique index:
# row order and column order stay as loaded, and no row can be dropped by the join.
df_train2['LotFrontage'] = df_train2['LotFrontage'].fillna(
    df_train2['Neighborhood'].map(df_md_lotfrontage['mdLotFrontage'])
)

# Alyssa
# fill in missing masonry veneer type with 'None' and missing veneer area with 0
df_train2['MasVnrType'] = df_train2['MasVnrType'].fillna('None')
df_train2['MasVnrArea'] = df_train2['MasVnrArea'].fillna(0)
# Id 949: set BsmtExposure to 'No' (per the original author, the mode of 'BsmtExposure')
df_train2.loc[949, 'BsmtExposure'] = 'No'

# Kisoo
# without a fireplace there is no fireplace quality, so missing FireplaceQu becomes 'NA'
df_train2['FireplaceQu'] = df_train2['FireplaceQu'].fillna('NA')
# missing Electrical is filled with 'SBrkr' (per the original author, the most common type)
df_train2['Electrical'] = df_train2['Electrical'].fillna('SBrkr')

# Wonchan
# Feature Engineering for Time Series Columns
# Flag rows whose GarageYrBlt is missing BEFORE imputing it (1 = imputed, 0 = original value).
df_train2['GarageYrBltImputed'] = np.where(df_train2['GarageYrBlt'].isnull(), 1, 0)

# impute the missing years with the year built plus the mean difference between
# GarageYrBlt and YearBuilt (the mean skips rows where GarageYrBlt is missing)
aveDiff = np.mean(df_train2['GarageYrBlt'] - df_train2['YearBuilt'])
# The fill value must come from YearBuilt: GarageYrBlt + aveDiff is NaN exactly where a fill is needed.
df_train2['GarageYrBlt'] = df_train2['GarageYrBlt'].fillna(df_train2['YearBuilt'] + aveDiff)

# fill every remaining NaN with the string 'NA'
df_train2 = df_train2.fillna('NA')

df_train2.to_csv('./data/processed_training2.csv')

# Test Dataset: Add Feature

In [5]:
# Feature Engineering for Time Series Columns
# The test frame is read with keep_default_na=False, so GarageYrBlt never contains NaN and a
# plain isnull() check would flag nothing. Coerce to numeric first: anything that is not a
# number (e.g. 'NA' or an empty string) becomes NaN and is flagged as 1.
# The GarageYrBlt column itself is left unchanged.
garage_year_numeric = pd.to_numeric(df_test['GarageYrBlt'], errors='coerce')
df_test['GarageYrBltImputed'] = np.where(garage_year_numeric.isnull(), 1, 0)
df_test.to_csv('./data/processed_test.csv')