# Data Preprocessing V3 


In [1]:
import pandas as pd
import numpy as np
import helper

# PART A: Functions used in preprocessing

### A1. Overall helper functions


In [2]:
def identify_feats(df):
    """Split the columns of ``df`` into categorical and numerical feature names.

    Columns selected by the ``'object'``/``'bool'`` dtypes are reported as
    categorical; columns selected by the ``'int'``/``'float'`` dtypes are
    reported as numerical. The size of each group and the shape of ``df`` are
    printed as a side effect.

    Returns
    -------
    tuple
        ``(cat_feats, num_feats)`` -- two lists of column names.
    """
    categorical = df.select_dtypes(include=['object', 'bool']).columns.to_list()
    numerical = df.select_dtypes(include=['int', 'float']).columns.to_list()

    # Report the width of each selection, then the overall frame shape.
    for label, names in (("categorical", categorical), ("numerical", numerical)):
        print(f"{label}: {df[names].shape[1]}")
    print(f"df dims:{df.shape}")

    return categorical, numerical

In [3]:
def cols_with_na(df,threshold=0):
    """Print every column whose fraction of missing values exceeds ``threshold``.

    For each qualifying column one line is printed containing the column name,
    the number of missing values and the percentage of rows that are missing
    (rounded to two decimals). A final line reports how many columns qualified.
    Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0
        Fraction of rows (not a percentage); the comparison is strict (``>``).
    """
    n_rows = len(df)
    flagged = 0
    for name in df.columns.to_list():
        # Count the missing values once and reuse it for the test and the report.
        n_missing = df[name].isna().sum()
        fraction = n_missing / n_rows
        if fraction > threshold:
            flagged += 1
            print("   ".join(map(str, (name, n_missing, round(fraction * 100, 2)))))
    print(f"Total columns with NA values above threshold {threshold*100}%: {flagged}")

### A2. Functions related to numeric features
##### (and a few categorical features, handled in `impute_missing_vals`)

In [4]:
#Special Case Imputations 

def impute_missing_vals(df):
    """Fill missing values that need column-specific rules.

    ``df`` is modified in place and the same object is returned.

    Rules applied, in order:

    1. ``LotFrontage``: missing values become
       ``coefficient[BldgType] * sqrt(LotArea)``.
    2. ``MasVnrArea`` / ``MasVnrType``: rows where both are missing are assumed
       to have no veneer (area ``0``, type ``"None"``).
    3. ``BsmtFullBath`` / ``BsmtHalfBath``: missing values become ``0`` when
       ``TotalBsmtSF`` is ``0``.
    4. ``GarageYrBlt``: missing values become ``YearBuilt`` when the property
       has a known, non-zero ``GarageArea``; all remaining missing values
       become ``0``.
    5. ``Electrical``: missing values become ``'SBrkr'``.
    6. ``GarageType`` is set to ``'None'`` for two specific ``PID`` values.

    Raises
    ------
    KeyError
        If a required column is absent, or if a row with missing
        ``LotFrontage`` has a ``BldgType`` with no coefficient.
    """
    # 1. LotFrontage imputed as (coefficient from dict) * sqrt(LotArea).
    LotFrontage_dict = {'1Fam':0.7139, 'TwnhsE':0.5849, 'Twnhs':0.5227, 'Duplex':0.7725, '2fmCon':0.6922}
    frontage_missing = df['LotFrontage'].isna()
    # Guard the empty case: there is nothing to impute when no value is missing.
    if frontage_missing.any():
        bldg_type = df.loc[frontage_missing, 'BldgType']
        coefficient = bldg_type.map(LotFrontage_dict)
        unmapped = coefficient.isna().to_numpy()
        if unmapped.any():
            # Fail loudly (as a dict lookup would) instead of leaving NaN behind.
            raise KeyError(
                "No LotFrontage coefficient for BldgType value(s): "
                + str(bldg_type[unmapped].unique().tolist())
            )
        df.loc[frontage_missing, 'LotFrontage'] = (
            coefficient * np.sqrt(df.loc[frontage_missing, 'LotArea'])
        )

    # 2. Rows with both MasVnrArea and MasVnrType missing are assumed to have
    #    no veneer, so set MasVnrType to "None" and MasVnrArea to 0.
    no_veneer = df['MasVnrArea'].isna() & df['MasVnrType'].isna()
    df.loc[no_veneer, 'MasVnrArea'] = 0
    df.loc[no_veneer, 'MasVnrType'] = "None"  # motivated by the null value in test, is this data leakage?

    # 3 & 4. Missing basement bath counts on properties without a basement -> 0.
    no_basement = df['TotalBsmtSF'] == 0
    for bath_col in ('BsmtFullBath', 'BsmtHalfBath'):
        df.loc[df[bath_col].isna() & no_basement, bath_col] = 0

    # 5. GarageYrBlt: replace a missing year with YearBuilt for properties with
    #    a non-zero garage area. NaN != 0.0 evaluates to True, so a missing
    #    GarageArea is excluded explicitly; otherwise it would count as a garage.
    has_garage_area = df['GarageArea'].notna() & (df['GarageArea'] != 0.0)
    year_from_build = df['GarageYrBlt'].isna() & has_garage_area
    df.loc[year_from_build, 'GarageYrBlt'] = df.loc[year_from_build, 'YearBuilt']
    # The rest do not have garages, so fill with 0.
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(value=0)

    # 6. Impute missing 'Electrical' with the hard-coded type 'SBrkr'
    #    (noted by the author as the mode of the training data).
    #    -> motivated by the null value in test, is this data leakage?
    df.loc[df['Electrical'].isna(), 'Electrical'] = 'SBrkr'

    # 7. JH: specific additions -- two properties get GarageType 'None'.
    df.loc[df['PID'].isin([903426160, 910201180]), 'GarageType'] = 'None'

    return df

In [5]:
def convert_num_to_categorical(df,num_to_nominal_cat_feats=['GarageCars','MSSubClass','KitchenAbvGr','BedroomAbvGr','MoSold','YrSold']):
    """Cast the listed columns to ``str`` so they are treated as nominal categories.

    These features are stored as numbers, but ranking their values offers no
    clear advantage, so they are converted to strings. ``df`` is modified in
    place and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``num_to_nominal_cat_feats``.
    num_to_nominal_cat_feats : list of str
        Columns to convert.
    """
    for column_name in num_to_nominal_cat_feats:
        as_text = df[column_name].astype(str)
        df[column_name] = as_text

    return df

### A3. Functions related to categorical features

In [6]:
#According to data dictionary, NA translates to 'None' (No access, No basement etc.) for the following categories:
def replace_na_with_none(df):
    """Replace missing values with the string ``'None'`` in a fixed set of columns.

    For the columns listed below a missing value means the feature is absent
    (no alley access, no basement, ...), so it is recorded as its own
    ``'None'`` category. ``df`` is modified in place and returned; every
    listed column must be present.
    """
    absent_feature_cols = [
        'Alley',
        'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'BsmtExposure',
        'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PoolQC', 'Fence', 'MiscFeature',
    ]
    # Fill the whole group at once; other columns are left untouched.
    df[absent_feature_cols] = df[absent_feature_cols].fillna(value='None')
    return df

In [7]:
def map_ordinal_cat(df):
    """Encode the ordinal quality/condition columns as integer ranks.

    Each column listed in the mapping below has its string levels replaced by
    the corresponding rank (``'None'`` is the lowest rank where present).
    Values that are not keys of a column's mapping, including missing values,
    are left unchanged. Mapped columns that are absent from ``df`` are skipped.
    ``df`` is modified in place and returned.

    The encoding is done column by column with an explicit dtype inference
    step, so the result does not depend on ``DataFrame.replace`` implicitly
    downcasting object columns to a numeric dtype.
    """
    # Shared Po < Fa < TA < Gd < Ex scale, with 'None' (feature absent) lowest.
    common_ranks_dict = {'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
    replace_map = {
        'ExterQual': common_ranks_dict,
        'ExterCond': common_ranks_dict,
        'BsmtQual': common_ranks_dict,
        'BsmtCond': common_ranks_dict,
        'BsmtExposure': {'None':0,'No':1,'Mn':2,'Av':3,'Gd':4},
        'HeatingQC': common_ranks_dict,
        'KitchenQual': common_ranks_dict,
        'FireplaceQu': common_ranks_dict,
        'GarageFinish': {'None':0,'Unf':1,'RFn':2,'Fin':3},
        'GarageQual': common_ranks_dict,
        'GarageCond': common_ranks_dict,
        'PavedDrive': {'N':0,'P':1,'Y':2},
        'PoolQC': {'None':0,'Fa':1,'TA':2,'Gd':3,'Ex':4},
        'Alley': {'None':0,'Grvl':1,'Pave':2}
    }
    for column, ranks in replace_map.items():
        if column not in df.columns:
            continue
        # Fall back to the original value so unmapped entries pass through.
        encoded = df[column].map(lambda value, ranks=ranks: ranks.get(value, value))
        # Make the numeric dtype explicit when every value was mapped.
        df[column] = encoded.infer_objects()
    return df

# PART B: Overall Preprocessing Pipeline


In [8]:
# Load the raw Ames data and drop the leftover index column from the CSV.
housing = pd.read_csv('Ames_Housing_Price_Data.csv')
housing = housing.drop(columns='Unnamed: 0')
# Keep only sales whose SaleCondition is 'Normal' or 'Partial', then renumber the rows.
housing = housing[housing['SaleCondition'].isin(['Normal', 'Partial'])].reset_index(drop=True)
# helper.stratified_split is defined outside this notebook.
# NOTE(review): confirm it uses a fixed seed so the train/test split is reproducible.
train_raw, test_raw = helper.stratified_split(housing,'Neighborhood')

In [9]:
#Function to print out information on columns at each step 
def check(df):
    """Print a diagnostic summary of ``df`` for use between pipeline steps.

    Output, in order: the counts and frame shape printed by ``identify_feats``,
    the list of categorical column names, the list of numeric column names,
    and the missing-value report printed by ``cols_with_na`` (default
    threshold). Nothing is returned.
    """
    # Number of categorical and numeric features, and dataframe dimensions.
    categorical_cols, numeric_cols = identify_feats(df)
    # A single print with a newline separator emits the same text as the
    # individual print calls would.
    print(
        '\n',
        'Categorical: \n',
        categorical_cols,
        '\n Numeric: \n',
        numeric_cols,
        '\n',
        sep='\n',
    )
    # Column name, missing-value count and missing-value percentage.
    cols_with_na(df)

### Training Data 

In [10]:
# Step 0. Work on a copy so that train_raw is not modified by the later steps.
train_clean = train_raw.copy()
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

In [11]:
# Step 1. Special-case imputations (mostly numeric columns).
# impute_missing_vals looks rows up by 'PID', so this must run before 'PID' is dropped.
train_clean = impute_missing_vals(train_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

In [12]:
# Step 2. Convert some numeric variables to type 'str' so they are handled as
# nominal categorical variables (intended to be one-hot encoded later).
num_to_cat_list =['GarageCars','MSSubClass','KitchenAbvGr','BedroomAbvGr','MoSold','YrSold']
train_clean = convert_num_to_categorical(train_clean,num_to_cat_list) 
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

In [13]:
# Step 3. Replace missing values with the string 'None' in the categorical
# columns where, per the data dictionary, NA means the feature is absent.
train_clean = replace_na_with_none(train_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

In [14]:
# Step 4. Drop the 'PID' identifier column; it is only needed by the
# PID-specific fixes applied during imputation.
train_clean = train_clean.drop(columns=['PID'])
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

In [15]:
# Step 5. Encode ordinal categoricals as numbers.
# Several mappings contain a 'None' level, which only exists after Step 3 has
# replaced the missing values, so this step must follow Step 3.
train_clean = map_ordinal_cat(train_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(train_clean)

### Test Data 


In [16]:
# Step 0. Work on a copy so that test_raw is not modified by the later steps.
test_clean = test_raw.copy()
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

In [17]:
# Step 1. Special-case imputations (mostly numeric columns).
# impute_missing_vals looks rows up by 'PID', so this must run before 'PID' is dropped.
test_clean = impute_missing_vals(test_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

In [18]:
# Step 2. Convert some numeric variables to type 'str' so they are handled as
# nominal categorical variables (intended to be one-hot encoded later).
# The same column list as for the training data is used.
num_to_cat_list =['GarageCars','MSSubClass','KitchenAbvGr','BedroomAbvGr','MoSold','YrSold']
test_clean = convert_num_to_categorical(test_clean,num_to_cat_list) 
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

In [19]:
# Step 3. Replace missing values with the string 'None' in the categorical
# columns where, per the data dictionary, NA means the feature is absent.
test_clean = replace_na_with_none(test_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

In [20]:
# Step 4. Drop the 'PID' identifier column; it is only needed by the
# PID-specific fixes applied during imputation.
test_clean = test_clean.drop(columns=['PID'])
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

In [21]:
# Step 5. Encode ordinal categoricals as numbers.
# Several mappings contain a 'None' level, which only exists after Step 3 has
# replaced the missing values, so this step must follow Step 3.
test_clean = map_ordinal_cat(test_clean)
# Optional diagnostic: uncomment to print dtype counts and the missing-value report.
# check(test_clean)

# PART C: Export CSVs


In [22]:
# Write both cleaned frames to the working directory.
# The row index is written as an unnamed first column, so a reader needs
# index_col=0 (or must drop 'Unnamed: 0') when loading these files.
train_clean.to_csv('train_clean.csv', index=True)
test_clean.to_csv('test_clean.csv', index=True)

In [23]:
# True if any cell of the cleaned training frame is still missing.
train_clean.isnull().any().any()

False

In [24]:
# True if any cell of the cleaned test frame is still missing.
test_clean.isnull().any().any()

False