## About this notebook

This notebook will process data features so that they will be "analysis-ready."  Specifically, for
1. Continuous features:
    - impute missing values when needed
    - delete a feature if it has too much missing data
    - handle future missing data
    
2. Categorical features:
    - "collapse" subcategories of features if there are too many subcategories
    - "collapse" subcategories of features if subcategories have very few values (<5%)
    - create a separate "missing" subcategory if a significant share of the data is missing
    - create dummy variables

# A. import packages

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

%matplotlib inline

In [60]:
# Read the training data into the working DataFrame that every cell below
# modifies. NOTE(review): the path has no file extension -- confirm that
# './data/train' is the intended CSV file.
ames = pd.read_csv('./data/train')
# m is the number of rows in the data as loaded.
m = ames.shape[0]

# B. process features (41-60)

### 41. CentralAir [categorical]

In [3]:
def centralair(cols):
    """
    Collapse 'CentralAir' into a two-level category.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'CentralAir': Central air conditioning, with categories
           N    No
           Y    Yes

    Returns:
    'CentAir' -- if Y
    'NoCentAir' -- if N, any other value, or missing
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    central_air = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # The null check must come first so missing values never reach '=='.
    if pd.isnull(central_air):
        return 'NoCentAir'
    return 'CentAir' if central_air == 'Y' else 'NoCentAir'

# Overwrite 'CentralAir' with the label centralair() returns for each row
# (axis=1 hands the function one row of the two selected columns at a time).
# Re-running this cell feeds the recoded labels back in, so every row would
# become 'NoCentAir'.
ames['CentralAir'] = ames[['Id','CentralAir']].apply(centralair,axis=1)

### 42. Electrical [categorical]

In [4]:
def electric(cols):
    """
    Collapse 'Electrical' into standard circuit breakers vs. everything else.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'Electrical': Electrical system, with categories
           SBrkr    Standard Circuit Breakers & Romex
           FuseA    Fuse Box over 60 AMP and all Romex wiring (Average)
           FuseF    60 AMP Fuse Box and mostly Romex wiring (Fair)
           FuseP    60 AMP Fuse Box and mostly knob & tube wiring (poor)
           Mix      Mixed

    Returns:
    'StdCBrkr' -- if 'SBrkr'
    'OtherCBrkr' -- if FuseA, FuseF, FuseP, Mix, any other value, or missing
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    electrical = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Missing values are grouped with the non-standard systems.
    if pd.isnull(electrical):
        return 'OtherCBrkr'
    if electrical == 'SBrkr':
        return 'StdCBrkr'
    return 'OtherCBrkr'
        
# Overwrite 'Electrical' with the two-level label electric() returns per row.
ames['Electrical'] = ames[['Id','Electrical']].apply(electric,axis=1)

### 43. 1stFlrSF [continuous]

In [5]:
# Remove the '1stFlrSF' column from ames in place (raises KeyError if the
# column is already gone, e.g. when the cell is re-run).
# NOTE(review): no rationale is recorded -- presumably covered by GrLivArea; confirm.
ames.drop(['1stFlrSF'], axis = 1, inplace = True)

### 44. 2ndFlrSF [continuous]

In [6]:
# Remove the '2ndFlrSF' column from ames in place (raises KeyError if the
# column is already gone, e.g. when the cell is re-run).
# NOTE(review): no rationale is recorded -- presumably covered by GrLivArea; confirm.
ames.drop(['2ndFlrSF'], axis = 1, inplace = True)

### 45. LowQualFinSF [continuous]

In [7]:
# Remove the 'LowQualFinSF' column from ames in place (raises KeyError if the
# column is already gone, e.g. when the cell is re-run).
# NOTE(review): no rationale is recorded for dropping this feature; confirm.
ames.drop(['LowQualFinSF'], axis = 1, inplace = True)

### 46. GrLivArea [continuous]

In [8]:
def grlivarea(cols):
    """
    Impute missing or invalid 'GrLivArea' values with the training-set median.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'GrLivArea': Above grade (ground) living area square feet

    Returns:
    x -- if x is positive
    1464 -- if x is missing or not positive (a non-positive area is assumed to
            be missing; 1464 is the median in the training set)
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    area = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Null is tested first so None/NaN never reach the numeric comparison.
    # Zero and negative areas are treated as missing, as the contract states.
    if pd.isnull(area) or area <= 0:
        return 1464
    return area
        
# Overwrite 'GrLivArea' with the value grlivarea() returns per row, which
# replaces missing entries with the fixed fill value defined in that function.
ames['GrLivArea'] = ames[['Id','GrLivArea']].apply(grlivarea,axis=1)

### 47-50. BsmtFullBath and FullBath

In [21]:
def totalbaths(cols):
    """
    Bucket the total number of full baths (basement + above grade).

    Arguments:
    cols -- one row holding three values in this order (the pandas Series
            passed by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'BsmtFullBath': number of basement full baths
        position 2 -- 'FullBath': number of above-grade full baths

    A missing count is treated as 0, so one missing component does not
    discard the baths recorded in the other.

    Returns:
    '<=1FullBath' -- if <=1 total full baths (including both counts missing)
    '2FullBaths' -- if 2 total full baths
    '>=3FullBaths' -- if 3 or more total full baths
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    if isinstance(cols, pd.Series):
        bsmt_full, full = cols.iloc[1], cols.iloc[2]
    else:
        bsmt_full, full = cols[1], cols[2]

    # Substitute 0 for each missing count *before* summing; adding NaN would
    # make the whole total NaN, and adding None would raise TypeError.
    bsmt_full = 0 if pd.isnull(bsmt_full) else bsmt_full
    full = 0 if pd.isnull(full) else full
    total = bsmt_full + full

    if total <= 1:
        return '<=1FullBath'
    if total == 2:
        return '2FullBaths'
    return '>=3FullBaths'
        
# Create a new 'FullBaths' column from the per-row label totalbaths() returns;
# this statement leaves the 'BsmtFullBath' and 'FullBath' columns in place.
ames['FullBaths'] = ames[['Id','BsmtFullBath','FullBath']].apply(totalbaths,axis=1)

### 47-50. BsmtHalfBath and HalfBath

In [10]:
def totalhalfbaths(cols):
    """
    Flag whether a house has any half baths (basement + above grade).

    Arguments:
    cols -- one row holding three values in this order (the pandas Series
            passed by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'BsmtHalfBath': number of basement half baths
        position 2 -- 'HalfBath': number of above-grade half baths

    A missing count is treated as 0, so one missing component does not
    discard the half baths recorded in the other.

    Returns:
    '0HalfBath' -- if 0 total half baths (including both counts missing)
    '>=1HalfBaths' -- if 1 or more total half baths
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    if isinstance(cols, pd.Series):
        bsmt_half, half = cols.iloc[1], cols.iloc[2]
    else:
        bsmt_half, half = cols[1], cols[2]

    # Substitute 0 for each missing count *before* summing; adding NaN would
    # make the whole total NaN, and adding None would raise TypeError.
    bsmt_half = 0 if pd.isnull(bsmt_half) else bsmt_half
    half = 0 if pd.isnull(half) else half
    total = bsmt_half + half

    return '0HalfBath' if total == 0 else '>=1HalfBaths'
        
# Create a new 'HalfBaths' column from the per-row label totalhalfbaths()
# returns; this statement leaves 'BsmtHalfBath' and 'HalfBath' in place.
ames['HalfBaths'] = ames[['Id','BsmtHalfBath','HalfBath']].apply(totalhalfbaths,axis=1)

### 51. BedroomAbvGr

In [11]:
def bedroomabvgr(cols):
    """
    Bucket 'BedroomAbvGr' into three categories.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'BedroomAbvGr': with numeric values
             0, 1, 2, ...

    Returns:
    '<=2Bedr' -- if <=2 total bedrooms or missing
    '=3Bedr' -- if 3 total bedrooms
    '>=4Bedr' -- otherwise (4 or more total bedrooms)
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    bedrooms = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Missing values fall into the lowest bucket; test null before comparing.
    if pd.isnull(bedrooms) or bedrooms <= 2:
        return '<=2Bedr'
    if bedrooms == 3:
        return '=3Bedr'
    return '>=4Bedr'
        
# Overwrite the numeric 'BedroomAbvGr' column with the string bucket
# bedroomabvgr() returns per row.
ames['BedroomAbvGr'] = ames[['Id','BedroomAbvGr']].apply(bedroomabvgr,axis=1)

### 52. KitchenAbvGr

In [12]:
def kitchenabvgr(cols):
    """
    Bucket 'KitchenAbvGr' into two categories.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'KitchenAbvGr': with numeric values
             0, 1, 2, ...

    Returns:
    '<=1Ktchn' -- if <=1 total kitchens or missing
    '>=2Ktchn' -- otherwise (2 or more total kitchens)
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    kitchens = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Missing values fall into the lower bucket; test null before comparing.
    if pd.isnull(kitchens) or kitchens <= 1:
        return '<=1Ktchn'
    return '>=2Ktchn'
        
# Overwrite the numeric 'KitchenAbvGr' column with the string bucket
# kitchenabvgr() returns per row.
ames['KitchenAbvGr'] = ames[['Id','KitchenAbvGr']].apply(kitchenabvgr,axis=1)

### 53. KitchenQual

In [13]:
def kitchenqual(cols):
    """
    Collapse 'KitchenQual' into three quality levels.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'KitchenQual': with categories
               Ex   Excellent
               Gd   Good
               TA   Typical/Average
               Fa   Fair
               Po   Poor

    Returns:
    'AveKtchnQ' -- if average or worse kitchens, any other value, or missing
    'GdKtchnQ' -- if good kitchens
    'ExKtchnQ' -- if excellent kitchens
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    quality = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # The null check must come first so missing values never reach '=='.
    if pd.isnull(quality):
        return 'AveKtchnQ'
    if quality == 'Gd':
        return 'GdKtchnQ'
    if quality == 'Ex':
        return 'ExKtchnQ'
    return 'AveKtchnQ'
        
# Overwrite 'KitchenQual' with the three-level label kitchenqual() returns per row.
ames['KitchenQual'] = ames[['Id','KitchenQual']].apply(kitchenqual,axis=1)

### 54. TotRmsAbvGrd

In [14]:
def totrmsabvgrd(cols):
    """
    Bucket 'TotRmsAbvGrd' into three categories.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'TotRmsAbvGrd': Total rooms
                   above grade (does not include bathrooms)

    Returns:
    '<=4TotRms' -- if <=4 total rooms
    '567TotRms' -- if 5, 6, 7 total rooms or missing
    '>=8TotRms' -- otherwise (8 or more total rooms)
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    rooms = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Missing values are assigned to the middle bucket, not the lowest one.
    if pd.isnull(rooms):
        return '567TotRms'
    if rooms <= 4:
        return '<=4TotRms'
    if rooms in (5, 6, 7):
        return '567TotRms'
    return '>=8TotRms'
        
# Overwrite the numeric 'TotRmsAbvGrd' column with the string bucket
# totrmsabvgrd() returns per row.
ames['TotRmsAbvGrd'] = ames[['Id','TotRmsAbvGrd']].apply(totrmsabvgrd,axis=1)

### 55. Functional

In [15]:
def functional(cols):
    """
    Collapse 'Functional' into typical vs. not typical.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'Functional': Home functionality
                   (Assume typical unless deductions are warranted) with categories

                   Typ    Typical Functionality
                   Min1   Minor Deductions 1
                   Min2   Minor Deductions 2
                   Mod    Moderate Deductions
                   Maj1   Major Deductions 1
                   Maj2   Major Deductions 2
                   Sev    Severely Damaged
                   Sal    Salvage only

    Returns:
    'NTypFunc' -- if not typical functionality or missing
    'TypFunc' -- if typical functionality
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    functionality = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # The null check must come first so missing values never reach '=='.
    if pd.isnull(functionality):
        return 'NTypFunc'
    return 'TypFunc' if functionality == 'Typ' else 'NTypFunc'
        
# Overwrite 'Functional' with the two-level label functional() returns per row.
ames['Functional'] = ames[['Id','Functional']].apply(functional,axis=1)

### 56. Fireplaces

In [51]:
def fireplaces(cols):
    """
    Reduce the 'Fireplaces' count to a has/has-not indicator.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'Fireplaces': Number of fireplaces

    Returns:
    'Fireplace' -- if has fireplace (count greater than 0)
    'NoFireplace' -- if no fireplace or missing
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    count = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Null is tested first so None/NaN never reach the numeric comparison.
    if pd.isnull(count):
        return 'NoFireplace'
    return 'Fireplace' if count > 0 else 'NoFireplace'
        
# Overwrite the numeric 'Fireplaces' column with the string indicator
# fireplaces() returns per row.
ames['Fireplaces'] = ames[['Id','Fireplaces']].apply(fireplaces,axis=1)

### 57. FireplaceQu

In [54]:
def fireplacequ(cols):
    """
    Collapse 'FireplaceQu' into good-or-better vs. everything else.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'FireplaceQu': Fireplace quality

           Ex   Excellent - Exceptional Masonry Fireplace
           Gd   Good - Masonry Fireplace in main level
           TA   Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
           Fa   Fair - Prefabricated Fireplace in basement
           Po   Poor - Ben Franklin Stove
           NA   No Fireplace

    Returns:
    'GdFireplace' -- if Gd or Ex
    'OthFireplace' -- if not Gd or Ex, or missing
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    quality = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # The null check must come first so missing values never reach the
    # membership comparison.
    if pd.isnull(quality):
        return 'OthFireplace'
    return 'GdFireplace' if quality in ['Gd', 'Ex'] else 'OthFireplace'
        
# Overwrite 'FireplaceQu' with the two-level label fireplacequ() returns per row.
ames['FireplaceQu'] = ames[['Id','FireplaceQu']].apply(fireplacequ,axis=1)

### 58. GarageType

In [61]:
def garagetype(cols):
    """
    Collapse 'GarageType' into attached vs. detached.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'GarageType': Garage location

           2Types   More than one type of garage
           Attchd   Attached to home
           Basment  Basement Garage
           BuiltIn  Built-In (Garage part of house - typically has room above garage)
           CarPort  Car Port
           Detchd   Detached from home
           NA       No Garage

    Returns:
    'Attached' -- if Attchd, BuiltIn, Basment, or 2Types
    'Detached' -- if Detchd, CarPort, any other value, or missing
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    garage_type = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # Missing values are grouped with the detached garages.
    if pd.isnull(garage_type):
        return 'Detached'
    if garage_type in ['Attchd', 'BuiltIn', 'Basment', '2Types']:
        return 'Attached'
    return 'Detached'
        
# Overwrite 'GarageType' with the two-level label garagetype() returns per row.
ames['GarageType'] = ames[['Id','GarageType']].apply(garagetype,axis=1)

### 59. GarageYrBlt

In [19]:
# Remove the 'GarageYrBlt' column from ames in place (raises KeyError if the
# column is already gone, e.g. when the cell is re-run).
# NOTE(review): no rationale is recorded for dropping this feature; confirm.
ames.drop(['GarageYrBlt'], axis = 1, inplace = True)

### 60. GarageFinish

In [64]:
def garagefinish(cols):
    """
    Relabel 'GarageFinish' and give missing values their own category.

    Arguments:
    cols -- one row holding two values in this order (the pandas Series passed
            by DataFrame.apply(axis=1), or any indexable sequence):
        position 0 -- ID (not used)
        position 1 -- 'GarageFinish': Interior finish of the garage

           Fin  Finished
           RFn  Rough Finished
           Unf  Unfinished
           NA   No Garage

    Returns:
    'GarageFin' -- if Fin (Finished)
    'RoughGarageFin' -- if RFn (Rough Finished)
    'GarageUnf' -- if Unf (Unfinished)
    'NoGarageFin' -- if missing or any other value
    """
    # Read by position with .iloc when given a Series: plain integer keys on a
    # label-indexed Series are deprecated in pandas and become label lookups.
    finish = cols.iloc[1] if isinstance(cols, pd.Series) else cols[1]

    # The null check must come first so missing values never reach '=='.
    if pd.isnull(finish):
        return 'NoGarageFin'
    if finish == 'Fin':
        return 'GarageFin'
    if finish == 'RFn':
        return 'RoughGarageFin'
    if finish == 'Unf':
        return 'GarageUnf'
    return 'NoGarageFin'
        
# Overwrite 'GarageFinish' with the relabelled category garagefinish() returns
# per row; missing values become their own 'NoGarageFin' level.
ames['GarageFinish'] = ames[['Id','GarageFinish']].apply(garagefinish,axis=1)