In [1]:
# This code cleans data from the Ames, IA housing price
# competition on Kaggle.
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/
# Team Priced2Sell

In [2]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing

In [3]:
# Load the raw competition CSVs; the first column of each file becomes the index
train_path = '../../data/train.csv'
test_path = '../../data/test.csv'
train, test = (pd.read_csv(path, index_col=0) for path in (train_path, test_path))

In [4]:
# Mask of rows whose zoning code marks a commercial-type property.
# The filter itself is left commented out, so the mask is built but not applied.
MSZoningMask = test['MSZoning'].isin(['A', 'C', 'I', 'C (all)'])
#test = test[~MSZoningMask]

In [5]:
# Replace every missing value with 0 (no rows or columns are dropped)
test = test.fillna(0)

In [6]:
# Convert categorical variables into numbers

# Quality/condition codes ordered from best (5) to worst (1)
_QUALITY_SCORES = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}


def make_num(val):
    """Translate a quality code into an integer score.

    Parameters
    ----------
    val : hashable
        A single cell value, e.g. 'Ex', 'Gd', 'TA', 'Fa' or 'Po'.

    Returns
    -------
    int
        5 for 'Ex', 4 for 'Gd', 3 for 'TA', 2 for 'Fa', 1 for 'Po', and
        0 for anything else (including the 0 used to fill missing values).
    """
    return _QUALITY_SCORES.get(val, 0)

# Source quality column -> name of the numeric score column derived from it.
# Keeping each pair side by side removes the need for two parallel sequences
# (and a manual counter) to stay in the same order.
quality_columns = {
    'KitchenQual': 'Kitchen',
    'FireplaceQu': 'Fireplace',
    'GarageQual': 'GarageQ',
    'GarageCond': 'GarageC',
    'ExterQual': 'ExterQ',
    'ExterCond': 'ExterC',
    'BsmtQual': 'BsmtQ',
    'BsmtCond': 'BsmtC',
    'HeatingQC': 'HeatingQ',
    'PoolQC': 'PoolQ',
}

# NOTE: the Id index is discarded here; from this point on rows are
# identified by their 0-based position in the file.
test = test.reset_index(drop=True)

# Append one integer score column per quality column (originals are kept)
for source_col, score_col in quality_columns.items():
    test[score_col] = test[source_col].map(make_num)


In [7]:
# A variant of make_num that takes the rating system as an argument.
# Meant to be used in a list comprehension or map call.


def make_num_flex(val, rating_system):
    """Turn a single categorical value into a 0/1 flag.

    Only the first six entries of ``rating_system`` are consulted, and the
    first entry equal to ``val`` decides the result.

    Parameters
    ----------
    val : object
        A single cell value.
    rating_system : sequence
        Category labels; position matters (see Returns).

    Returns
    -------
    int
        1 when ``val`` equals the entry at position 1, or matches none of
        the first six entries; 0 when it equals the entry at position 0, 2,
        3, 4 or 5. Sequences shorter than six entries are accepted: the
        missing positions simply never match.
    """
    for position, rating in enumerate(rating_system[:6]):
        if val == rating:
            return 1 if position == 1 else 0
    return 1

In [8]:
# Convert garage categoricals into a 0/1 flag

# Labels must match the dataset's GarageType codes exactly: the data uses
# 'Attchd', 'Basment' and 'CarPort'. With any other spelling those rows fall
# through to make_num_flex's default and are flagged 1.
# NOTE(review): if the training-set notebook carries the old spellings, apply
# the same correction there so train and test features stay consistent.
gar_types = ['Attchd', 'Detchd', 'BuiltIn', 'Basment', 'CarPort', '2Types']
test['GarageType'] = [make_num_flex(x, gar_types) for x in test['GarageType']]



In [9]:
# Convert bathroom variables into shower and toilet counts

# Every column whose name contains the word "Bath"
all_bathrooms = [col for col in test.columns if 'Bath' in col]

# Subset of those columns that tells us the number of showers ("Full" baths)
with_shower = [col for col in all_bathrooms if 'Full' in col]

test['n_toilets'] = test[all_bathrooms].sum(axis=1)
test['n_showers'] = test[with_shower].sum(axis=1)

# Did we want to flag the existence of a basement bathroom?

# Drop the old bathroom columns, keeping only the first one found.
# Labels are passed explicitly rather than handing drop() a DataFrame.
test = test.drop(columns=all_bathrooms[1:])

In [10]:
# Flag regular lot shape: 1 when LotShape is 'Reg', 0 for every other value
test['LotShape'] = test['LotShape'].eq('Reg').astype('int64')

In [11]:
# Flag basements as livable (1) or not (0)

# Finish types that count as livable
rates = ['GLQ', 'ALQ', 'BLQ']
# 1 when the first finish type is livable
test['Basement1'] = test['BsmtFinType1'].isin(rates).astype('int64')
# 1 when the second finish type is livable
test['Basement2'] = test['BsmtFinType2'].isin(rates).astype('int64')
# 1 when either finish type is livable (two livable types still count once)
test['Basement'] = (test['Basement1'] + test['Basement2']).clip(upper=1)

In [12]:
# Add columns for basement area by quality.
# Basement1/Basement2 are 1 for a livable finish type and 0 otherwise, so
# multiplying by the flag keeps the livable area and multiplying by
# (1 - flag) keeps the remaining finished area.
test['lowqualbsmt1'] = test['BsmtFinSF1'] * (1 - test['Basement1'])
test['lowqualbsmt2'] = test['BsmtFinSF2'] * (1 - test['Basement2'])
test['goodbsmt1'] = test['BsmtFinSF1'] * test['Basement1']
test['goodbsmt2'] = test['BsmtFinSF2'] * test['Basement2']

In [13]:
# Fold the goodbsmt1/goodbsmt2 areas into the living-area total.
# This overwrites GrLivArea from its own value, so running the cell twice
# adds the basement area twice.
test['GrLivArea'] = test['GrLivArea'].add(test['goodbsmt1']).add(test['goodbsmt2'])

In [14]:
# Recreational square footage: porch/deck area plus the lowqualbsmt areas

# Total porch and deck area
test['PorchFT'] = (
    test['WoodDeckSF']
    .add(test['OpenPorchSF'])
    .add(test['EnclosedPorch'])
    .add(test['3SsnPorch'])
    .add(test['ScreenPorch'])
)

# 1 when there is any porch/deck area, 0 when the total is exactly zero
test['PorchYN'] = test['PorchFT'].ne(0).astype('int64')

# Rec space = porch/deck total + lowqualbsmt1 + lowqualbsmt2
test['RecSpaceFt'] = test['PorchFT'].add(test['lowqualbsmt1']).add(test['lowqualbsmt2'])

In [15]:
#######################################
#Neighborhood is captured by date built
#######################################

# # Dummify Neighborhood
# dummies = pd.get_dummies(test['Neighborhood']).rename(columns=lambda x: 'Nhood_' + str(x))
# test = pd.concat([test, dummies], axis=1)

# Keep the original Neighborhood values in their own one-column frame for
# output, then remove the column from the predictor table
hoods = test.loc[:, ['Neighborhood']]
test = test.drop(columns=['Neighborhood'])


In [16]:
# Remove helper and source columns that are no longer needed (reassigns `test`)
columns_to_drop = [
    'Basement1', 'Basement2', 'BsmtFinType1',
    'lowqualbsmt1', 'lowqualbsmt2', 'goodbsmt1', 'goodbsmt2',
    'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'LowQualFinSF',
    'PorchFT', 'PorchYN',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
]
test = test.drop(columns=columns_to_drop)

In [17]:
# Drop rows with outliers

#########################
#########################
# DON'T FILTER TEST DATA
#########################
#########################
#test = test[~(test['LotArea'] > 50000)]
#test = test[~(test['GrLivArea'] > 6000)]

In [18]:
# Drop the variables that are excluded from the final predictor table.
# Each label is listed once; the same set of columns is removed as before.
extract_list = [
    'Alley', 'LotFrontage', 'SaleCondition', 'SaleType', 'Fence', 'MiscFeature', 'PoolQC',
    'PavedDrive', 'Functional', 'CentralAir', 'Electrical', 'Heating', 'BsmtCond',
    'RoofMatl', 'RoofStyle', 'HouseStyle',
    'LandSlope', 'Utilities', 'Street', 'LandContour', 'BsmtExposure', 'BsmtQual',
    'BsmtUnfSF', 'TotalBsmtSF', 'OverallQual',
    'GarageCars', 'GarageYrBlt', '2ndFlrSF', 'GarageQual', 'MSZoning',
    'Condition1', 'Condition2', 'KitchenQual', 'FireplaceQu', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolArea',
    'PoolQ', 'OverallCond', 'GarageFinish', 'MSSubClass', 'LotShape', 'LotConfig',
    'BldgType', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation',
    '1stFlrSF', 'ExterC', 'BsmtC', 'RecSpaceFt', 'MiscVal', 'GarageQ', 'GarageC',
    'Fireplaces', 'MoSold', 'YrSold', 'TotRmsAbvGrd', 'Basement',
]
clean = test.drop(columns=extract_list)

In [19]:
# Display options
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1500)
# Fixed seed so the preview shows the same rows on every run
clean.sample(10, random_state=0)

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,GrLivArea,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,GarageType,GarageArea,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,n_showers
1256,8314,1997,1998,1694.0,0.0,3,1,0,434.0,4,3,4,4,5,3.0,2.0
802,12328,2005,2005,3527.0,1.0,4,1,0,729.0,5,4,4,5,5,5.0,4.0
853,13204,2006,2007,1458.0,0.0,3,1,0,454.0,4,0,3,4,5,2.0,2.0
881,8847,2005,2005,2307.0,1.0,3,1,1,484.0,4,0,4,4,5,3.0,3.0
164,13008,1956,1956,882.0,0.0,2,1,1,502.0,3,0,3,2,3,1.0,1.0
863,11084,2004,2004,1566.0,1.0,3,1,0,400.0,4,4,4,4,5,4.0,3.0
1034,17808,1946,1950,1242.0,0.0,2,1,1,336.0,3,0,3,3,3,1.0,1.0
976,6600,1962,1962,1416.0,0.0,2,1,1,294.0,3,0,3,3,4,1.0,1.0
700,9345,2007,2007,1615.0,0.0,3,1,1,864.0,4,4,4,4,5,2.0,2.0
659,5520,1920,1997,1284.0,1.0,3,1,0,355.0,3,4,3,3,3,2.0,2.0


In [20]:
# Show which columns remain in the cleaned predictor table
clean.columns

Index(['LotArea', 'YearBuilt', 'YearRemodAdd', 'GrLivArea', 'BsmtFullBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'GarageType', 'GarageArea', 'Kitchen',
       'Fireplace', 'ExterQ', 'BsmtQ', 'HeatingQ', 'n_toilets', 'n_showers'],
      dtype='object')

In [21]:
# Number of rows in the cleaned table
len(clean)

1459

In [25]:
# Save the cleaned predictor table (the frame's index is written as the first column)
clean_output_path = '../../data/test_clean.csv'
clean.to_csv(clean_output_path)

In [23]:
# Write out Overall Quality estimate for validations
#test_qual = test[['OverallQual']]
#test_qual.to_csv('../../data/test_qual.csv')

In [24]:
# Write out non-dummy Neighborhoods for testing
#hoods.to_csv('../../data/test_hoods.csv')