In [1]:
# This code cleans data from the Ames, IA housing price
# competition on Kaggle.
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/
# Team Priced2Sell

In [2]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing

In [3]:
# Load the Kaggle train and test CSVs; the first column of each file
# becomes the DataFrame index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Remove rows whose MSZoning code is 'A', 'C', 'I' or 'C (all)'
# (treated in this notebook as commercial properties).
MSZoningMask = train.MSZoning.isin(['A', 'C', 'I', 'C (all)'])
train = train.loc[~MSZoningMask]

In [5]:
# Replace every missing value with 0. No rows are dropped: missing
# entries simply become 0 in both numeric and text columns.
train = train.fillna(0)

In [6]:
# Convert categorical variables into numbers

def make_num(val):
    """Map a quality rating code to an integer score.

    Args:
        val: A single rating value. The recognised codes are
            'Ex', 'Gd', 'TA', 'Fa' and 'Po'.

    Returns:
        int: 5 for 'Ex', 4 for 'Gd', 3 for 'TA', 2 for 'Fa', 1 for 'Po',
        and 0 for any other value (for example a 0 used to fill a
        missing entry).
    """
    rating_scores = (('Ex', 5), ('Gd', 4), ('TA', 3), ('Fa', 2), ('Po', 1))
    # Compare with == (rather than a dict lookup) so that any value type,
    # hashable or not, is accepted exactly as before.
    for label, score in rating_scores:
        if val == label:
            return score
    return 0

# Source quality/condition column -> name of the numeric column that
# make_num produces for it. Keeping each pair together avoids relying on
# two parallel lists staying in the same order.
quality_columns = {
    'KitchenQual': 'Kitchen',
    'FireplaceQu': 'Fireplace',
    'GarageQual': 'GarageQ',
    'GarageCond': 'GarageC',
    'ExterQual': 'ExterQ',
    'ExterCond': 'ExterC',
    'BsmtQual': 'BsmtQ',
    'BsmtCond': 'BsmtC',
    'HeatingQC': 'HeatingQ',
    'PoolQC': 'PoolQ',
}

orig_list = list(quality_columns.keys())
name_list = list(quality_columns.values())

# Score every rating in each source column; plain lists are used so the
# resulting frame gets a fresh 0..n-1 index.
new_dict = {
    new_name: [make_num(rating) for rating in train[source_name]]
    for source_name, new_name in quality_columns.items()
}

new_df = pd.DataFrame(new_dict)
# reset_index() moves the Id index into a column and gives train the same
# 0..n-1 index as new_df, so the two frames line up row by row. The Id
# column is then dropped, so rows are identified by position from here on.
train = pd.concat([train.reset_index(), new_df], sort=True, axis=1)
train = train.drop('Id', axis=1)


In [7]:
# A slightly more flexible version of original make_num

# This function takes a single value and a rating system
# (a sequence of text values; positions 0-5 are checked) and
# returns an integer flag (0 or 1). Meant to be used in a list
# comprehension or map function.


def make_num_flex(val, rating_system):
    """Return a 0/1 flag for ``val`` based on its position in ``rating_system``.

    Args:
        val: A single value to classify.
        rating_system: Indexable sequence of labels. Positions 0 to 5 are
            compared against ``val`` in order, stopping at the first match.

    Returns:
        int: 0 when ``val`` equals the label at position 0, 2, 3, 4 or 5;
        1 when it equals the label at position 1 or matches none of them.

    Raises:
        IndexError: If no match is found before running past the end of a
            ``rating_system`` with fewer than six entries.
    """
    # NOTE(review): an unmatched value gets the same flag (1) as position 1;
    # confirm that this is the intended encoding.
    position_flags = (0, 1, 0, 0, 0, 0)
    for position, flag in enumerate(position_flags):
        # Index lazily so a short rating_system only fails when the search
        # actually reaches the missing position.
        if val == rating_system[position]:
            return flag
    return 1

In [8]:
# Recode GarageType as a 0/1 flag via make_num_flex.
# NOTE(review): some of these labels ('Attachd', 'Basement', 'Carport') may
# not match the spelling used in the dataset; values that match no label
# fall through to 1 - verify against the data dictionary.
gar_types = ['Attachd', 'Detchd', 'BuiltIn', 'Basement', 'Carport', '2Types']
train['GarageType'] = train['GarageType'].apply(make_num_flex, args=(gar_types,))


In [9]:
# Convert bathroom variables into toilet and shower counts

# Columns whose name contains the word "Bath"
all_bathrooms = [col for col in train.columns if "Bath" in col]

# Subset of those columns whose name contains "Full"; these are the ones
# counted as having a shower
with_shower = [col for col in all_bathrooms if 'Full' in col]

train['n_toilets'] = train[all_bathrooms].sum(axis=1)
train['n_showers'] = train[with_shower].sum(axis=1)

# Did we want to flag the existence of a basement bathroom?

# Drop the old bathroom columns, keeping only the first one found.
# The column labels are passed directly; the previous version passed a
# DataFrame slice, which only worked because iterating a DataFrame yields
# its column names.
# NOTE(review): which column survives depends on the column order of train.
train = train.drop(columns=all_bathrooms[1:])

In [10]:
# Flag regular lot shape: 1 when LotShape is 'Reg', 0 for any other value
train['LotShape'] = train['LotShape'].eq('Reg').astype('int64')

In [11]:
# Flag basements as livable (1) or not (0)

# Finish types counted as livable
rates = ['GLQ', 'ALQ', 'BLQ']

# One 0/1 flag per finish-type column
for flag_col, type_col in (('Basement1', 'BsmtFinType1'),
                           ('Basement2', 'BsmtFinType2')):
    train[flag_col] = train[type_col].isin(rates).astype('int64')

# 1 when either finish type is livable; a sum of 2 (both livable) is
# capped at 1
train['Basement'] = (train['Basement1'] + train['Basement2']).clip(upper=1)

In [12]:
# Add columns for basement area by quality.
# Basement1/Basement2 are 1 for a livable finish type and 0 otherwise, so
# multiplying by the flag keeps the livable area and multiplying by
# (1 - flag) keeps the finished area that is NOT rated livable.
# (Previously the low-quality columns used the same formula as the good
# ones, making the two pairs identical.)
train['lowqualbsmt1'] = train['BsmtFinSF1'] * (1 - train['Basement1'])
train['lowqualbsmt2'] = train['BsmtFinSF2'] * (1 - train['Basement2'])
train['goodbsmt1'] = train['BsmtFinSF1'] * train['Basement1']
train['goodbsmt2'] = train['BsmtFinSF2'] * train['Basement2']

In [13]:
# Add the livable finished-basement area to GrLivArea.
# Note: this overwrites GrLivArea, so re-running the cell adds the
# basement area again.
train['GrLivArea'] = (
    train['GrLivArea']
    .add(train['goodbsmt1'])
    .add(train['goodbsmt2'])
)

In [14]:
# Define recreational square footage from porch/deck area plus the
# lowqualbsmt1/lowqualbsmt2 basement areas

# Total porch/deck area across the five porch columns
porch_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
train['PorchFT'] = train[porch_columns].sum(axis=1, skipna=False)

# 1 when the total porch area is non-zero, 0 otherwise
train['PorchYN'] = train['PorchFT'].ne(0).astype('int64')

# Rec space = porch/deck total + both lowqualbsmt columns
train['RecSpaceFt'] = (
    train['PorchFT']
    .add(train['lowqualbsmt1'])
    .add(train['lowqualbsmt2'])
)

In [15]:
#######################################
# Neighborhood is captured by date built
#######################################

# Neighborhood dummy encoding is left disabled:
# # dummies = pd.get_dummies(train['Neighborhood']).rename(columns=lambda x: 'Nhood_' + str(x))
# # train = pd.concat([train, dummies], axis=1)

# Keep the original Neighborhood values for output, then remove the
# column from the predictor table
hoods = train.loc[:, ['Neighborhood']]
train = train.drop(columns=['Neighborhood'])


In [16]:
# Remove the intermediate basement/porch columns and the raw columns they
# were derived from. Overwrites the existing table.
helper_columns = [
    'Basement1', 'Basement2',
    'BsmtFinType1', 'BsmtFinType2',
    'lowqualbsmt1', 'lowqualbsmt2',
    'goodbsmt1', 'goodbsmt2',
    'BsmtFinSF1', 'BsmtFinSF2',
    'LowQualFinSF',
    'PorchFT', 'PorchYN',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
]
train = train.drop(columns=helper_columns)

In [17]:
# Drop rows with outliers

#########################
#########################
# DON'T FILTER TEST DATA
#########################
#########################
# A row is an outlier when LotArea exceeds 50000 or GrLivArea (which at
# this point includes livable basement area) exceeds 6000.
lot_too_large = train['LotArea'] > 50000
living_too_large = train['GrLivArea'] > 6000
train = train[~(lot_too_large | living_too_large)]

In [18]:
# Columns excluded from the final predictor table (described in the
# original notebook as low-variance variables; the list also covers the
# raw quality columns already converted to numeric scores).
# Each label appears once; the earlier list repeated several of them.
extract_list = [
    'Alley', 'LotFrontage', 'SaleCondition', 'SaleType', 'Fence',
    'MiscFeature', 'PoolQC', 'PavedDrive', 'Functional', 'CentralAir',
    'Electrical', 'Heating', 'BsmtCond', 'RoofMatl', 'RoofStyle',
    'HouseStyle', 'LandSlope', 'Utilities', 'Street', 'LandContour',
    'BsmtExposure', 'BsmtQual', 'BsmtUnfSF', 'TotalBsmtSF', 'OverallQual',
    'GarageCars', 'GarageYrBlt', '2ndFlrSF', 'GarageQual', 'MSZoning',
    'Condition1', 'Condition2', 'KitchenQual', 'FireplaceQu', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolArea', 'PoolQ',
    'OverallCond', 'GarageFinish', 'MSSubClass', 'LotShape', 'LotConfig',
    'BldgType', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
    'Foundation', '1stFlrSF', 'ExterC', 'BsmtC', 'RecSpaceFt', 'MiscVal',
    'GarageQ', 'GarageC', 'Fireplaces', 'MoSold', 'YrSold', 'TotRmsAbvGrd',
    'Basement',
]
# train itself is left intact so later cells can still read OverallQual
clean = train.drop(columns=extract_list)

In [19]:
# Display options: show up to 200 columns and 1500 rows, then preview
# ten randomly chosen rows of the cleaned table
pd.options.display.max_columns = 200
pd.options.display.max_rows = 1500
clean.sample(n=10)

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,GrLivArea,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,GarageType,GarageArea,SalePrice,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,n_showers
273,9196,2003,2003,1560,0,3,1,1,573,201000,4,0,4,5,5,2,2
460,11988,1934,1995,1660,0,3,1,1,240,188700,3,4,3,3,2,2,1
416,8450,1968,1968,1831,1,3,1,1,304,142000,3,2,3,3,5,2,2
816,9900,1940,1950,1489,0,3,1,1,240,139500,3,4,3,3,3,2,2
40,16905,1959,1959,2295,0,2,1,1,308,170000,3,4,3,3,4,3,1
1155,16157,1978,1978,2112,1,2,1,1,588,194000,4,3,3,4,5,3,2
1331,13695,2003,2004,1928,1,3,1,1,576,155000,4,0,3,4,5,2,2
1181,32463,1961,1975,2781,1,3,1,0,1356,168000,3,3,3,3,5,2,2
420,9200,1998,1998,3616,1,4,1,1,696,315000,4,3,4,4,4,4,3
686,21535,1994,1995,5771,0,4,1,1,832,755000,5,5,5,5,5,5,3


In [20]:
# Show the (rows, columns) dimensions of the cleaned table
clean.shape

(1437, 17)

In [21]:
# Write out clean set of predictive variables.
# SalePrice is not in the dropped-column list, so it is written as well.
# The index written with the rows is the positional one created by
# reset_index() earlier in the notebook, not the original Id.
clean.to_csv('data/train_clean.csv')

In [22]:
# Save the OverallQual column on its own (as a one-column table) for
# validation
train_qual = train.loc[:, ['OverallQual']]
train_qual.to_csv('data/train_qual.csv')

In [23]:
# Write out non-dummy Neighborhoods for testing.
# hoods was captured before the outlier rows were removed from train, so
# restrict it to the rows still present in clean; this keeps the file
# row-aligned with train_clean.csv and train_qual.csv.
hoods.loc[clean.index].to_csv('data/train_hoods.csv')