In [1]:
# This code cleans data from the Ames, IA housing price
# competion on Kaggle. 
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/
# Team Priced2Sell

In [1]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing

In [6]:
# Import data
train = pd.read_csv('../data/train.csv', index_col=0)
test = pd.read_csv('../data/test.csv', index_col=0)

In [7]:
# Filter out commercial properties
MSZoningMask = (train.MSZoning=='A') | (train.MSZoning=='C') | (train.MSZoning=='I') | (train.MSZoning=='C (all)')
#train = train[~MSZoningMask]

In [8]:
# Drop NAs
train = train.fillna(value=0)

In [9]:
# Convert catagorical variables into numbers

def make_num(val):
    new_list = []
    if val == 'Ex':
        num = 5
    elif val == 'Gd':
        num = 4
    elif val == 'TA':
        num = 3
    elif val == 'Fa':
        num = 2
    elif val == 'Po':
        num = 1
    else:
        num = 0
    return int(num)

new_dict = {'Kitchen':0, 'Fireplace': 0, 'GarageQ': 0, 'GarageC':0, 'ExterQ': 0, 'ExterC':0, 'BsmtQ':0, 
            'BsmtC':0, 'HeatingQ': 0, "PoolQ": 0}

name_list = list(new_dict.keys())

orig_list = ['KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 
 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
'PoolQC']

i=0
for thing in orig_list:  
    new_list = list(map(make_num,train[thing]))
    new_dict[name_list[i]] = new_list
    
    i+=1
    
new_df = pd.DataFrame(new_dict)
train = pd.concat([train.reset_index(), new_df], sort=True, axis=1)
train = train.drop('Id', axis=1)


In [10]:
# A slightly more flexible version of original make_num

# this function takes a single text value and a 
# rating system (array of text values) and returns 
# single text value. Meant to be used in a list
# comprehension or map function


def make_num_flex(val, rating_system):
    new_list = []
    if val == rating_system[0]:
        num = 0
    elif val == rating_system[1]:
        num = 1
    elif val == rating_system[2]:
        num = 0
    elif val == rating_system[3]:
        num = 0
    elif val == rating_system[4]:
        num = 0
    elif val == rating_system[5]:
        num = 0
    else:
        num = 1
    return int(num)

In [11]:
# Convert garage catagoricals

gar_types = ['Attachd', 'Detchd', 'BuiltIn', 'Basement', 'Carport', '2Types']
train['GarageType'] = [make_num_flex(x, gar_types) for x in train['GarageType'] ]


In [12]:
# Covert Bathroom variables into showers and toilets

# create list for subset of data the includes the word bath 
all_bathrooms = [x for x in train.columns if "Bath" in  x]

# create list for subset of data that tell us the number of showers
with_shower = [x for x in all_bathrooms if 'Full' in x]

train['n_toilets'] = np.sum(train[all_bathrooms], axis=1)
train['n_showers'] = np.sum(train[with_shower], axis =1)

# Did we want to flag the exisistance of a basement bathroom?

# drop old bathroom columns
train = train.drop(train[all_bathrooms[1:]], axis=1)

In [13]:
# Create flag for irregular lot shape
train['LotShape'] = [1 if x == 'Reg' else 0 for x in train['LotShape']]

In [14]:
# Flag basements as livibale (1) or non (2)

# Livible ratings
rates = ['GLQ', 'ALQ', 'BLQ']
# counts livible basements in fintype1
train['Basement1'] = [1 if x in rates else 0 for x in train['BsmtFinType1']]
# counts livible basements in fintype2
train['Basement2'] = [1 if x in rates  else 0 for x in train['BsmtFinType2']]
# combines the livible area
train['Basement'] = train['Basement1'].values + train['Basement2'].values
# counts livible basement as present when there's two types
train['Basement'] = train['Basement'].replace(2,1) 

In [15]:
# Add colums for basement area by quality
train['lowqualbsmt1'] = train['BsmtFinSF1'] * train['Basement1']
train['lowqualbsmt2'] = train['BsmtFinSF2'] * train['Basement2']
train['goodbsmt1'] = train['BsmtFinSF1'] * train['Basement1']
train['goodbsmt2'] = train['BsmtFinSF2'] * train['Basement2']

In [16]:
# Update total square footage to include finished basement
train['GrLivArea'] = train['GrLivArea'] + train['goodbsmt1'] + train['goodbsmt2']

In [17]:
# Define recreational square footage incorporating poarch area and unfinished basements

# Combine porch/deck areas
train['PorchFT'] = train['WoodDeckSF'] + train['OpenPorchSF'] + train['EnclosedPorch'] + train['3SsnPorch'] + train['ScreenPorch']

# Flag the exsistance of a porch
train['PorchYN'] = [0 if x==0 else 1 for x in train['PorchFT']]

# Create the Rec space variable as the sum of deck area and rough basement
train['RecSpaceFt'] = train['PorchFT'] + train['lowqualbsmt1'] + train['lowqualbsmt2']

In [15]:
# drops extra columns. Overwrites existing table 
#train = train.drop(['Basement1', 'Basement2', 'BsmtFinType1', 'lowqualbsmt1','lowqualbsmt2','goodbsmt1', 'goodbsmt2', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'LowQualFinSF', 'PorchFT', 'PorchYN', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], axis = 1)

In [18]:
# Drop rows with outliers

#########################
#########################
# DON'T FILTER TEST DATA
#########################
#########################
train = train[~(train['LotArea'] > 50000)]
train = train[~(train['GrLivArea'] > 6000)]

In [19]:
#######################################
#Neighborhood is captured by date built
#######################################

# # Dummify Neighborhood
dummies = pd.get_dummies(train['Neighborhood']).rename(columns=lambda x: 'Nhood_' + str(x))
train = pd.concat([train, dummies], axis=1)

# # Create a list of the original for output, and drop from predictor table
hoods = train[['Neighborhood']]
train = train.drop(['Neighborhood'], axis=1)

In [27]:
# Drop variables with low varience
extract_list = ['Alley','LotFrontage','SaleCondition','SaleType','Fence','MiscFeature','PoolQC',\
'PavedDrive','Functional','CentralAir','Electrical','Heating','BsmtCond','RoofMatl','RoofStyle','HouseStyle',\
'LandSlope','Utilities','Street','LandContour','BsmtExposure','BsmtQual','BsmtUnfSF','TotalBsmtSF','OverallQual',\
'GarageCars','GarageYrBlt','2ndFlrSF','GarageQual', 'MSZoning', 'Condition1', 'Condition2',"KitchenQual", 'FireplaceQu', 'GarageQual', 'GarageCond', 
 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'PoolArea',
'PoolQC', 'PoolQ', 'OverallCond', 'GarageFinish', 'MSSubClass', 'LotShape', 'LotConfig', 'BldgType', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', '1stFlrSF','ExterC','BsmtC','RecSpaceFt','MiscVal','GarageQ','GarageC','Fireplaces','MoSold','YrSold','TotRmsAbvGrd','Basement']
clean = train   #.drop(extract_list, axis=1)

extract_list2 = ['MSSubClass','MSZoning','Street','LandContour','Utilities','LotConfig','LandSlope','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','Foundation','ExterQual','ExterCond','BsmtQual','BsmtExposure','BsmtFinType1','BsmtFinType2']

clean=train.drop(extract_list2,axis=1)

In [28]:
# Dispay options
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1500)
clean.sample(10)

Unnamed: 0,LotFrontage,LotArea,Alley,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtCond,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Kitchen,Fireplace,GarageQ,GarageC,ExterQ,ExterC,BsmtQ,BsmtC,HeatingQ,PoolQ,n_toilets,n_showers,Basement1,Basement2,Basement,lowqualbsmt1,lowqualbsmt2,goodbsmt1,goodbsmt2,PorchFT,PorchYN,RecSpaceFt,Nhood_Blmngtn,Nhood_Blueste,Nhood_BrDale,Nhood_BrkSide,Nhood_ClearCr,Nhood_CollgCr,Nhood_Crawfor,Nhood_Edwards,Nhood_Gilbert,Nhood_IDOTRR,Nhood_MeadowV,Nhood_Mitchel,Nhood_NAmes,Nhood_NPkVill,Nhood_NWAmes,Nhood_NoRidge,Nhood_NridgHt,Nhood_OldTown,Nhood_SWISU,Nhood_Sawyer,Nhood_SawyerW,Nhood_Somerst,Nhood_StoneBr,Nhood_Timber,Nhood_Veenker
826,50.0,6130,0,1,5,6,1924,1950,0.0,TA,784,0,0,784,GasA,Gd,Y,SBrkr,784,0,0,1568,1,2,1,Gd,5,Typ,0,0,1,0.0,0,0,0,0,0,Y,0,0,116,0,0,0,0,0,0,0,5,2008,WD,Normal,109500,4,0,0,0,3,3,3,3,4,0,2,2,1,0,1,784,0,784,0,116,1,900,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1452,35.0,3675,0,1,5,5,2005,2005,80.0,TA,547,0,0,547,GasA,Gd,Y,SBrkr,1072,0,0,1619,1,2,1,TA,5,Typ,0,0,1,2005.0,Fin,2,525,TA,TA,Y,0,28,0,0,0,0,0,0,0,0,5,2006,WD,Normal,145000,3,0,3,3,3,3,4,3,4,0,2,2,1,0,1,547,0,547,0,28,1,575,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
891,70.0,11184,0,1,6,5,1978,1978,92.0,TA,226,500,192,918,GasA,Gd,Y,SBrkr,918,765,0,1683,0,3,1,TA,7,Typ,1,TA,1,1978.0,RFn,2,440,TA,TA,Y,243,0,0,0,0,0,0,0,0,0,7,2009,WD,Normal,172500,3,3,3,3,3,3,3,3,4,0,3,2,0,0,0,0,0,0,0,243,1,243,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
385,43.0,3182,0,1,8,5,2004,2005,16.0,TA,24,0,1232,1256,GasA,Ex,Y,SBrkr,1269,0,0,1293,0,2,1,Gd,6,Typ,1,TA,1,2004.0,Fin,2,430,TA,TA,Y,146,20,0,0,144,0,0,0,0,0,4,2010,WD,Normal,192000,4,3,3,3,4,3,4,3,5,0,2,2,1,0,1,24,0,24,0,310,1,334,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
86,122.0,11911,0,0,6,5,2005,2005,0.0,TA,0,0,684,684,GasA,Ex,Y,SBrkr,684,876,0,1560,0,3,1,Gd,6,Typ,1,Gd,0,2005.0,Fin,2,400,TA,TA,Y,100,38,0,0,0,0,0,0,0,0,3,2009,WD,Normal,174000,4,4,3,3,4,3,4,3,5,0,3,2,0,0,0,0,0,0,0,138,1,138,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
955,82.0,7136,0,0,6,6,1946,1950,423.0,TA,484,0,495,979,GasA,TA,N,FuseF,979,979,0,1958,0,4,2,TA,8,Typ,0,0,1,1946.0,Unf,2,492,TA,TA,Y,0,0,0,0,0,0,0,0,0,0,8,2007,WD,Normal,145000,3,0,3,3,3,3,4,3,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
434,21.0,1890,0,1,4,7,1972,1972,0.0,TA,495,0,135,630,GasA,Gd,Y,SBrkr,630,0,0,1125,1,1,1,TA,3,Typ,0,0,1,0.0,0,0,0,0,0,Y,88,0,0,0,0,0,0,0,0,0,6,2008,WD,Normal,81000,3,0,0,0,3,4,4,3,4,0,2,2,1,0,1,495,0,495,0,88,1,583,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
637,50.0,6000,0,1,5,4,1954,1954,0.0,TA,0,0,811,811,GasA,TA,Y,FuseA,811,576,0,1387,0,3,2,Gd,7,Typ,0,0,0,1954.0,Unf,1,256,TA,TA,Y,0,0,0,0,0,0,0,0,0,0,11,2009,WD,Normal,93000,4,0,3,3,3,3,3,3,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
884,65.0,7150,0,1,5,5,1967,1967,60.0,TA,432,0,460,892,GasA,TA,Y,SBrkr,892,0,0,1324,0,3,1,TA,5,Typ,0,0,1,1967.0,RFn,1,288,TA,TA,Y,0,0,0,0,0,0,0,GdWo,0,0,7,2009,WD,Normal,100000,3,0,3,3,3,3,3,3,3,0,1,1,1,0,1,432,0,432,0,0,0,432,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
837,21.0,1680,0,1,6,5,1973,1973,158.0,TA,330,0,153,483,GasA,TA,Y,SBrkr,483,504,0,1317,1,2,1,TA,5,Typ,0,0,1,1973.0,Unf,1,264,TA,TA,Y,0,0,0,0,0,0,0,0,0,0,11,2008,WD,Normal,100000,3,0,3,3,3,3,3,3,3,0,3,2,1,0,1,330,0,330,0,0,0,330,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [29]:
clean.shape

(1447, 100)

In [21]:
# Write out clean set of predictive variables
clean.to_csv('data/train_new_clean.csv')

In [22]:
# Write out Overall Quality estimate for validations
#train_qual = train[['OverallQual']]
#train_qual.to_csv('data/train_qual.csv')

In [23]:
# Write out non-dummy Neighborhoods for testing
#hoods.to_csv('data/train_hoods.csv')