In [1]:
# This code cleans data from the Ames, IA housing price
# competion on Kaggle. 
# https://www.kaggle.com/c/house-prices-advanced-regression-techniques/
# Team Priced2Sell

In [2]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing

In [3]:
# Import data
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

In [4]:
# Filter out commercial properties
MSZoningMask = (train.MSZoning=='A') | (train.MSZoning=='C') | (train.MSZoning=='I') | (train.MSZoning=='C (all)')
train = train[~MSZoningMask]

In [5]:
# Drop NAs
train = train.fillna(value=0)

In [6]:
# Convert catagorical variables into numbers

def make_num(val):
    new_list = []
    if val == 'Ex':
        num = 5
    elif val == 'Gd':
        num = 4
    elif val == 'TA':
        num = 3
    elif val == 'Fa':
        num = 2
    elif val == 'Po':
        num = 1
    else:
        num = 0
    return int(num)

new_dict = {'Kitchen':0, 'Fireplace': 0, 'GarageQ': 0, 'GarageC':0, 'ExterQ': 0, 'ExterC':0, 'BsmtQ':0, 
            'BsmtC':0, 'HeatingQ': 0, "PoolQ": 0}

name_list = list(new_dict.keys())

orig_list = ['KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 
 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
'PoolQC']

i=0
for thing in orig_list:  
    new_list = list(map(make_num,train[thing]))
    new_dict[name_list[i]] = new_list
    
    i+=1
    
new_df = pd.DataFrame(new_dict)
train = pd.concat([train.reset_index(), new_df], sort=True, axis=1)
train = train.drop('Id', axis=1)


In [7]:
# A slightly more flexible version of original make_num

# this function takes a single text value and a 
# rating system (array of text values) and returns 
# single text value. Meant to be used in a list
# comprehension or map function


def make_num_flex(val, rating_system):
    new_list = []
    if val == rating_system[0]:
        num = 0
    elif val == rating_system[1]:
        num = 1
    elif val == rating_system[2]:
        num = 0
    elif val == rating_system[3]:
        num = 0
    elif val == rating_system[4]:
        num = 0
    elif val == rating_system[5]:
        num = 0
    else:
        num = 1
    return int(num)

In [8]:
# Convert garage catagoricals

gar_types = ['Attachd', 'Detchd', 'BuiltIn', 'Basement', 'Carport', '2Types']
train['GarageType'] = [make_num_flex(x, gar_types) for x in train['GarageType'] ]


In [9]:
# Covert Bathroom variables into showers and toilets

# create list for subset of data the includes the word bath 
all_bathrooms = [x for x in train.columns if "Bath" in  x]

# create list for subset of data that tell us the number of showers
with_shower = [x for x in all_bathrooms if 'Full' in x]

train['n_toilets'] = np.sum(train[all_bathrooms], axis=1)
train['n_showers'] = np.sum(train[with_shower], axis =1)

# Did we want to flag the exisistance of a basement bathroom?

# drop old bathroom columns
train = train.drop(train[all_bathrooms[1:]], axis=1)

In [10]:
# Create flag for irregular lot shape
train['LotShape'] = [1 if x == 'Reg' else 0 for x in train['LotShape']]

In [11]:
# Flag basements as livibale (1) or non (2)

# Livible ratings
rates = ['GLQ', 'ALQ', 'BLQ']
# counts livible basements in fintype1
train['Basement1'] = [1 if x in rates else 0 for x in train['BsmtFinType1']]
# counts livible basements in fintype2
train['Basement2'] = [1 if x in rates  else 0 for x in train['BsmtFinType2']]
# combines the livible area
train['Basement'] = train['Basement1'].values + train['Basement2'].values
# counts livible basement as present when there's two types
train['Basement'] = train['Basement'].replace(2,1) 

In [12]:
# Add colums for basement area by quality
train['lowqualbsmt1'] = train['BsmtFinSF1'] * train['Basement1']
train['lowqualbsmt2'] = train['BsmtFinSF2'] * train['Basement2']
train['goodbsmt1'] = train['BsmtFinSF1'] * train['Basement1']
train['goodbsmt2'] = train['BsmtFinSF2'] * train['Basement2']

In [13]:
# Update total square footage to include finished basement
train['GrLivArea'] = train['GrLivArea'] + train['goodbsmt1'] + train['goodbsmt2']

In [14]:
# Define recreational square footage incorporating poarch area and unfinished basements

# Combine porch/deck areas
train['PorchFT'] = train['WoodDeckSF'] + train['OpenPorchSF'] + train['EnclosedPorch'] + train['3SsnPorch'] + train['ScreenPorch']

# Flag the exsistance of a porch
train['PorchYN'] = [0 if x==0 else 1 for x in train['PorchFT']]

# Create the Rec space variable as the sum of deck area and rough basement
train['RecSpaceFt'] = train['PorchFT'] + train['lowqualbsmt1'] + train['lowqualbsmt2']

In [15]:
# Dummify Neighborhood
dummies = pd.get_dummies(train['Neighborhood']).rename(columns=lambda x: 'Nhood_' + str(x))
train = pd.concat([train, dummies], axis=1)
train = train.drop(['Neighborhood'], axis=1)

In [16]:
# drops extra columns. Overwrites existing table 
train = train.drop(['Basement1', 'Basement2', 'BsmtFinType1', 'lowqualbsmt1','lowqualbsmt2','goodbsmt1', 'goodbsmt2', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'LowQualFinSF', 'PorchFT', 'PorchYN', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'], axis = 1)

In [17]:
# Drop variables with low varience
extract_list = ['Alley','LotFrontage','SaleCondition','SaleType','Fence','MiscFeature','PoolQC',\
'PavedDrive','Functional','CentralAir','Electrical','Heating','BsmtCond','RoofMatl','RoofStyle','HouseStyle',\
'LandSlope','Utilities','Street','LandContour','BsmtExposure','BsmtQual','BsmtUnfSF','TotalBsmtSF','OverallQual',\
'GarageCars','GarageYrBlt','2ndFlrSF','GarageQual', 'MSZoning', 'Condition1', 'Condition2',"KitchenQual", 'FireplaceQu', 'GarageQual', 'GarageCond', 
 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'PoolArea',
'PoolQC', 'PoolQ', 'OverallCond', 'GarageFinish', 'MSSubClass', 'LotShape', 'LotConfig', 'BldgType', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'Foundation', '1stFlrSF']
clean = train.drop(extract_list, axis=1)

In [18]:
# Dispay options
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 1500)
clean.sample(10)

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,GrLivArea,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageType,GarageArea,MiscVal,MoSold,YrSold,SalePrice,Kitchen,Fireplace,GarageQ,GarageC,ExterQ,ExterC,BsmtQ,BsmtC,HeatingQ,n_toilets,n_showers,Basement,RecSpaceFt,Nhood_Blmngtn,Nhood_Blueste,Nhood_BrDale,Nhood_BrkSide,Nhood_ClearCr,Nhood_CollgCr,Nhood_Crawfor,Nhood_Edwards,Nhood_Gilbert,Nhood_IDOTRR,Nhood_MeadowV,Nhood_Mitchel,Nhood_NAmes,Nhood_NPkVill,Nhood_NWAmes,Nhood_NoRidge,Nhood_NridgHt,Nhood_OldTown,Nhood_SWISU,Nhood_Sawyer,Nhood_SawyerW,Nhood_Somerst,Nhood_StoneBr,Nhood_Timber,Nhood_Veenker
338,8400,1950,1950,841,0,2,1,4,0,1,294,0,9,2009,82000,3,0,3,3,2,2,3,2,4,1,1,0,274,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
878,5119,1999,2000,2947,1,2,1,5,1,1,506,0,1,2008,328900,4,3,3,3,4,3,5,3,5,3,3,1,1400,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
42,9200,1975,1980,1429,1,3,1,5,0,1,308,0,7,2008,130250,3,0,3,3,3,3,4,3,3,2,2,1,636,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
934,7711,1977,1977,2880,2,4,2,8,0,1,0,0,8,2007,150000,3,0,0,0,3,3,4,3,3,4,4,1,1761,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1249,9587,2005,2005,1821,1,2,1,5,0,1,400,0,7,2008,190000,4,0,3,3,4,3,4,3,5,3,3,1,867,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
638,10793,1969,1969,1907,0,4,1,7,0,1,462,0,4,2007,152000,3,0,3,3,3,3,3,3,5,3,2,1,495,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
731,10463,2005,2005,1801,0,3,1,8,1,1,800,0,6,2006,239900,4,4,3,3,4,3,4,3,5,3,2,0,116,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
549,8777,1949,2003,1126,0,2,1,5,0,1,520,0,5,2009,108000,4,0,3,3,3,3,0,0,5,2,2,0,96,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1118,14572,2004,2004,2830,1,3,1,7,1,1,630,0,11,2007,259000,4,4,3,3,4,3,4,3,5,3,3,1,1480,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136,15426,1997,1997,2585,1,3,1,7,0,1,470,0,8,2009,231500,4,0,3,3,3,3,4,3,5,4,3,1,1196,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
clean.shape

(1450, 53)

In [20]:

#clean.to_csv('data/clean_test.csv')