In [2]:
import pandas as pd
import numpy as np

data = pd.read_csv('../data/Ames_Housing_Price_Data.csv', header = [0])
pd.set_option("display.max_columns", None)

# Data Cleaning Part 1 
## Handling Null Values

In [108]:
#Dividing lot area into bins of interval size of 1000. (Max - min)/1000 = 214 bins
data['lot_bucket'] = pd.cut(data['LotArea'],214)

#Here we fill NA LotFrontage values with the mean value of the corresponding bin. We first calculate the mean lot 
#frontage (mean_lf), merge the 2 dataframes, and then fill the NA values with the corresponding mean value.
mean_lf = data.groupby('lot_bucket').agg({'LotFrontage': np.mean}, ignorena = True)
data_new = pd.merge(data, mean_lf, on = 'lot_bucket', how = 'left')
data_new.rename(columns = {'LotFrontage_x':'LotFrontage', 'LotFrontage_y':'mean_LotFrontage'}, inplace = True)
data_new['LotFrontage'].fillna(round(data_new['mean_LotFrontage'],1), inplace = True)

#Replace most N/A values with none and MasVnrType and Electrical with the mode.
categorical_list = ['Alley', 'BsmtQual', 'BsmtExposure', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
                    'PoolQC', 'Fence', 'MiscFeature', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'GarageQual',
                    'GarageCond']

numerical_list = ['MasVnrArea', 'BsmtFullBath','BsmtHalfBath','BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'GarageCars', 'GarageArea']

data_new['MasVnrType'].fillna(data_new['MasVnrType'].mode()[0], inplace = True)
data_new['Electrical'].fillna(data_new['Electrical'].mode()[0], inplace = True)

for el in categorical_list:
    data_new[el].fillna('None', inplace = True)

for el in numerical_list:
    data_new[el].fillna(0, inplace = True)


'''data_new['Alley'].fillna('None', inplace = True)
data_new['MasVnrType'].fillna(data_new['MasVnrType'].mode()[0], inplace = True)
data_new['MasVnrArea'].fillna(0, inplace = True)
data_new['BsmtQual'].fillna('None', inplace = True)
data_new['BsmtExposure'].fillna('None', inplace = True)
data_new['Electrical'].fillna(data_new['Electrical'].mode()[0], inplace = True)
data_new['BsmtFullBath'].fillna(0, inplace = True)
data_new['BsmtHalfBath'].fillna(0, inplace = True)
data_new['FireplaceQu'].fillna('None', inplace = True)
data_new['GarageType'].fillna('None', inplace = True)
data_new['GarageYrBlt'].fillna('None', inplace = True)
data_new['GarageFinish'].fillna('None', inplace = True)
data_new['PoolQC'].fillna('None', inplace = True)
data_new['Fence'].fillna('None', inplace = True)
data_new['MiscFeature'].fillna('None', inplace = True)
data_new['BsmtCond'].fillna('None', inplace = True)
data_new['BsmtFinType1'].fillna('None', inplace = True)
data_new['BsmtFinSF1'].fillna(0, inplace = True)
data_new['BsmtFinType2'].fillna('None', inplace = True)
data_new['BsmtFinSF2'].fillna(0, inplace = True)
data_new['BsmtUnfSF'].fillna(0, inplace = True)
data_new['TotalBsmtSF'].fillna(0, inplace = True)
data_new['GarageCars'].fillna(0, inplace = True)
data_new['GarageArea'].fillna(0, inplace = True)
data_new['GarageQual'].fillna('None', inplace = True)
data_new['GarageCond'].fillna('None', inplace = True)'''

#402.88 is the ratio of LotArea/LotFrontage for all lot areas > 20,000 to fill in N/A values that did not have
#another value in their lotfrontage bucket.
data_new['LotFrontage'].fillna(data_new['LotArea']/402.88, inplace = True)

#This one house had a garage type listed but all other garage features were NaN
data_new.at[data_new['PID'] == 910201180, 'GarageType'] = 'None'

data_new.to_csv('Ames_Housing_Price_Data_cleaned.csv')

# Data Cleaning Part 2
# Merging  Housing Price Data and Real Estate Data

In [23]:
housing_price_data = pd.read_csv('Ames_Housing_Price_Data_cleaned.csv')
real_estate_data = pd.read_csv('../data/Ames_Real_Estate_Data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [24]:
#We are dropping the rows we deemed unnecessary from real estate data

real_estate_data = real_estate_data.drop(['Tier',
'Range','ZngCdPr','ZngCdSc','ZngOLPr','ZngOLSc','ClassPr_S','ClassSc_S','Legal_Pr','SchD_S',
'TxD_S','MA_Ownr1','MA_Ownr2','MA_Line1','MA_Line2','MA_City','MA_State','MA_Zip1','MA_Zip2','Rcrd_Yr','Rcrd_Mo','Inst1_No','Inst1_Yr',
'Inst1_Mo','Inst1TPr','LndAc_S','X1TPr_D','X1TSc_D','X2TPr_D','X2TSc_D','X1TPr_S','X1TSc_S',
'X2TPr_S','X2TSc_S','X2TSc_S'],axis = 1)

In [25]:
#We are merging the two datasets by PID (in housing price data) 
#and MapRefNo (real estate data)

housing_price_data = housing_price_data.merge(real_estate_data, how = 'left',\
                                              left_on= 'PID', right_on='MapRefNo')

In [26]:
# Now we are going to merge the housing_price dataset with the geo_location dataset
geo_data = pd.read_csv('../charles/data/geo/neighborhood_location_data.csv')

housing_price_data = housing_price_data.merge(geo_data, how = 'left',\
                         left_on ='Neighborhood_x', right_on = 'Short_hand')

In [28]:
housing_price_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PID,GrLivArea,SalePrice_x,MSSubClass,MSZoning,LotFrontage,LotArea_x,Street,...,PA-UnTyp,PA-UntNo,Date,Source,NmbrBRs,Short_hand,Full_Name,lat,long,distToUni
0,0,1,909176150,856,126000,30,RL,64.9,7890,Pave,...,,,10-Jul-20,Ames City Assessor,2.0,SWISU,South & West of Iowa State University,,,inf
1,1,2,905476230,1049,139500,120,RL,42.0,4235,Pave,...,,,10-Jul-20,Ames City Assessor,2.0,Edwards,Edwards,42.023035,-93.673386,1.39
2,1,2,905476230,1049,139500,120,RL,42.0,4235,Pave,...,,,10-Jul-20,Ames City Assessor,2.0,Edwards,Edwards,42.023035,-93.673386,1.39
3,2,3,911128020,1001,124900,30,C (all),60.0,6060,Pave,...,,,10-Jul-20,Ames City Assessor,2.0,IDOTRR,Iowa DOT and Rail Road,,,inf
4,3,4,535377150,1039,114000,70,RL,80.0,8146,Pave,...,,,10-Jul-20,Ames City Assessor,2.0,OldTown,Old Town,42.033237,-93.617133,1.59


In [29]:
housing_price_data.to_csv('Ames_Housing_Price_Data_cleaned_2.csv')