In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('../data/Ames_Housing_Price_Data.csv', header = [0])
pd.set_option("display.max_columns", None)

# Data Cleaning Part 1 
## Handling Null Values

In [2]:
#Dividing lot area into bins of interval size of 1000. (Max - min)/1000 = 214 bins
data['lot_bucket'] = pd.cut(data['LotArea'],214)

#Here we fill NA LotFrontage values with the mean value of the corresponding bin. We first calculate the mean lot 
#frontage (mean_lf), merge the 2 dataframes, and then fill the NA values with the corresponding mean value.
mean_lf = data.groupby('lot_bucket').agg({'LotFrontage': np.mean}, ignorena = True)
data_new = pd.merge(data, mean_lf, on = 'lot_bucket', how = 'left')
data_new.rename(columns = {'LotFrontage_x':'LotFrontage', 'LotFrontage_y':'mean_LotFrontage'}, inplace = True)
data_new['LotFrontage'].fillna(round(data_new['mean_LotFrontage'],1), inplace = True)

#Replace most N/A values with none and MasVnrType and Electrical with the mode.
categorical_list = ['Alley', 'BsmtQual', 'BsmtExposure', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
                    'PoolQC', 'Fence', 'MiscFeature', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2', 'GarageQual',
                    'GarageCond']

numerical_list = ['MasVnrArea', 'BsmtFullBath','BsmtHalfBath','BsmtFinSF1','BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                  'GarageCars', 'GarageArea']

data_new['MasVnrType'].fillna(data_new['MasVnrType'].mode()[0], inplace = True)
data_new['Electrical'].fillna(data_new['Electrical'].mode()[0], inplace = True)

for el in categorical_list:
    data_new[el].fillna('None', inplace = True)

for el in numerical_list:
    data_new[el].fillna(0, inplace = True)


'''data_new['Alley'].fillna('None', inplace = True)
data_new['MasVnrType'].fillna(data_new['MasVnrType'].mode()[0], inplace = True)
data_new['MasVnrArea'].fillna(0, inplace = True)
data_new['BsmtQual'].fillna('None', inplace = True)
data_new['BsmtExposure'].fillna('None', inplace = True)
data_new['Electrical'].fillna(data_new['Electrical'].mode()[0], inplace = True)
data_new['BsmtFullBath'].fillna(0, inplace = True)
data_new['BsmtHalfBath'].fillna(0, inplace = True)
data_new['FireplaceQu'].fillna('None', inplace = True)
data_new['GarageType'].fillna('None', inplace = True)
data_new['GarageYrBlt'].fillna('None', inplace = True)
data_new['GarageFinish'].fillna('None', inplace = True)
data_new['PoolQC'].fillna('None', inplace = True)
data_new['Fence'].fillna('None', inplace = True)
data_new['MiscFeature'].fillna('None', inplace = True)
data_new['BsmtCond'].fillna('None', inplace = True)
data_new['BsmtFinType1'].fillna('None', inplace = True)
data_new['BsmtFinSF1'].fillna(0, inplace = True)
data_new['BsmtFinType2'].fillna('None', inplace = True)
data_new['BsmtFinSF2'].fillna(0, inplace = True)
data_new['BsmtUnfSF'].fillna(0, inplace = True)
data_new['TotalBsmtSF'].fillna(0, inplace = True)
data_new['GarageCars'].fillna(0, inplace = True)
data_new['GarageArea'].fillna(0, inplace = True)
data_new['GarageQual'].fillna('None', inplace = True)
data_new['GarageCond'].fillna('None', inplace = True)'''

#402.88 is the ratio of LotArea/LotFrontage for all lot areas > 20,000 to fill in N/A values that did not have
#another value in their lotfrontage bucket.
data_new['LotFrontage'].fillna(data_new['LotArea']/402.88, inplace = True)

#This one house had a garage type listed but all other garage features were NaN
data_new.at[data_new['PID'] == 910201180, 'GarageType'] = 'None'

data_new.to_csv('Ames_Housing_Price_Data_cleaned.csv')

# Data Cleaning Part 2
# Merging  Housing Price Data and Real Estate Data

In [3]:
housing_price_data = pd.read_csv('Ames_Housing_Price_Data_cleaned.csv')
real_estate_data = pd.read_csv('../data/Ames_Real_Estate_Data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
#We are dropping the rows we deemed unnecessary from real estate data

real_estate_data = real_estate_data.drop(['Tier',
'Range','ZngCdPr','ZngCdSc','ZngOLPr','ZngOLSc','ClassPr_S','ClassSc_S','Legal_Pr','SchD_S',
'TxD_S','MA_Ownr1','MA_Ownr2','MA_Line1','MA_Line2','MA_City','MA_State','MA_Zip1','MA_Zip2','Rcrd_Yr','Rcrd_Mo','Inst1_No','Inst1_Yr',
'Inst1_Mo','Inst1TPr','LndAc_S','X1TPr_D','X1TSc_D','X2TPr_D','X2TSc_D','X1TPr_S','X1TSc_S',
'X2TPr_S','X2TSc_S','X2TSc_S'],axis = 1)

In [5]:
#We are merging the two datasets by PID (in housing price data) 
#and MapRefNo (real estate data)

housing_price_data = housing_price_data.merge(real_estate_data, how = 'left',\
                                              left_on= 'PID', right_on='MapRefNo')

In [6]:
# Now we are going to merge the housing_price dataset with the geo_location dataset
geo_data = pd.read_csv('../charles/data/geo/neighborhood_location_data.csv')

housing_price_data = housing_price_data.merge(geo_data, how = 'left',\
                         left_on ='Neighborhood_x', right_on = 'Short_hand')

In [7]:
housing_price_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PID,GrLivArea,SalePrice_x,MSSubClass,MSZoning,LotFrontage,LotArea_x,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood_x,Condition1,Condition2,BldgType_x,HouseStyle_x,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl_x,Exterior1st,Exterior2nd,MasVnrType_x,MasVnrArea,ExterQual,ExterCond,Foundation_x,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating_x,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd_x,Functional,Fireplaces_x,FireplaceQu,GarageType_x,GarageYrBlt,GarageFinish,GarageCars,GarageArea_x,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea_x,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType_x,SaleCondition,lot_bucket,mean_LotFrontage,MapRefNo,GeoRefNo,Prop_Addr,ImpAc_S,OthAc_S,TtlVal_AsrYr,ValType,LndAcX1S,ImpAcX1S,ImpAcX2S,HSTtl_D,MilVal_D,HSTtl_S,MilVal_S,AcreX_S1,AcreGr,AcreNt_S,Neighborhood_y,LotArea_y,ParType,BldgNo_S,DwlgNo_S,BldgType_y,YrBuilt,HouseStyle_y,Foundation_y,RoofMatl_y,Ext1,Ext2,MasVnrType_y,Heating_y,Central Air,GLA,TtlBsmtSF,TotRmsAbvGrd_y,Fireplaces_y,PoolArea_y,GarageType_y,GarYrBlt,Cars,GarageArea_y,YrSold_YYYY,MoSold_MM,SalePrice_y,SaleType_y,SaleCond,ParclRel,PA-Nmbr,PA-PreD,PA-Strt,PA-StSfx,PA-PostD,PA-UnTyp,PA-UntNo,Date,Source,NmbrBRs,Short_hand,Full_Name,lat,long,distToUni
0,0,1,909176150,856,126000,30,RL,64.9,7890,Pave,,Reg,Lvl,AllPub,Corner,Gtl,SWISU,Norm,Norm,1Fam,1Story,6,6,1939,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,Rec,238.0,Unf,0.0,618.0,856.0,GasA,TA,Y,SBrkr,856,0,0,1.0,0.0,1,0,2,1,TA,4,Typ,1,Gd,Detchd,1939.0,Unf,2.0,399.0,TA,TA,Y,0,0,0,0,166,0,,,,0,3,2010,WD,Normal,"(7298.458, 8298.201]",64.939130,909176150.0,909176150.0,436 HAYWARD AVE,0.0,111700.0,149000.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,S&W ISU,7890.0,Dwg&Lot,0.0,1.0,1FamDet,1939.0,1-Story,C'Block,CompShg,Wd Sdng,Wd Sdng,,GasFWA,Yes,856.0,856.0,4.0,1.0,0.0,Detachd,1939.0,2.0,399.0,2017.0,2.0,155000.0,WRDConv,Normal,Deed,436,,HAYWARD,AVE,,,,10-Jul-20,Ames City Assessor,2.0,SWISU,South & West of Iowa State University,42.017899,-93.651731,0.66
1,1,2,905476230,1049,139500,120,RL,42.0,4235,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,1984,Gable,CompShg,HdBoard,HdBoard,BrkFace,149.0,Gd,TA,CBlock,Gd,TA,Mn,GLQ,552.0,ALQ,393.0,104.0,1049.0,GasA,TA,Y,SBrkr,1049,0,0,1.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1984.0,Fin,1.0,266.0,TA,TA,Y,0,105,0,0,0,0,,,,0,2,2009,WD,Normal,"(3299.486, 4299.229]",46.154930,905476230.0,905476230.0,3416 WEST ST,0.0,130600.0,174100.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,Edwards,4235.0,Dwg&Lot,0.0,1.0,Twnhs-E,1984.0,1-Story,C'Block,CompShg,HdBoard,HdBoard,BrkFace,GasFWA,Yes,1049.0,1049.0,5.0,0.0,0.0,Attachd,1984.0,1.0,266.0,2020.0,1.0,174400.0,WRDConv,Normal,Deed,3416,,WEST,ST,,,,10-Jul-20,Ames City Assessor,2.0,Edwards,Edwards,42.023035,-93.673386,1.39
2,1,2,905476230,1049,139500,120,RL,42.0,4235,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,TwnhsE,1Story,5,5,1984,1984,Gable,CompShg,HdBoard,HdBoard,BrkFace,149.0,Gd,TA,CBlock,Gd,TA,Mn,GLQ,552.0,ALQ,393.0,104.0,1049.0,GasA,TA,Y,SBrkr,1049,0,0,1.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1984.0,Fin,1.0,266.0,TA,TA,Y,0,105,0,0,0,0,,,,0,2,2009,WD,Normal,"(3299.486, 4299.229]",46.154930,905476230.0,905476230.0,3416 WEST ST,0.0,130600.0,174100.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,Edwards,4235.0,Dwg&Lot,0.0,1.0,Twnhs-E,1984.0,1-Story,C'Block,CompShg,HdBoard,HdBoard,BrkFace,GasFWA,Yes,1049.0,1049.0,5.0,0.0,0.0,Attachd,1984.0,1.0,266.0,2019.0,3.0,170000.0,WRDConv,Normal,Deed,3416,,WEST,ST,,,,10-Jul-20,Ames City Assessor,2.0,Edwards,Edwards,42.023035,-93.673386,1.39
3,2,3,911128020,1001,124900,30,C (all),60.0,6060,Pave,,Reg,Lvl,AllPub,Inside,Gtl,IDOTRR,Norm,Norm,1Fam,1Story,5,9,1930,2007,Hip,CompShg,MetalSd,MetalSd,,0.0,Gd,TA,BrkTil,TA,TA,No,ALQ,737.0,Unf,0.0,100.0,837.0,GasA,Ex,Y,SBrkr,1001,0,0,0.0,0.0,1,0,2,1,Gd,5,Typ,0,,Detchd,1930.0,Unf,1.0,216.0,TA,Po,N,154,0,42,86,0,0,,,,0,11,2007,WD,Normal,"(5298.972, 6298.715]",54.060150,911128020.0,911128020.0,320 S 2ND ST,0.0,123200.0,164300.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,IDOT&RR,6060.0,Dwg&Lot,0.0,1.0,1FamDet,1930.0,1-Story,Brk/Til,CompShg,MetalSd,MetalSd,,GasFWA,Yes,1001.0,837.0,5.0,0.0,0.0,Detachd,1930.0,1.0,216.0,,,,,,Deed,320,S,2ND,ST,,,,10-Jul-20,Ames City Assessor,2.0,IDOTRR,Iowa DOT and Rail Road,42.021605,-93.615604,1.64
4,3,4,535377150,1039,114000,70,RL,80.0,8146,Pave,,Reg,Lvl,AllPub,Corner,Gtl,OldTown,Norm,Norm,1Fam,2Story,4,8,1900,2003,Gable,CompShg,MetalSd,MetalSd,,0.0,Gd,Gd,BrkTil,Fa,TA,No,Unf,0.0,Unf,0.0,405.0,405.0,GasA,Gd,Y,SBrkr,717,322,0,0.0,0.0,1,0,2,1,TA,6,Typ,0,,Detchd,1940.0,Unf,1.0,281.0,TA,TA,N,0,0,168,0,111,0,,,,0,5,2009,WD,Normal,"(7298.458, 8298.201]",64.939130,535377150.0,535377150.0,1524 DOUGLAS AVE,0.0,93300.0,124400.0,F,0.0,0.0,0.0,,,4850.0,0.0,0.0,0.0,0.0,OldTown,8146.0,Dwg&Lot,0.0,1.0,1FamDet,1900.0,2-Story,Brk/Til,CompShg,MetalSd,MetalSd,,GasFWA,Yes,1039.0,405.0,6.0,0.0,0.0,Detachd,1940.0,1.0,281.0,2019.0,10.0,139000.0,WRDConv,Normal,Deed,1524.0,,DOUGLAS,AVE,,,,10-Jul-20,Ames City Assessor,2.0,OldTown,Old Town,42.033237,-93.617133,1.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,2575,759,903205040,952,121000,30,RL,68.7,8854,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrkSide,Norm,Norm,1Fam,1.5Unf,6,6,1916,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,BrkTil,TA,TA,No,Unf,0.0,Unf,0.0,952.0,952.0,Grav,Fa,N,FuseF,952,0,0,0.0,0.0,1,0,2,1,Fa,4,Typ,1,Gd,Detchd,1916.0,Unf,1.0,192.0,Fa,Po,P,0,98,0,0,40,0,,,,0,5,2009,WD,Normal,"(8298.201, 9297.944]",68.724832,903205040.0,903205040.0,1021 RIDGEWOOD AVE,0.0,103900.0,138500.0,F,0.0,0.0,0.0,4850.0,0.0,4850.0,0.0,0.0,0.0,0.0,BrkSide,8854.0,Dwg&Lot,0.0,1.0,1FamDet,1916.0,1.5 Unf,Brk/Til,CompShg,Wd Sdng,Wd Sdng,,Gravity,No,952.0,952.0,4.0,1.0,0.0,Detachd,1916.0,1.0,192.0,,,,,,Deed,1021,,RIDGEWOOD,AVE,,,,10-Jul-20,Ames City Assessor,2.0,BrkSide,Brookside,42.028240,-93.629960,0.87
2620,2576,760,905402060,1733,139600,20,RL,87.2,13680,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Edwards,Norm,Norm,1Fam,1Story,3,5,1955,1955,Hip,CompShg,BrkFace,Wd Sdng,,0.0,TA,TA,Slab,,,,,0.0,,0.0,0.0,0.0,GasA,Ex,Y,FuseA,1733,0,0,0.0,0.0,2,0,4,1,TA,8,Min2,1,Gd,Attchd,1955.0,Unf,2.0,452.0,TA,TA,Y,0,0,0,0,0,0,,,,0,6,2009,WD,Normal,"(13296.916, 14296.659]",87.200000,905402060.0,905402060.0,3619 MARY CIR,0.0,138800.0,185100.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,Edwards,13680.0,Dwg&Lot,0.0,1.0,1FamDet,1955.0,1-Story,Slab,CompShg,BrkFace,Wd Sdng,,GasFWA,Yes,1733.0,0.0,8.0,1.0,0.0,Attachd,1955.0,2.0,452.0,,,,,,Deed,3619,,MARY,CIR,,,,10-Jul-20,Ames City Assessor,4.0,Edwards,Edwards,42.023035,-93.673386,1.39
2621,2577,761,909275030,2002,145000,90,RH,82.0,6270,Pave,,Reg,HLS,AllPub,Inside,Gtl,Crawfor,Norm,Norm,Duplex,2Story,5,6,1949,1950,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,284.0,Unf,0.0,717.0,1001.0,GasA,TA,N,FuseA,1001,1001,0,0.0,0.0,2,0,4,2,TA,8,Typ,0,,2Types,1949.0,Unf,3.0,871.0,TA,TA,Y,0,0,0,0,0,0,,,,0,8,2007,WD,Normal,"(5298.972, 6298.715]",54.060150,909275030.0,909275030.0,2140 SUNSET DR 2142,0.0,137400.0,183200.0,F,0.0,0.0,0.0,,,,,0.0,0.0,0.0,Crawfor,6270.0,Dwg&Lot,0.0,1.0,Duplex,1949.0,2-Story,C'Block,CompShg,MetalSd,MetalSd,,GasFWA,No,2002.0,1001.0,8.0,0.0,0.0,No Data,0.0,0.0,0.0,,,,,,Deed,2140,,SUNSET,DR,,,2142,10-Jul-20,Ames City Assessor,4.0,Crawfor,Crawford,42.028112,-93.607153,2.04
2622,2578,762,907192040,1842,217500,60,RL,68.7,8826,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,144.0,Gd,TA,PConc,Gd,TA,No,GLQ,841.0,Unf,0.0,144.0,985.0,GasA,Ex,Y,SBrkr,985,857,0,1.0,0.0,2,1,3,1,Gd,7,Typ,1,TA,Attchd,2000.0,Fin,2.0,486.0,TA,TA,Y,193,96,0,0,0,0,,,,0,7,2007,WD,Normal,"(8298.201, 9297.944]",68.724832,907192040.0,907192040.0,5319 CLEMENS BLVD,0.0,198900.0,265200.0,F,0.0,0.0,0.0,4850.0,0.0,4850.0,0.0,0.0,0.0,0.0,CollgCr,8826.0,Dwg&Lot,0.0,1.0,1FamDet,2000.0,2-Story,P'Conc,CompShg,VinylSd,VinylSd,BrkFace,GasFWA,Yes,1842.0,985.0,7.0,1.0,0.0,Attachd,2000.0,2.0,486.0,,,,,,Deed,5319,,CLEMENS,BLVD,,,,10-Jul-20,Ames City Assessor,3.0,CollgCr,College Creek,42.021328,-93.685522,2.03


In [8]:
housing_price_data.to_csv('Ames_Housing_Price_Data_cleaned_2.csv')