# The Ames Housing Dataset

The Ames Housing Dataset contains data for 2580 house sales, with 81 descriptive variables for each house.

In [55]:
import pandas as pd
from datetime import date

In [56]:
# Load the raw sales data; the first CSV column is used as the row index.
housing = pd.read_csv(
    'data/Ames_HousePrice.csv',
    index_col=0,
)

In [57]:
# List every column name in the freshly loaded frame.
housing.columns

Index(['PID', 'GrLivArea', 'SalePrice', 'MSSubClass', 'MSZoning',
       'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond

In [58]:
#housing.info()
#housing.describe()
#housing.head()

# Handling Missing or Duplicate Data

In [59]:
# Compare the number of distinct PIDs with the number of rows; a gap means some PID repeats.
print(len(housing['PID'].unique()))
print(len(housing))

# Show the rows that exactly repeat an earlier row, then remove them (first occurrence is kept).
duplicates = housing.duplicated()
print(housing.loc[duplicates])
housing = housing.drop_duplicates()

# Row count after de-duplication.
print(housing.shape[0])

2579
2580
         PID  GrLivArea  SalePrice  MSSubClass MSZoning  LotFrontage  LotArea  \
1  909276070       1717     194000          50       RL         80.0    12400   

  Street Alley LotShape  ... ScreenPorch PoolArea PoolQC Fence MiscFeature  \
1   Pave   NaN      Reg  ...         113        0    NaN   NaN         NaN   

  MiscVal MoSold YrSold SaleType  SaleCondition  
1       0      2   2006      WD          Normal  

[1 rows x 81 columns]
2579


Here we look at variables that need to be removed because of missing values.

In [60]:
# Count NaNs per column, keep only columns that have any, and rank them worst-first.
missing_values = housing.isnull().sum()
has_missing = missing_values.gt(0)
missing_values_features = missing_values.loc[has_missing].sort_values(ascending=False)
missing_values_features
# Optional visual: missing_values_features.plot(kind='bar', figsize=(10,6))

PoolQC          2570
MiscFeature     2482
Alley           2411
Fence           2054
FireplaceQu     1241
LotFrontage      462
GarageCond       129
GarageQual       129
GarageFinish     129
GarageYrBlt      129
GarageType       127
BsmtExposure      71
BsmtFinType2      70
BsmtFinType1      69
BsmtQual          69
BsmtCond          69
MasVnrArea        14
MasVnrType        14
BsmtHalfBath       2
BsmtFullBath       2
TotalBsmtSF        1
BsmtUnfSF          1
GarageCars         1
GarageArea         1
BsmtFinSF2         1
BsmtFinSF1         1
Electrical         1
dtype: int64

Here we see that PoolQC, MiscFeature, Alley, and Fence are missing for most houses (each is NaN in at least 2054 of the 2579 rows), so we drop them from the housing dataset. This leaves 77 features.

In [61]:
# Remove the four features that are missing for most houses.
housing = housing.drop(
    ['PoolQC', 'MiscFeature', 'Alley', 'Fence'],
    axis='columns',
)

# Adding Features

In [62]:
# Calculate the YearsSinceRemod column: years elapsed between the last remodel and the sale.
# The previous formula (YearRemodAdd - YearBuilt) measured the gap from construction to
# remodel, which does not match the column name.
# Clipped at 0 in case a sale is recorded before its remodel year (would otherwise be negative).
housing['YearsSinceRemod'] = (housing['YrSold'] - housing['YearRemodAdd']).clip(lower=0)

In [63]:
# Create the 'TotalSF' column by summing the square-footage ('SF') columns.
# Excluded from the sum:
#   - BsmtFinSF1 / BsmtFinSF2 / BsmtUnfSF: TotalBsmtSF is also summed, so adding the
#     basement sub-areas as well would count the basement twice
#     (assumes TotalBsmtSF is the total of those three -- confirm against the data dictionary).
#   - TotalSF / PricePerSF: derived columns whose names also contain 'SF'; without this
#     guard, re-running the cell would fold them into the total.
sf_excluded = {'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalSF', 'PricePerSF'}
sf_columns = [col for col in housing.columns if 'SF' in col and col not in sf_excluded]
housing['TotalSF'] = housing[sf_columns].sum(axis=1)

In [64]:
# Sale price divided by GrLivArea, row by row.
housing['PricePerSF'] = housing['SalePrice'].div(housing['GrLivArea'])

In [67]:
# Pair each row's sale year with its sale month, then build a date from the pair.
# The day of month is not taken from the data; it is fixed to the 15th.
date_sold = list(zip(housing['YrSold'], housing['MoSold']))
housing['DateSold'] = [
    date(year, month, 15)
    for year, month in date_sold
]

# Filling Missing Data

Fill NaN with 'None' for categorical columns, and fill NaN with 0 for numerical features.

In [68]:
# Fill missing values by column type: 0 for numeric columns, 'None' for everything else.
# Columns are split on "is numeric" rather than on dtype == 'object', so text columns
# stored with pandas' dedicated string dtype are still filled with 'None' instead of
# being routed to fillna(0).
numeric_cols = housing.select_dtypes(include='number').columns
categorical_cols = housing.columns.difference(numeric_cols)

housing[numeric_cols] = housing[numeric_cols].fillna(0)
housing[categorical_cols] = housing[categorical_cols].fillna('None')


Notes on Categorical variables

In [69]:
#The first "object" encountered was MSZoning
# possibly drop I, C, A (industrial, commercial, agri)
# https://www.zoneomics.com/zoning-maps/iowa/ames

#Condition1 and Condition2 have the same type data, are they identical for all houses?
#Same with: Exterior1st, Exterior2nd + ExterQual, ExterCond + BsmtQual, BsmtCond + BsmtFinType1, BsmtFinType2 + GarageQual,Cond


In [70]:
# Write the cleaned frame to disk; the row index is not written.
housing.to_csv(
    'data/housing_final.csv',
    index=False,
)