# The Ames Housing Dataset

The Ames Housing Dataset (the file Ames_HousePrice) has data for 2580 house sales, with 81 descriptive variables for each house. We load the 'ames final' file, which has been merged with the Real Estate file, gaining 7 new variables (latitude, longitude, distance, etc.).

In [35]:
import pandas as pd
from datetime import date

In [36]:
# Read the merged Ames / real-estate CSV into the working DataFrame.
ames_final_csv = 'data/ames_final.csv'
housing = pd.read_csv(ames_final_csv)

In [37]:
# Quick size check. The house count is the number of non-null SalePrice
# entries; the feature count is the number of columns in the frame.
n_houses = housing['SalePrice'].count()
n_features = housing.shape[1]
print("Number of houses: ", n_houses)
print("Number of features in dataset: ", n_features)

Number of houses:  2579
Number of features in dataset:  86


# Handling Missing Data

In [38]:
# Handling these duplicates is no longer necessary;
#   it's done in the real estate preprocessing

#drop these duplicates
#print(len(housing.PID.unique()))
#print(len(housing))

#duplicates = housing.duplicated()
#print("Number of duplicates: ", len(housing[duplicates]))
#housing = housing.drop_duplicates()

#print(len(housing))

In [39]:
# Count NaNs per column and keep only the columns that have at least one,
# ordered from most to fewest missing values.
# These are candidates for removal; for now they all stay in, to see
# whether they add value to the model.
missing_values = housing.isna().sum()
missing_values_features = (
    missing_values
    .loc[missing_values.gt(0)]
    .sort_values(ascending=False)
)
missing_values_features
# Optional bar chart of the same counts:
#missing_values_features.plot(kind='bar', figsize=(10,6))

PoolQC           2570
MiscFeature      2482
Alley            2411
Fence            2054
FireplaceQu      1241
LotFrontage       462
GarageFinish      129
GarageCond        129
GarageQual        129
GarageYrBlt       129
GarageType        127
longitude          82
latitude           82
DistanceToISU      82
BsmtExposure       71
BsmtFinType2       70
BsmtQual           69
BsmtCond           69
BsmtFinType1       69
MA_Zip1            22
Prop_Addr          20
MasVnrType         14
MasVnrArea         14
BsmtFullBath        2
BsmtHalfBath        2
BsmtUnfSF           1
GarageArea          1
GarageCars          1
TotalBsmtSF         1
Electrical          1
BsmtFinSF1          1
BsmtFinSF2          1
dtype: int64

In [40]:
# PoolQC, MiscFeature, Alley, and Fence all have mostly missing values,
#   so they are candidates for dropping from the housing dataset
#   (leaving 77 of the original 81 variables).
# For now, we leave them in to explore the model; the drop below stays disabled.
#housing=housing.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'])

In [41]:
#housing.info()
#housing.describe()
#housing.head()

# Filling Missing Data

In [42]:
# Replace every NaN in the frame: object (string) columns get the literal
# 'None', every other column gets 0.
# NOTE(review): the 0 fill presumably also applies to latitude, longitude and
# DistanceToISU (82 missing each in the summary above) if those columns are
# numeric; confirm that 0 is an acceptable placeholder for them.
for column in housing.columns:
    placeholder = 'None' if housing[column].dtype == 'object' else 0
    housing[column] = housing[column].fillna(placeholder)


In [43]:
# Keep only the residential zoning classes; drop I, C, A
# (industrial, commercial, agri).
# Found 1 row with A, 17 with C, 2 with I
# https://www.zoneomics.com/zoning-maps/iowa/ames
residential_zones = ['FV', 'RH', 'RL', 'RM']

# .copy() detaches the filtered rows from the unfiltered frame. Without it,
# the column assignments in the following cells are made on a slice and
# pandas emits SettingWithCopyWarning.
housing = housing[housing['MSZoning'].isin(residential_zones)].copy()

# Sanity check: remaining zoning classes and remaining row count.
print(housing['MSZoning'].unique())
print(housing['MSZoning'].count())

['RL' 'RM' 'FV' 'RH']
2559


# Adding Features

In [44]:
# YearsSinceRemod: remodel year minus construction year.
# NOTE(review): as computed this is the gap between construction and remodel,
# not the time elapsed since the remodel. Confirm that this is the intended
# feature (versus YrSold - YearRemodAdd) before relying on the name.
housing['YearsSinceRemod'] = housing['YearRemodAdd'].sub(housing['YearBuilt'])

# TotalSF: row-wise sum of basement, both floors and low-quality finished area.
# An explicit list is used rather than every column whose name contains 'SF'.
sf_columns = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF']
housing['TotalSF'] = housing.loc[:, sf_columns].sum(axis='columns')

# PricePerSF is deliberately not created: it is derived from SalePrice and
# causes data leakage if included in the modeling step.
#housing['PricePerSF'] = housing['SalePrice'] / housing['GrLivArea']

# DateSold: only year and month of sale are available, so the day is pinned
# to the 15th of the month.
date_sold = list(zip(housing['YrSold'], housing['MoSold']))
housing['DateSold'] = [
    date(sale_year, sale_month, 15) for sale_year, sale_month in date_sold
]

# Mapping Data for Ordinal Categorical Values

In [45]:
# Integer encodings for the ordered quality/condition scales.
# The 'None' key is the placeholder string used for missing categorical values.
dict_map5 = dict(Po=1, Fa=2, TA=3, Gd=4, Ex=5)
dict_map05 = {'None': 0, **dict_map5}
dict_map04 = dict(zip(['None', 'Fa', 'TA', 'Gd', 'Ex'], range(5)))
dict_map_slope = dict(Sev=1, Mod=2, Gtl=3)

# Which encoding applies to which column.
ordinal_encodings = {
    'ExterQual': dict_map5,
    'ExterCond': dict_map5,
    'BsmtQual': dict_map05,
    'BsmtCond': dict_map05,
    'HeatingQC': dict_map5,
    'KitchenQual': dict_map5,
    'FireplaceQu': dict_map05,
    'GarageQual': dict_map05,
    'GarageCond': dict_map05,
    'PoolQC': dict_map04,
    'LandSlope': dict_map_slope,
}

# Replace each label with its integer rank. Labels that are not keys of the
# mapping come out as NaN, so this cell must run once per freshly loaded frame:
# re-running it on already-encoded columns would turn them into NaN.
for column, encoding in ordinal_encodings.items():
    housing[column] = housing[column].map(encoding)

# Sorting Features Into Numerical/Categorical

In [46]:
# Sort every column into one of three feature lists, by name first and then
# by dtype / number of distinct values.
numerical_features = []
cat_ord_features = []  # ordinal features, can be ranked from hi to lo
cat_nom_features = []  # nominal features, cannot be ranked

# Columns whose list is fixed by name, regardless of dtype or cardinality.
forced_numerical = ('PoolArea', 'DateSold')
forced_nominal = ('MSSubClass', 'MoSold', 'YrSold')
forced_ordinal = ('LandSlope',)
# Columns deliberately left out of all three feature lists.
not_features = ('MA_Zip1', 'latitude', 'longitude', 'Prop_Addr')

for feature in housing.columns:
    if feature in not_features:
        continue

    if feature in forced_numerical:
        bucket = numerical_features
    elif feature in forced_nominal:
        bucket = cat_nom_features
    elif feature in forced_ordinal:
        bucket = cat_ord_features
    elif housing[feature].dtype == 'object':
        bucket = cat_nom_features
    elif len(housing[feature].unique()) < 20:
        # Non-object columns with fewer than 20 distinct values are treated
        # as ordinal categories rather than continuous measurements.
        bucket = cat_ord_features
    else:
        bucket = numerical_features
    bucket.append(feature)

# Report each list together with the distinct values of its members.
print("Number of numerical features: ", len(numerical_features))
for feature in numerical_features:
    print("Numerical\t", feature, "\t Number of unique values: ", len(housing[feature].unique()))

print("Number of ordinal features: ", len(cat_ord_features))
for feature in cat_ord_features:
    print("Categorical, ordinal\t\t", feature, housing[feature].unique())

print("Number of nominal features: ", len(cat_nom_features))
for feature in cat_nom_features:
    print("Categorical, nom\t\t", feature, housing[feature].unique())

tot = sum(
    len(group)
    for group in (numerical_features, cat_ord_features, cat_nom_features)
)
print("Total number of features: ", tot)

Number of numerical features:  27
Numerical	 GrLivArea 	 Number of unique values:  1209
Numerical	 SalePrice 	 Number of unique values:  861
Numerical	 LotFrontage 	 Number of unique values:  127
Numerical	 LotArea 	 Number of unique values:  1744
Numerical	 YearBuilt 	 Number of unique values:  114
Numerical	 YearRemodAdd 	 Number of unique values:  61
Numerical	 MasVnrArea 	 Number of unique values:  416
Numerical	 BsmtFinSF1 	 Number of unique values:  926
Numerical	 BsmtFinSF2 	 Number of unique values:  264
Numerical	 BsmtUnfSF 	 Number of unique values:  1058
Numerical	 TotalBsmtSF 	 Number of unique values:  985
Numerical	 1stFlrSF 	 Number of unique values:  1022
Numerical	 2ndFlrSF 	 Number of unique values:  588
Numerical	 LowQualFinSF 	 Number of unique values:  30
Numerical	 GarageYrBlt 	 Number of unique values:  100
Numerical	 GarageArea 	 Number of unique values:  571
Numerical	 WoodDeckSF 	 Number of unique values:  371
Numerical	 OpenPorchSF 	 Number of unique values: 

In [51]:
#numerical_features
#cat_ord_features
#cat_nom_features

In [48]:
# Write the cleaned, feature-augmented frame to disk without the row index.
housing_final_csv = 'data/housing_final.csv'
housing.to_csv(housing_final_csv, index=False)