In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw training set into a DataFrame.
housing = pd.read_csv(filepath_or_buffer='../datasets/train.csv')

In [3]:
# Preview the first two rows of the freshly loaded frame.
housing.head(n=2)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,0,0,,,,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,0,0,,,,0,4,2009,WD,220000


In [4]:
# Normalise the column labels: lower-case every name, then swap each space
# and each forward slash for an underscore (both patterns are literal text).
housing.columns = (
    housing.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
)

#### Encoding Ordinal Columns

In [5]:
def _encode_ordinal(series, mapping):
    """Return `series` with its labels replaced by the codes in `mapping`.

    Parameters
    ----------
    series : pd.Series
        Column holding the raw category labels (or the codes written by an
        earlier run of this cell).
    mapping : dict
        Label -> integer code. Include an ``np.nan`` key to give missing
        values a code of their own.

    Returns
    -------
    pd.Series
        The encoded column. Labels that are not keys of `mapping` come back
        as NaN, which is how `Series.map` treats unmatched values.
    """
    present = series.dropna()
    # Re-running this cell must not push already-encoded integers through the
    # mapping a second time: none of them are keys of `mapping`, so every real
    # score would silently turn into NaN. A column whose non-missing values
    # are all valid codes is therefore returned untouched.
    if len(present) > 0 and present.isin(list(mapping.values())).all():
        return series
    return series.map(mapping)


# Dictionary of common scoring
# Scored: None (missing), Poor, Fair, Typical/Average, Good, Excellent
qual_dict = {np.nan:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

# Replacing strings with equivalent ints for modeling and analysis.
# Every column listed here shares the scale defined in qual_dict.
quality_columns = [
    'heating_qc',
    'exter_qual',
    'kitchen_qual',
    'fireplace_qu',
    'bsmt_cond',
    'bsmt_qual',
    'garage_qual',
]
for column in quality_columns:
    housing[column] = _encode_ordinal(housing[column], qual_dict)

# custom dictionary for functional column
# codes run from 1 ('Sal') up to 8 ('Typ'); there is no NaN key, so missing
# values in this column stay missing
functionality = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}
housing['functional'] = _encode_ordinal(housing['functional'], functionality)

# custom dictionary for basement finish type column
# missing values are scored 0, the remaining labels 1 ('Unf') to 6 ('GLQ')
basement_type = {np.nan:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
housing['bsmtfin_type_1'] = _encode_ordinal(housing['bsmtfin_type_1'], basement_type)

# binary encoding whether the house has central air
housing['central_air'] = _encode_ordinal(housing['central_air'], {'Y':1, 'N':0})

Write a .csv with the new column names and encoded data, to be used in modeling and analysis.

In [7]:
# Persist the renamed and encoded frame; the row index is left out of the file.
housing.to_csv(path_or_buf='../datasets/train_clean.csv', index=False)

Any missing values that remain after encoding will be filled using KNNImputer during modeling.

# Resources

Used this site to find out more about the Somerset area of Ames

https://www.somersetames.com/

<br>

Used this site to look up parcel numbers

https://www.iowatreasurers.org/index.php?module=parceldetail#searchresult

<br>

Used this site to learn more about Tornado Alley and Iowa's location in it

https://en.wikipedia.org/wiki/Tornado_Alley