# Feature Engineering
- cleaning, filtering, dropping data 
- dummifying and label encoding 
- combining related / dependent features 
- standardization / normalization 
- log of y 

In [29]:
import pandas as pd 
import numpy as np 

pd.set_option('display.max_columns', None)

# Comparing raw vs "cleaned" datasets

In [40]:
df_cleaned3 = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_3.csv')
df_cleaned2 = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned_2.csv')
df_cleaned = pd.read_csv('../communal/Ames_Housing_Price_Data_cleaned.csv')

In [41]:
#df_cleaned3.info() #59 features
#df_cleaned2.info() #84 features
#df_cleaned.info() #82 features, no lat or long

In [42]:
removed_features = ['RoofStyle','Exterior2nd','RoofMatl','Condition1','BsmtFinType2','LandContour',
                    'Street','Condition2','Functional','KitchenAbvGr','Alley','LotConfig',
                    'Utilities','Heating','LandSlope','3SsnPorch','EnclosedPorch','Fence',
                    'MiscFeature','MiscVal','OpenPorchSF','PoolArea','PoolQC','SaleCondition',
                    'SaleType','ScreenPorch']

#SaleCondition? 

# Filtering, & dropping features

In [43]:
df_housing = df_cleaned3

#### Dropping features and outliers

In [44]:
df_housing = df_housing.drop(['PID','Prop_Addr','mean_LotFrontage','lot_bucket','GarageYrBlt'],axis = 1)

#### Categorizing Features

In [45]:
#quick looks at our graphs 
# categorical = df_housing.filter(categorical_columns)
# numerical = df_housing.filter(numerical_columns)
# numerical_sf_area = df_housing.filter(regex ='SF$|Area$')
# numerical_not_SF = df_housing.drop(df_housing.filter(regex='SF$|Area$').columns,axis=1)

In [48]:
numerical_features = ['GrLivArea', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd','MasVnrArea', 
                     'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF','2ndFlrSF', 
                     'GarageArea','WoodDeckSF','TotalPorchSF', 'MoSold', 'YrSold',
                     'BsmtBath','Bath']

ordinal_features = ['OverallQual', 'OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure',
                'BsmtFinType1','HeatingQC','BedroomAbvGr','KitchenQual','TotRmsAbvGrd','Fireplaces',
                'FireplaceQu','GarageCars', 'GarageQual', 'GarageCond']

cat_features = ['MSSubClass', 'MSZoning', 'LotShape', 'Neighborhood','BldgType','HouseStyle', 'PavedDrive',
                 'Exterior1st','MasVnrType','Foundation','CentralAir', 
                 'Electrical','GarageType', 'GarageFinish']

In [49]:
df_housing[numerical_features] = df_housing[numerical_features].astype('float')

#### Combining related & dependent features

In [50]:
#combining number of bathrooms 
#turning half baths to .5
df_housing['BsmtHalfBath'] = df_housing['BsmtHalfBath'].replace([1], 0.5)
df_housing['HalfBath'] = df_housing['HalfBath'].replace([1], 0.5)

#adding half bathrooms to full baths
df_housing['BsmtBath'] = df_housing['BsmtHalfBath'] + df_housing['BsmtFullBath']
df_housing['Bath'] = df_housing['HalfBath'] + df_housing['FullBath']

df_housing.drop(['BsmtHalfBath','BsmtFullBath','HalfBath','FullBath'], axis = 1)

df_housing.head(2)

Unnamed: 0,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,MoSold,YrSold,lat,long,TotalPorchSF,BsmtBath,Bath
0,856.0,126000,30,RL,64.9,7890.0,Reg,SWISU,1Fam,1Story,6,6,1939.0,1950.0,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,Rec,238.0,0.0,618.0,856.0,TA,Y,SBrkr,856.0,0.0,0,1.0,0.0,1,0.0,2,TA,4,1,Gd,Detchd,Unf,2.0,399.0,TA,TA,Y,0.0,3.0,2010.0,42.01778,-93.651452,166.0,1.0,1.0
1,1049.0,139500,120,RL,42.0,4235.0,Reg,Edwards,TwnhsE,1Story,5,5,1984.0,1984.0,HdBoard,BrkFace,149.0,Gd,TA,CBlock,Gd,TA,Mn,GLQ,552.0,393.0,104.0,1049.0,TA,Y,SBrkr,1049.0,0.0,0,1.0,0.0,2,0.0,2,Gd,5,0,,Attchd,Fin,1.0,266.0,TA,TA,Y,0.0,2.0,2009.0,42.024697,-93.664186,105.0,1.0,2.0


#### Ordinal Features & Label Encoding
 - converting ordinal categories using integer encoding


In [57]:
## converting ordinal categories using integer encoding

ordinal_features = ['OverallQual', 'OverallCond','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure',
                'BsmtFinType1','HeatingQC','BedroomAbvGr','KitchenQual','TotRmsAbvGrd','Fireplaces',
                'FireplaceQu','GarageCars', 'GarageQual', 'GarageCond']

dict_1 = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2,'Po':1, 'None':0} #9
dict_2 = {'Gd':4, 'Av':3, 'Mn':2, 'No':1,'None':0}
dict_3 = {'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'None':0}

for col in ordinal_features: 
    unique_vals = df_housing[col].unique()
    if (set(unique_vals).issubset(set(dict_1.keys()))) == True:
        df_housing [col] = df_housing[col].replace(dict_1)
    elif (set(unique_vals).issubset(set(dict_2.keys()))) == True:   
        df_housing [col] = df_housing[col].replace(dict_2)
    elif (set(unique_vals).issubset(set(dict_3.keys()))) == True:   
        df_housing [col] = df_housing[col].replace(dict_3)

In [206]:
#check
#df_ordinals = df_housing.filter(ordinal_cols_list, axis=1)
#df_ordinals.isnull().sum(axis=0)

#### Categorical Features and Dummifying:  <span style="color: red;">*Dummified Dataset Here*</span>

In [53]:
#dummifying categorical features 
dummy = pd.get_dummies(df_housing, prefix= cat_features, 
                                  columns= cat_features, drop_first = True)
dummy = dummy.dropna()

In [54]:
dummy.describe()

Unnamed: 0,GrLivArea,SalePrice,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,LowQualFinSF,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,MoSold,YrSold,lat,long,TotalPorchSF,BsmtBath,Bath,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSZoning_C (all),MSZoning_FV,MSZoning_I (all),MSZoning_RH,MSZoning_RL,MSZoning_RM,LotShape_IR2,LotShape_IR3,LotShape_Reg,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_Greens,Neighborhood_GrnHill,Neighborhood_IDOTRR,Neighborhood_Landmrk,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,PavedDrive_P,PavedDrive_Y,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_PreCast,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,CentralAir_Y,Electrical_FuseF,Electrical_FuseP,Electrical_SBrkr,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf
count,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0,2624.0
mean,1482.796494,177875.010671,69.552314,10089.203125,6.040015,5.617378,1970.320122,1983.696265,98.773628,3.370427,3.092607,3.450076,2.924543,1.602515,3.601372,443.890625,53.129954,538.500762,1035.521341,4.133003,1143.209604,335.324314,4.262576,0.435595,0.031441,1.54878,0.195503,2.846799,3.47904,6.380335,0.599085,1.749238,1.744665,465.891387,2.814787,2.823933,95.888338,6.149009,2007.839558,42.034326,-93.642423,87.685595,0.467035,1.744284,0.047637,0.002287,0.00686,0.100229,0.193216,0.04497,0.008003,0.041921,0.016387,0.033155,0.06593,0.000381,0.047637,0.006098,0.02096,0.00686,0.043445,0.000762,0.008003,0.776296,0.164253,0.027439,0.004954,0.638338,0.003811,0.011433,0.039634,0.015244,0.091463,0.035823,0.064405,0.055259,0.003049,0.000762,0.030488,0.000381,0.014482,0.040015,0.158918,0.008765,0.047256,0.025534,0.046875,0.082698,0.016006,0.053735,0.043445,0.055259,0.017149,0.020579,0.008765,0.021341,0.033155,0.040015,0.078125,0.007622,0.5,0.002287,0.008384,0.299162,0.026677,0.045732,0.023247,0.905107,0.000762,0.001905,0.030488,0.000762,0.038491,0.160823,0.000381,0.158537,0.075457,0.000381,0.014101,0.336509,0.148247,0.019817,0.310976,0.610137,0.071265,0.443979,0.42721,0.016006,0.003811,0.001905,0.937119,0.015244,0.002668,0.917302,0.591463,0.01029,0.05907,0.00343,0.278201,0.049543,0.049924,0.278582,0.432165
std,488.843164,75776.355321,24.149991,8079.41949,1.37468,1.120265,29.675273,20.492254,176.089878,0.561147,0.375831,0.882641,0.562204,1.05986,2.089773,429.316938,173.725947,424.756446,422.130884,0.957858,377.285899,422.843264,44.260881,0.518479,0.130479,0.545231,0.28309,0.821094,0.640139,1.536495,0.648244,1.786871,0.7386,207.112178,0.690877,0.686079,128.871453,2.666688,1.314666,0.018107,0.026015,106.9593,0.515449,0.657203,0.213038,0.047773,0.082555,0.300362,0.394897,0.207277,0.089118,0.200446,0.126983,0.179077,0.248207,0.019522,0.213038,0.077863,0.143279,0.082555,0.203896,0.027603,0.089118,0.416806,0.370576,0.16339,0.070225,0.480573,0.061627,0.106332,0.195135,0.122545,0.288322,0.185884,0.245521,0.228529,0.055142,0.027603,0.171958,0.019522,0.119488,0.196032,0.365669,0.093229,0.212227,0.157769,0.211411,0.275478,0.125523,0.225537,0.203896,0.228529,0.129853,0.141998,0.093229,0.144547,0.179077,0.196032,0.268419,0.086987,0.500095,0.047773,0.091198,0.457978,0.161168,0.208942,0.150716,0.293123,0.027603,0.043619,0.171958,0.027603,0.192415,0.367438,0.019522,0.365313,0.264178,0.019522,0.117928,0.472605,0.355412,0.139398,0.462981,0.487812,0.257316,0.496946,0.494768,0.125523,0.061627,0.043619,0.242795,0.122545,0.051591,0.275478,0.491657,0.100934,0.235801,0.058476,0.448198,0.21704,0.217829,0.448387,0.495471
min,334.0,12789.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,41.986321,-93.693158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1107.0,129887.5,59.6,7386.0,5.0,5.0,1953.0,1965.0,0.0,3.0,3.0,3.0,3.0,1.0,1.0,0.0,0.0,213.75,788.0,3.0,868.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,3.0,5.0,0.0,0.0,1.0,315.75,3.0,3.0,0.0,4.0,2007.0,42.021961,-93.660951,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1432.0,159000.0,69.0,9368.5,6.0,5.0,1972.0,1992.0,0.0,3.0,3.0,3.0,3.0,1.0,4.0,384.0,0.0,448.0,976.0,4.0,1068.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,3.0,6.0,1.0,1.0,2.0,474.0,3.0,3.0,0.0,6.0,2008.0,42.034775,-93.640741,50.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1728.0,208925.0,80.0,11475.75,7.0,6.0,1999.0,2003.0,156.0,4.0,3.0,4.0,3.0,2.0,6.0,732.0,0.0,784.0,1262.5,5.0,1362.25,702.0,0.0,1.0,0.0,2.0,0.5,3.0,4.0,7.0,1.0,4.0,2.0,576.0,3.0,3.0,168.0,8.0,2009.0,42.049071,-93.621658,133.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,4676.0,755000.0,408.707307,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5.0,5.0,5.0,5.0,4.0,6.0,2288.0,1526.0,2336.0,3206.0,5.0,3820.0,1872.0,1064.0,3.0,2.0,4.0,2.0,6.0,5.0,13.0,4.0,5.0,5.0,1488.0,5.0,5.0,1424.0,12.0,2010.0,42.063401,-93.57949,1207.0,3.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### <span style="color: red;"> Saving dummified data</span>

In [56]:
dummy.to_csv('../nicole/data/dummy_nicole_feature_engineered.csv', index = False)