# 02_data_cleaning: Clean Data/Prep for ML
Date: 2022-06-08

## Load Packages and Data

In [161]:
import os
#os.chdir('..')
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [27]:
#read in dataset
data = pd.read_csv('data/train.csv')
#separate predictors from target values 
training = data.drop('SalePrice', axis = 1) 
prices = data['SalePrice'].copy()

## Develop Cleaning Plan 

In [12]:
# see percentage of missing values 
percent_null = training.isnull().mean()

# see columns with more than 5% missing 
percent_null[percent_null > 0.05]

LotFrontage     0.177397
Alley           0.937671
FireplaceQu     0.472603
GarageType      0.055479
GarageYrBlt     0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
PoolQC          0.995205
Fence           0.807534
MiscFeature     0.963014
dtype: float64

**Review how to process these columns:**

Working with NA data: 
- Drop: 
    - `GarageYrBlt`: This variable is very correlated with `YearBuilt` and it doesn't make sense to impute NA values with anything. 
- Needs Imputation: 
    - `LotFrontage` since it seems like this might be correlated with price (but don't want to drop 18% of data) 
- Based on the data_description.txt file, it looks like many columns will be fixed if we change NA values to a category (not having that characteristic)
    - Garage data sets NA when there is no garage, so we will want to treat NA as a flag for no garage. 
    - Similarly, `FireplaceQu` only applies for homes with a Fireplace. Add a flag for no fireplace. 
    - `Alley` - change to flag of no alley
    - `Pool` - change to flag of No Pool 
    - `Fence` - change to flag of No Fence
    - `MiscFeature` - Only One Hot Encode non-NA since it other categories indicate having features such as elevator, shed, etc. and NA means there are no extra features.

Columns to One-Hot Encode that are currently numeric:
- `MSSubClass`: Numbers refer to differnt types of construction and should be categorical. 
- `YearRemodAdd`: Change to Binary if there is a remodel or not. (Based on scatter plots from 01 notebook that if there is no remodel, it seems this is just the same age as the construction year). 

## Clean Data 

Numeric: 
- Create a custom transformer for the YearRemodAdd 
- Use `SimpleImputer()` to impute variables like LotFrontage. Use median (better against outliers). 
- Scale the data using `MinMaxScaler()` 

Categorical: 
- Change numeric variables to categorical where applicable (ie., MSSubClass)
- Change NAs to a new category of not having the characteristic 

### Numeric 

In [139]:
#drop these columns from training set 
drop_columns = ['Id', 'GarageYrBlt'] 
training = training.drop(drop_columns, axis = 1)

In [76]:
# create transformer to get if there is a remodel or not 
class CleanRemodels(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y = None): 
        return self 
    
    def transform(self, X): 
        
        year_built = X['YearBuilt'].tolist()
        year_remodel = X['YearRemodAdd'].tolist()
        
        # If remodel year is after the built year, than the house was remodeled. 
        remodel = [1 if year_remodel[i] > year_built[i] else 0 for i in range(X.shape[0])]
        
        #Add the new binary column for whether there was a remodel, and remove the 'YearRemodAdd' column
        X['Remodeled'] = remodel
        X.drop('YearRemodAdd', axis = 1)
        
        return X
    

In [136]:
# create Pipeline for numeric variables 
# Note: since transfomer function returns array instead of dataframe, had to make CleanRemodel() function run first. 

numeric_pipeline = Pipeline(steps = [
    ('postprocess', CleanRemodels()),
    ('impute', SimpleImputer(strategy = 'median')),
    ('scale', MinMaxScaler())
])

# test the pipeline 

# get only numeric variables (and omit the MSSubClass variable since we'll make it categorical and Id since it's irrelevant)
train_num = training.select_dtypes(include = 'number').drop(['MSSubClass'], axis = 1)

housing_num = numeric_pipeline.fit_transform(train_num)

In [137]:
housing_num[:2]

array([[0.15068493, 0.0334198 , 0.66666667, 0.5       , 0.94927536,
        0.88333333, 0.1225    , 0.12508859, 0.        , 0.06421233,
        0.1400982 , 0.11977972, 0.41355932, 0.        , 0.25923135,
        0.33333333, 0.        , 0.66666667, 0.5       , 0.375     ,
        0.33333333, 0.5       , 0.        , 0.5       , 0.3864598 ,
        0.        , 0.11151737, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.09090909, 0.5       , 0.55555556,
        0.        ],
       [0.20205479, 0.03879502, 0.55555556, 0.875     , 0.75362319,
        0.43333333, 0.        , 0.17328136, 0.        , 0.12157534,
        0.20654664, 0.21294172, 0.        , 0.        , 0.17483044,
        0.        , 0.5       , 0.66666667, 0.        , 0.375     ,
        0.33333333, 0.33333333, 0.33333333, 0.5       , 0.32440056,
        0.34772462, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.36363636, 0.25      , 0.66666667,
        0.        ]])

### Categorical

In [150]:
# get all categorical variables 
num_columns = train_num.columns.to_list()
train_cat = training.drop(num_columns, axis = 1)

# change MSSubClass to categorical from numeric
train_cat['MSSubClass'] = train_cat['MSSubClass'].apply(str) 

# change NA values to 'None' for each of columns below
columns_to_change_null_values = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 
                                 'PoolQC', 'Fence', 'MiscFeature', 'Alley', 'FireplaceQu',
                                'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                'BsmtFinType2', 'Electrical']

for column in columns_to_change_null_values: 
    train_cat.loc[train_cat[column].isnull(), column] = 'None'


In [151]:
# check that NA values are gone 
for column in columns_to_change_null_values: 
    print(column)
    print(train_cat[column].unique())

GarageType
['Attchd' 'Detchd' 'BuiltIn' 'CarPort' 'None' 'Basment' '2Types']
GarageFinish
['RFn' 'Unf' 'Fin' 'None']
GarageQual
['TA' 'Fa' 'Gd' 'None' 'Ex' 'Po']
GarageCond
['TA' 'Fa' 'None' 'Gd' 'Po' 'Ex']
PoolQC
['None' 'Ex' 'Fa' 'Gd']
Fence
['None' 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
MiscFeature
['None' 'Shed' 'Gar2' 'Othr' 'TenC']
Alley
['None' 'Grvl' 'Pave']
FireplaceQu
['None' 'TA' 'Gd' 'Fa' 'Ex' 'Po']
MasVnrType
['BrkFace' 'None' 'Stone' 'BrkCmn']
BsmtQual
['Gd' 'TA' 'Ex' 'None' 'Fa']
BsmtCond
['TA' 'Gd' 'None' 'Fa' 'Po']
BsmtExposure
['No' 'Gd' 'Mn' 'Av' 'None']
BsmtFinType1
['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'None' 'LwQ']
BsmtFinType2
['Unf' 'BLQ' 'None' 'ALQ' 'Rec' 'LwQ' 'GLQ']
Electrical
['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' 'None']


In [157]:
#check that there are now no null values
all(train_cat.isnull().mean() == 0)

True

In [179]:
# create a transfomer for the categorical variables 

class clean_categorical(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y = None): 
        return self
    
    def transform(self, X): 
        
        # change MSSubClass to categorical from numeric
        X['MSSubClass'] = X['MSSubClass'].apply(str) 

        # change NA values to 'None' for each of columns below
        columns_to_change_null_values = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 
                                 'PoolQC', 'Fence', 'MiscFeature', 'Alley', 'FireplaceQu',
                                'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                'BsmtFinType2', 'Electrical']

        for column in columns_to_change_null_values: 
            X.loc[X[column].isnull(), column] = 'None'
        
        return X
            

#### Just for kicks: see how OneHotEncoder works

In [119]:
# test OneHotEncoder()
enc = OneHotEncoder(handle_unknown = 'ignore')
garage_1hot = enc.fit_transform(train_cat[['GarageType']])

pd_1hot = pd.DataFrame(garage_1hot.todense())
compare_to_original = pd.concat([train_cat[['GarageType']], pd_1hot], axis = 1)

In [122]:
# see how one hot encoding worked 
compare_to_original.drop_duplicates()

Unnamed: 0,GarageType,0,1,2,3,4,5,6
0,Attchd,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Detchd,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11,BuiltIn,0.0,0.0,0.0,1.0,0.0,0.0,0.0
17,CarPort,0.0,0.0,0.0,0.0,1.0,0.0,0.0
39,,0.0,0.0,0.0,0.0,0.0,0.0,1.0
98,Basment,0.0,0.0,1.0,0.0,0.0,0.0,0.0
129,2Types,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### Full Pipeline

In [192]:
num_columns = train_num.columns.to_list()
cat_columns = train_cat.columns.to_list()

#create full pipeline for numeric and categorical
full_pipeline = ColumnTransformer([
    ("cat", clean_categorical(), cat_columns),
    ("num", numeric_pipeline, num_columns)
])

In [193]:
housing_prepared = full_pipeline.fit_transform(training)
#print out first 2 rows
housing_prepared[:2]

array([['60', 'RL', 'Pave', 'None', 'Reg', 'Lvl', 'AllPub', 'Inside',
        'Gtl', 'CollgCr', 'Norm', 'Norm', '1Fam', '2Story', 'Gable',
        'CompShg', 'VinylSd', 'VinylSd', 'BrkFace', 'Gd', 'TA', 'PConc',
        'Gd', 'TA', 'No', 'GLQ', 'Unf', 'GasA', 'Ex', 'Y', 'SBrkr', 'Gd',
        'Typ', 'None', 'Attchd', 'RFn', 'TA', 'TA', 'Y', 'None', 'None',
        'None', 'WD', 'Normal', 0.1506849315068493, 0.03341980415527355,
        0.6666666666666665, 0.5, 0.9492753623188417, 0.8833333333333329,
        0.1225, 0.12508858965272857, 0.0, 0.06421232876712328,
        0.14009819967266776, 0.11977971546581001, 0.4135593220338983,
        0.0, 0.2592313489073097, 0.3333333333333333, 0.0,
        0.6666666666666666, 0.5, 0.375, 0.3333333333333333, 0.5, 0.0,
        0.5, 0.38645980253878703, 0.0, 0.11151736745886655, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.09090909090909091, 0.5, 0.5555555555555556, 0.0],
       ['20', 'RL', 'Pave', 'None', 'Reg', 'Lvl', 'AllPub', 'FR2', 'Gtl',
        'Veenke

In [194]:
all_columns = cat_columns + num_columns

In [195]:
# add back column names and change into dataframe
housing_df = pd.DataFrame(housing_prepared, columns = all_columns)
housing_df.head()

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,random_int,Remodeled
0,60,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0.111517,0.0,0.0,0.0,0.0,0.0,0.090909,0.5,0.555556,0.0
1,20,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.25,0.666667,0.0
2,60,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0.076782,0.0,0.0,0.0,0.0,0.0,0.727273,0.5,0.777778,1.0
3,70,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,0.063985,0.492754,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,1.0
4,60,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,0.153565,0.0,0.0,0.0,0.0,0.0,1.0,0.5,0.666667,0.0


In [196]:
#save data
housing_df.to_pickle('cleaned_data/training_cleaned.pkl')
housing_df.to_csv('cleaned_data/training_cleaned.csv')