# HousingDataProcessor Class

## 1. Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import warnings
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
warnings.filterwarnings('ignore')

## 2. Data, Loading, Splitting

In [20]:
# Load the data
# data is first downloweded into DATA_PATH from 
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt
import os
import pandas as pd
import numpy as np

DATA_PATH = 'data'
FILE_NAME = 'train.csv'

def load_data(data_path=DATA_PATH, file_name=FILE_NAME, ratio=0.9):
    # load everything into data
    file_path = os.path.join(data_path, file_name)
    data = pd.read_csv(file_path)
    
    
    return data

In [21]:
data = load_data()

## 3. Fit and Transform of Data: HousingDataProcessor class
Below we implement the HousingData class. The purpose of this class is to write 
1. A data structure that contains the input data
2. fit_transform raw data into processed data ready for ML application

The fit functions consits of the following purpose:
0. it assumes that 'SalePrice' is not a column of the data. The order of the columns of the data is fixed
1. seperate the data into numerical and categorical data, records the names of these columns as self.num_ind and self.cat_ind
2. fit by median of all ZERO_HEAVY_COLS columns
3. fit numerical columns in (0,1) by MinMaxScaler, preserving the column order. 
4. fit by LabelEncoder and OneHotEncoder, preserving the column order

The transform_function does the transform part of the above. Returns a sparse matrix with everything in it
0. it assumes that 'SalePrice' is not a column of the data. The order of the columns of the data is fixed
1. seperate the data into numerical and categorical data via self.num_ind and self.cat_ind
2. fill missing categorical data by the string 'UofTMath'
3. fill all the missing numerical data by the median
4. for each numerical column which has heavy data present at value 0, create a new column with entry 1 if heavy data is present at 0 and with entry 0 otherwise. Then set the entry in the original numerical column from 0 to the median. Finally, append to the end the new binary column to the category columns and name 'name_of_original_numeric_colum_zero'
5. LabelEncoder and OneHotEncoder transform on categorical columns

In [216]:
class HousingDataProcessor(object):
    def __init__(self): 
        self.num_ind = [] # list
        self.cat_ind = []
        self.cat_ind_augmented = None
        
        self.num_types = [int, np.int64, float, np.float64]
        self.ZERO_HEAVY_COLS = ['GarageArea', 'TotalBsmtSF', 'MasVnrArea', 'BsmtFinSF1', 
                       'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'BsmtUnfSF', 
                       'EnclosedPorch', 'ScreenPorch', 'PoolArea', '3SsnPorch', 
                       'LowQualFinSF', 'MiscVal', 'BsmtFinSF2', 'YearRemodAdd']
        self.ZERO_HEAVY_COLS_zero = []
        for col in self.ZERO_HEAVY_COLS:
            self.ZERO_HEAVY_COLS_zero.append(col + '_zero')
            
        
        # numerical
        self.imputer_nan = None # fill empty by 0
        self.imputer_zero = None # fill 0 by median
        self.scaler = None
        
        # categorical
        self.encoders = {}
        self.cat_1hots = {}
        
    # seperate_num_and_cat: self, pd.DataFrame --> list of strings, list of strings
    # seperate numerical and non numerical columns in data
    # returns nums, and cats. They are the indice of
    # the numerical and categorical columns
    def fit_seperate_num_and_cat(self, data):
        for col in data.columns:
            if data[col].dtype in self.num_types:
                self.num_ind.append(col)
            else:
                self.cat_ind.append(col)

        self.cat_ind_augmented = self.cat_ind + self.ZERO_HEAVY_COLS_zero

    # fit_num_data: self, pd.DataFrame --> void
    def fit_num_data(self, num_data):
        temp = num_data.copy()
        
        #print(temp[self.ZERO_HEAVY_COLS])
      
        self.imputer_nan = SimpleImputer(fill_value=0)
        temp = pd.DataFrame(self.imputer_nan.fit_transform(temp), columns=num_data.columns)
        
        self.imputer_zero = SimpleImputer(missing_values=0, strategy='median')
        temp[self.ZERO_HEAVY_COLS] = pd.DataFrame(self.imputer_zero.fit_transform(temp[self.ZERO_HEAVY_COLS]),  
                                                  columns=num_data[self.ZERO_HEAVY_COLS].columns)
        
        self.scaler = MinMaxScaler()
        self.scaler.fit(temp)
    
    def transform_num_data(self, num_data):
        temp = num_data.copy()
        
        temp = pd.DataFrame(self.imputer_nan.transform(temp), columns=temp.columns)
        
        temp[self.ZERO_HEAVY_COLS] = pd.DataFrame(self.imputer_zero.transform(temp[self.ZERO_HEAVY_COLS]),  
                                                  columns=temp[self.ZERO_HEAVY_COLS].columns)
        temp = pd.DataFrame(self.scaler.transform(temp), columns=temp.columns)
        
        return temp


    def fit_1hot(self, data, additional_data=None):
        if additional_data != None:
            temp = pd.concat([data, additional_data]).copy()
        else:
            temp = data.copy()
            
        for col in self.cat_ind_augmented:
            print(col)
            self.encoders[col] = LabelEncoder()
            self.cat_1hots[col] = OneHotEncoder()

            temp = self.encoders[col].fit_transform(temp[col])
            self.cat_1hots[col].fit(temp.reshape(-1,1))

In [217]:
x = HousingDataProcessor()

In [218]:
x.fit_seperate_num_and_cat(data)

In [219]:
x.fit_num_data(data[x.num_ind])

In [220]:
y = x.transform_num_data(data[x.num_ind])

In [221]:
x.fit_1hot(data[x.cat_ind])

MSZoning
Street


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [196]:
a = np.array([1,2,3])

In [197]:
a.reshape(-1,1)

array([[1],
       [2],
       [3]])