# 1. Important libraries

In [1]:
# basic i/o
import os

# data structures
import numpy as np
import pandas as pd

# cleaning data
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# regressors
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# getting rid of annoying warnings.
import warnings
warnings.filterwarnings('ignore')

# 2. Load Data

In [2]:
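# Load the data
# The CSV files (train.csv / test.csv) are expected to have been downloaded into DATA_PATH beforehand.
# NOTE(review): the URL below is for the UCI banknote-authentication data set and looks like a
# leftover from another notebook; this notebook reads housing data with a 'SalePrice' column.
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt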
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

DATA_PATH = 'data'
FILE_NAME = 'train.csv'

def load_data(data_path=DATA_PATH, file_name=FILE_NAME, ratio=0.9):
    """Read a CSV file and split its rows, in file order, into two frames.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV file.
    file_name : str
        Name of the CSV file inside ``data_path``.
    ratio : float
        Fraction of rows (between 0 and 1 inclusive) placed in the first
        frame. The row count is truncated with ``int``; no shuffling is done.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The first ``int(n_rows * ratio)`` rows and the remaining rows.

    Raises
    ------
    ValueError
        If ``ratio`` is outside [0, 1]. Without this check a negative ratio
        would silently slice from the end of the frame and a ratio above 1
        would silently return an empty second frame.
    """
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got {!r}'.format(ratio))

    # load everything into data
    file_path = os.path.join(data_path, file_name)
    data = pd.read_csv(file_path)

    # index of the first row that belongs to the second frame
    slice_ind = int(data.shape[0] * ratio)

    return data.iloc[:slice_ind], data.iloc[slice_ind:]


In [3]:
# ratio=1 puts every row in the first frame; the (empty) second frame is discarded.
full_data, _ = load_data(ratio=1)
# Remove the label by name instead of by position, so the features never
# contain 'SalePrice' even if it is not the last column of the file.
full_X = full_data.drop(columns=['SalePrice'])
full_y = full_data['SalePrice']

In [4]:
# First 90% of the rows (file order) for training, the remaining rows held out.
train, test = load_data(ratio=0.9)
# Remove the label by name instead of by position, so the features never
# contain 'SalePrice' even if it is not the last column of the file.
train_X = train.drop(columns=['SalePrice'])
train_y = train['SalePrice']
test_X = test.drop(columns=['SalePrice'])
test_y = test['SalePrice']

In [5]:
# Element-wise comparison of the column labels of the two feature frames;
# every entry is True when both splits expose the same columns in the same order.
train_X.columns == test_X.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

# 3. The HousingDataProcessor class
Below we implement the `HousingDataProcessor` class. The purpose of this class is to provide
1. A data structure that contains the input data
2. fit_transform raw data into processed data ready for ML application

The `fit` function does the following:
1. it assumes that 'SalePrice' is not a column of the data. The order of the columns of the data is fixed
2. separates the data into numerical and categorical columns, and records the names of these columns as self.num_ind and self.cat_ind
3. fit by median of all ZERO_HEAVY_COLS columns
4. fit numerical columns in (0,1) by MinMaxScaler, preserving the column order (note: this scaling step is currently commented out in the code below)
5. fit by LabelEncoder and OneHotEncoder

The `transform` function does the transform part of the above and returns a sparse matrix containing all processed features:
1. it assumes that 'SalePrice' is not a column of the data. The order of the columns of the data is fixed
2. separates the data into numerical and categorical columns via self.num_ind and self.cat_ind
3. fill missing categorical data by the string 'UofTMath'
4. fill all the missing numerical data by the median
5. for each numerical column in which the value 0 occurs very often (the ZERO_HEAVY_COLS), create a new binary column with entry 1 where the original value is 0 and entry 0 otherwise. Then replace the 0 entries in the original numerical column by the median. Finally, append the new binary column to the categorical columns under the name 'name_of_original_numeric_column_zero'
6. LabelEncoder and OneHotEncoder transform on categorical columns
7. add a 'BIAS' column with only 1

In [6]:
class HousingDataProcessor(object):
    """Convert raw housing feature frames into a sparse matrix for regression.

    ``fit`` learns the imputers, label encoders and one-hot encoders from a
    feature frame (one without a 'SalePrice' column). ``transform`` applies
    them and returns a scipy sparse matrix laid out as
    [numerical columns, BIAS, one-hot blocks of the categorical columns].

    The methods with a trailing underscore (``fit_``, ``transform_``) modify
    the frame they receive in place; the public ``fit`` / ``transform`` work
    on a copy so the caller's frame is left untouched.
    """

    def __init__(self):
        # cleaning data: all three imputers are created by fit_()
        self.imputer_num = None
        self.imputer_cat = None
        self.imputer_zero = None

        # column names recorded by fit_() and reused by transform_()
        self.num_ind = None
        self.cat_ind = None

        self.scaler = None          # unused while the scaling step is disabled
        self.regressor = None       # must be assigned by the caller before predict()
        self.training_data = None   # filled by input_data()
        self.Onehot_coders = {}
        self.encoders = {}
        self.ZERO_HEAVY_COLS = ['GarageArea', 'TotalBsmtSF', 'MasVnrArea', 'BsmtFinSF1',
                       'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'BsmtUnfSF',
                       'EnclosedPorch', 'ScreenPorch', 'PoolArea', '3SsnPorch',
                       'LowQualFinSF', 'MiscVal', 'BsmtFinSF2', 'YearRemodAdd']
        # names of the binary "value was 0" indicator columns, one per zero-heavy column
        self.ZERO_HEAVY_COLS_zero = [col + '_zero' for col in self.ZERO_HEAVY_COLS]

    def seperate_num_and_cat(self, data):
        """Split the column names of ``data`` into numerical and categorical.

        A column is numerical when its dtype compares equal to one of
        int, np.int64, float or np.float64; every other column is categorical.

        Returns
        -------
        (list, list)
            Numerical column names and categorical column names, each in the
            column order of ``data``.
        """
        # numerical data type
        cont_type = [int, np.int64, float, np.float64]

        nums = []
        cats = []

        for col in data.columns:
            if data[col].dtype in cont_type:
                nums.append(col)
            else:
                cats.append(col)

        return nums, cats

    def encode_fit(self, cats):
        """Fit one LabelEncoder per column of ``cats``.

        ``cats`` must not contain missing values. Returns a dict mapping
        column name to its fitted encoder.
        """
        # LabelEncoder.fit returns the encoder itself
        return {cat: LabelEncoder().fit(cats[cat]) for cat in cats.columns}

    def encode_transform(self, cats, encoders):
        """Label-encode every column of ``cats`` with the matching encoder.

        ``cats`` must not contain missing values. Returns a new DataFrame
        with the same column order as ``cats``.
        """
        encoded_cats = {}

        for cat in cats.columns:
            encoded_cats[cat] = encoders[cat].transform(cats[cat])

        return pd.DataFrame(encoded_cats, columns=cats.columns)

    def encode_by_1hot_fit(self, cats):
        """Fit one OneHotEncoder per column of ``cats``.

        Returns a dict mapping column name to its fitted encoder.
        """
        # each encoder sees its column as a single-feature 2-D array
        return {col: OneHotEncoder().fit(cats[col].values.reshape(-1, 1))
                for col in cats.columns}

    def encode_by_1hot_transform(self, cats, Onehot_coders):
        """One-hot encode every column of ``cats``.

        Returns a list with one encoded matrix per column, in column order.
        """
        return [Onehot_coders[col].transform(cats[col].values.reshape(-1, 1))
                for col in cats.columns]

    def zero_indicators_fit(self, data):
        """Return, for each zero-heavy column, the median of its values > 0."""
        medians = {}

        for col in self.ZERO_HEAVY_COLS:
            medians[col] = data.loc[data[col] > 0, col].median()

        return medians

    def zero_indicators_transform(self, data, medians):
        """Add the '<col>_zero' indicators, then replace zeros by ``medians``.

        Modifies ``data`` in place. This is used to shift the zero-heavy
        values to the median, in an attempt to correct skewness.
        """
        # the indicators must be computed before the zeros are overwritten
        self.add_0_indicators(data)

        for col in self.ZERO_HEAVY_COLS:
            data.loc[data[col] == 0, col] = medians[col]

    def add_0_indicators(self, data):
        """Add, in place, a '<col>_zero' column per zero-heavy column.

        The new column holds 1 where the original value is 0 and 0 otherwise.
        """
        for col in self.ZERO_HEAVY_COLS:
            new_col = col + '_zero'
            data[new_col] = 1 * (data[col] == 0)

    def fit_(self, data):
        """Learn all preprocessing state from ``data`` (modified in place).

        ``data`` must not contain the 'Id' or 'SalePrice' columns and must
        contain every column listed in ZERO_HEAVY_COLS.
        """
        # find numeric and categorical columns of data and remember them, so
        # transform_ handles exactly the same columns in the same order
        nums_ind, cat_ind = self.seperate_num_and_cat(data)
        self.num_ind = list(nums_ind)
        self.cat_ind = list(cat_ind)

        # in numerical data: fill nan by median
        self.imputer_num = SimpleImputer(strategy='median')
        data[nums_ind] = self.imputer_num.fit_transform(data[nums_ind])

        # in categorical data: fill nan by the constant 'UofTMath'
        self.imputer_cat = SimpleImputer(strategy='constant', fill_value='UofTMath')
        data[cat_ind] = self.imputer_cat.fit_transform(data[cat_ind])

        # make a new column to indicate the appearance of 0, then change all
        # the zeros to the median (0 is treated as the missing value)
        self.add_0_indicators(data)
        cat_ind = cat_ind + self.ZERO_HEAVY_COLS_zero
        self.imputer_zero = SimpleImputer(strategy='median', missing_values=0)
        data[self.ZERO_HEAVY_COLS] = self.imputer_zero.fit_transform(data[self.ZERO_HEAVY_COLS])

        # scaling of the numerical columns is currently disabled
        #self.scaler = MinMaxScaler()
        #data[nums_ind] = \
        #    pd.DataFrame(self.scaler.fit_transform(data[nums_ind]), columns=nums_ind)

        # encode categories
        self.encoders = self.encode_fit(data[cat_ind])
        data_cat = self.encode_transform(data[cat_ind], self.encoders)

        # 1hot encode
        self.Onehot_coders = self.encode_by_1hot_fit(data_cat)

    def fit(self, X):
        """Fit the processor on a copy of ``X``; an 'Id' column is ignored."""
        data = X.copy()
        if 'Id' in data.columns:
            del data['Id']

        self.fit_(data)

    def transform_(self, X, debug=False):
        """Apply the fitted preprocessing to ``X`` (modified in place).

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame without 'Id' and 'SalePrice'.
        debug : bool
            When True, print after each stage whether ``X`` contains nulls.

        Returns
        -------
        scipy sparse matrix
            [numerical columns, BIAS, one-hot encoded categorical columns].
        """
        def report(stage):
            # prints the stage label and whether any value of X is null
            if debug:
                print(stage)
                print(X.isnull().values.any())

        report('X start')

        # use the columns recorded at fit time; fall back to dtype detection
        # only when no column lists were recorded
        if self.num_ind is None or self.cat_ind is None:
            nums_ind, cat_ind = self.seperate_num_and_cat(X)
        else:
            nums_ind, cat_ind = list(self.num_ind), list(self.cat_ind)

        # in numerical data: fill nan by median
        X[nums_ind] = self.imputer_num.transform(X[nums_ind])

        # in categorical data: fill nan by the constant 'UofTMath'
        X[cat_ind] = self.imputer_cat.transform(X[cat_ind])

        report('X after imputation')

        # make a new column to indicate the appearance of 0, then change all
        # the zeros to the median learned at fit time
        self.add_0_indicators(X)
        cat_ind = cat_ind + self.ZERO_HEAVY_COLS_zero
        X[self.ZERO_HEAVY_COLS] = self.imputer_zero.transform(X[self.ZERO_HEAVY_COLS])

        report('X after adding zeros')

        # Scaling is currently disabled; the original author noted that it
        # sometimes produced an array full of nan.
        #X[nums_ind] = \
            #pd.DataFrame(self.scaler.transform(X[nums_ind]), columns=nums_ind)

        report('X after rescaling')

        # encode categories
        data_cat = self.encode_transform(X[cat_ind], self.encoders)

        # this is a list of sparse 1hot mat
        data_onehot = self.encode_by_1hot_transform(data_cat, self.Onehot_coders)

        # add a bias column
        X['BIAS'] = 1
        nums_ind.append('BIAS')

        report('X before sparsify')

        # put the numerical dense matrix in sparse format so that we can
        # concatenate it with the 1hot encodings
        data_num = sparse.csr_matrix(X[nums_ind].values)

        ans = sparse.hstack([data_num] + data_onehot)

        if debug:
            print('final step')
            print(np.isnan(ans.toarray()).any())

        return ans

    def transform(self, X):
        """Transform a copy of ``X``; an 'Id' column is ignored."""
        data = X.copy()
        if 'Id' in data.columns:
            del data['Id']

        return self.transform_(data)

    def fit_transform(self, X):
        """Fit on ``X`` and return the transformed ``X``."""
        self.fit(X)
        return self.transform(X)

    def preprocess_label(self, y):
        """Map labels to log space."""
        return np.log(y)

    def postprocess_label(self, y):
        """Inverse of ``preprocess_label``."""
        return np.exp(y)

    def predict(self, data):
        """Transform ``data`` and return ``self.regressor``'s predictions.

        The class never assigns ``self.regressor`` itself, so the caller has
        to set it to a fitted model first.

        Raises
        ------
        RuntimeError
            If no regressor has been assigned.
        """
        if self.regressor is None:
            raise RuntimeError('no regressor has been assigned to this processor')

        # The previous implementation called a preprocess_data method that
        # this class does not define; transform is the available equivalent.
        X = self.transform(data)

        return self.regressor.predict(X)

    def input_data(self, X):
        """Fit on ``X`` and keep the processed matrix in ``self.training_data``."""
        # The previous implementation called a preprocess_training_data method
        # that this class does not define; fit_transform is the available
        # equivalent.
        self.training_data = self.fit_transform(X)

# 4. HousingRegressor class

In [7]:
class HousingRegressor(object):
    """Bagged gradient-boosting regressor trained on log(SalePrice).

    Features are prepared by a ``HousingDataProcessor`` that is fitted once,
    at construction time, on ``full_data``. Labels are log-transformed before
    fitting and predictions are mapped back with ``exp``.
    """

    def __init__(self, full_data, n_estimators=10, random_state=None):
        """
        Parameters
        ----------
        full_data : pandas.DataFrame
            Frame used to fit the data processor. A 'SalePrice' column, if
            present, is dropped first.
        n_estimators : int
            Number of gradient-boosting models in the bagging ensemble.
        random_state : int or None
            Seed forwarded to the bagging ensemble. None (the default) keeps
            the previous, unseeded behaviour.
        """
        self.dataProcessor = HousingDataProcessor()
        # drop returns a new frame, so full_data itself is left untouched
        X = full_data.drop(columns=['SalePrice'], errors='ignore')

        self.dataProcessor.fit(X)

        self.regressor = None
        self.n_estimators = n_estimators
        self.random_state = random_state

    def fit_(self, X, y):
        """Build the ensemble and fit it on processed features and log labels."""
        booster = GradientBoostingRegressor(min_samples_split=2,  # this is default
                                            max_depth=3  # this is default
                                            )
        bagging_options = dict(n_estimators=self.n_estimators,  # default is 10
                               max_features=1.0,  # default is all or 1.0
                               n_jobs=-1,
                               random_state=getattr(self, 'random_state', None))

        # scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4
        # removed the old name, so try the new keyword first and fall back
        # to the old one on releases that do not know it.
        try:
            self.regressor = BaggingRegressor(estimator=booster, **bagging_options)
        except TypeError:
            self.regressor = BaggingRegressor(base_estimator=booster, **bagging_options)

        self.regressor.fit(X, y)

    def fit(self, X, y):
        """Process the raw features ``X`` and fit on log(``y``).

        ``y`` may be a pandas Series or any array-like of positive prices.
        """
        X = self.dataProcessor.transform(X)
        self.fit_(X, np.log(np.asarray(y)))

    def predict(self, X):
        """Return predicted prices (original scale) for raw features ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.regressor is None:
            raise RuntimeError('HousingRegressor.fit must be called before predict')

        X = self.dataProcessor.transform(X)
        return np.exp(self.regressor.predict(X))

    def predictionError(self, X, y):
        """Root mean squared error between log predictions and log(``y``)."""
        ypred = self.predict(X)
        log_residual = np.log(ypred) - np.log(np.asarray(y))
        return np.sqrt(np.sum(log_residual ** 2) / log_residual.shape[0])
        

In [8]:
# Read every row of the submission file; with ratio=1 the second frame is empty and discarded.
submit_data, _ = load_data(DATA_PATH, 'test.csv', 1)
# Placeholder label so the submission frame has a 'SalePrice' column like full_data.
submit_data['SalePrice'] = 0
# Stack the training rows on top of the submission rows.
data_str = pd.concat(objs=[full_data, submit_data])

In [9]:
# The data processor inside the regressor is fitted on the combined train + submission frame.
reg = HousingRegressor(full_data=data_str, n_estimators=10000)

In [10]:
# Train on every labelled row.
reg.fit(X=full_X, y=full_y)

In [11]:
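# Hold-out evaluation is disabled: `reg` was fitted on the full training set in the
# previous cell, which includes the rows of test_X, so this error would be optimistic.
#print('error:',reg.predictionError(test_X, test_y))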

In [12]:
# Remove the placeholder label. Rebinding with errors='ignore' (instead of `del`)
# keeps this cell re-runnable: a second run no longer fails on the missing column.
submit_data = submit_data.drop(columns=['SalePrice'], errors='ignore')
length = submit_data.shape[0]

pred = reg.predict(submit_data)
# Take the Ids from the submission file itself rather than assuming they
# start at 1461 and are consecutive.
submit = pd.DataFrame({'Id': submit_data['Id'].values, 'SalePrice' : pred})
submit.to_csv('02092019.csv', index=False)