# 1. Important libraries

In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
warnings.filterwarnings('ignore')

# 2. Load Data

In [2]:
# Load the data
# data is first downloweded into DATA_PATH from 
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

DATA_PATH = 'data'
FILE_NAME = 'train.csv'

def load_data(data_path=DATA_PATH, file_name=FILE_NAME, ratio=0.9):
    # load everything into data
    file_path = os.path.join(data_path, file_name)
    data = pd.read_csv(file_path)
    
    slice_ind = int(data.shape[0]*ratio)
    
    return data.iloc[0:slice_ind], data.iloc[slice_ind:]


In [3]:
train, test = load_data()

In [4]:
t = np.log(test['SalePrice'].values)

# 3. The HousingRegressor class

In [5]:
class HousingRegressor(object):
    
    def __init__(self):
        self.training_data = None
        self.training_labels = None
        self.scaler = None
        self.regressor = None
        self.Onehot_coders = {}
        self.encoders = {}
        self.ZERO_HEAVY_COLS = ['GarageArea', 'TotalBsmtSF', 'MasVnrArea', 'BsmtFinSF1', 
                       'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'BsmtUnfSF', 
                       'EnclosedPorch', 'ScreenPorch', 'PoolArea', '3SsnPorch', 
                       'LowQualFinSF', 'MiscVal', 'BsmtFinSF2', 'YearRemodAdd']
        self.ZERO_HEAVY_COLS_zero = []
    
        for col in self.ZERO_HEAVY_COLS:
            self.ZERO_HEAVY_COLS_zero.append(col + '_zero')
    
    # seperate_num_and_cat: self, pd.DataFrame --> list of strings, list of strings
    # seperate numerical and non numerical columns in data
    # returns nums, and cats. They are the indice of
    # the numerical and categorical columns
    def seperate_num_and_cat(self, data):
        # numerical data type
        cont_type = [int, np.int64, float, np.float64]

        nums = []
        cats = []

        for col in data.columns:
            if data[col].dtype in cont_type:
                nums.append(col)
                #data[col].to_numeric()
            else:
                cats.append(col)

        return nums, cats #.astype(str)

    # encode_fit: self, pd.DataFrame --> pd.DataFrame, list of LabelEncoder
    # assumes input is a pd.DataFrame with dtype == str
    # encodes each column in cats by LabelEncoder()
    def encode_fit(self, cats):
        encoded_cats = {}

        for cat in cats.columns:
            encoder = LabelEncoder()
            encoded_cats[cat] = encoder.fit_transform(cats[cat]) # won't work if cats has missing value
            #print(encoded_cats[cat])
            self.encoders[cat] = encoder

        return pd.DataFrame(encoded_cats) #, encoders
    
    # encode_transform: self, pd.DataFrame --> pd.DataFrame, list of LabelEncoder
    def encode_transform(self, cats):
        encoded_cats = {}

        for cat in cats.columns:
            encoded_cats[cat] = self.encoders[cat].transform(cats[cat]) # won't work if cats has missing value

        return pd.DataFrame(encoded_cats) #, encoders
     

    # encode_by_1hot_fit: list of string, pd.DataFrame --> dict of pd.sparseMatrix
    # encodes by OneHotEncoder for each column in cat_ind in data
    def encode_by_1hot_fit(self, cat_ind, data):
        cat_1hot = []

        for col in cat_ind:
            onehot_coder = OneHotEncoder()
            cat_1hot.append(onehot_coder.fit_transform(data[col].values.reshape(-1,1)))
            self.Onehot_coders[col] = onehot_coder

        return cat_1hot #, Onehot_coders
    
    # encode_by_1hot: list of string, pd.DataFrame --> dict of pd.sparseMatrix
    def encode_by_1hot_transform(self, cat_ind, data):
        cat_1hot = []

        for col in cat_ind:
            cat_1hot.append(self.Onehot_coders[col].transform(data[col].values.reshape(-1,1)))

        return cat_1hot #, Onehot_coders
    
    # add_0_indicators: pd.DataFrame --> void
    # for col in ZERO_HEAVY_COLS, change all the zero to the median in data
    # then make a new column in data with 1 == this entry was 0 in col and
    # 0 == otherwise
    # this is used to shift the zero heavy datas to the median,
    # in an attempt to correct skewedness
    def add_0_indicators(self, data):   
        for col in self.ZERO_HEAVY_COLS:
            new_col = col + '_zero'
            data[new_col] = 1 * (data[col] == 0)

            changed_0 = data[col][data[col] > 0].median()
            data.loc[data[col]==0, col] = changed_0
    
    def seperate_train_label_cols(self, data):
        cols = data.columns
        
        if 'SalePrice' in cols:
            return cols.drop('SalePrice'), pd.Index(['SalePrice'], dtype=data['SalePrice'].dtype)
        else:
            return cols, None

        
        
        
        
    def fit(self, data): 
        training_cols, label_col = self.seperate_train_label_cols(data)
        #print(len(training_cols), len(label_col))
        
        if label_col != None:
            labels = data[label_col]
        else:
            labels = 1
            
        #print(labels)
        
        data = data[training_cols]
        
        #print(labels)
        
        # find numeric and categorical columns of data
        nums_ind, cat_ind = self.seperate_num_and_cat(data)
        
        #print(len(nums_ind))
        #print(len(cat_ind))
        #print(len(nums_ind+cat_ind))

        # in numerical data: fill nan by mean
        data[nums_ind] = data[nums_ind].fillna(data[nums_ind].median())

        # in categorical data: fill nan by 'nan'
        data[cat_ind] = data[cat_ind].fillna('UofTMath').astype(str)
        #print(data[cat_ind])
        
        # print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        
        # change all the zeros to median and make a new column to indicate the appearance of 0
        self.add_0_indicators(data)
        cat_ind += self.ZERO_HEAVY_COLS_zero
        # print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        

        # scale the numerical columns to 0 mean and 1 standard deviation
        self.scaler = MinMaxScaler()
        data[nums_ind] = \
            pd.DataFrame(self.scaler.fit_transform(data[nums_ind]), columns=nums_ind)
        #print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        
        # encode categories
        # somehow data[cat_ind] = self.encode(data[cat_ind]) produces NaN everywhere
        # due to type conversion.
        data_cat = self.encode_fit(data[cat_ind])
        # print(x)
        # data[cat_ind] = x
        # print(data[cat_ind])
        # print(labels)
        # print(len(data[nums_ind].columns), len(data_cat.columns))
        
        
        # 1hot encode
        cat_1hot = self.encode_by_1hot_fit(cat_ind, data_cat)
        #print(len(cat_1hot))
        
        # put the numerical dense matrix in sparse format so that we can concatenate it with the 1hot encodings
        data_num = sparse.csr_matrix(data[nums_ind].values)
        #print(data_num.shape)
            
        # after clean data, we fit
        self.training_data, self.training_label = \
        sparse.hstack([data_num] + cat_1hot), self.preprocess_label(labels)
        
        self.regressor = BaggingRegressor(base_estimator=GradientBoostingRegressor(min_samples_split=2, # this is default 
                                                                               max_depth=3 # this is default
                                                                              ), 
                                      n_estimators=10, # default is 10
                                      max_features=0.3, # default is all or 1.0
                                      n_jobs=-1
                                     )
        
        self.regressor.fit(self.training_data, self.training_label)
    
    def preprocess_label(self, y):
        return np.log(y)
    
    def transform(self, data):
        training_cols, label_col = self.seperate_train_label_cols(data)
        #print(len(training_cols), len(label_col))
        
        if label_col != None:
            labels = data[label_col]
        else:
            labels = 1
            
        #print(labels)
        
        data = data[training_cols]
        
        #print(labels)
        
        # find numeric and categorical columns of data
        nums_ind, cat_ind = self.seperate_num_and_cat(data)
        
        #print(len(nums_ind))
        #print(len(cat_ind))
        #print(len(nums_ind+cat_ind))

        # in numerical data: fill nan by mean
        data[nums_ind] = data[nums_ind].fillna(data[nums_ind].median())

        # in categorical data: fill nan by 'nan'
        data[cat_ind] = data[cat_ind].fillna('UofTMath').astype(str)
        #print(data[cat_ind])
        
        # print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        
        # change all the zeros to median and make a new column to indicate the appearance of 0
        self.add_0_indicators(data)
        cat_ind += self.ZERO_HEAVY_COLS_zero
        # print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        

        # scale the numerical columns to 0 mean and 1 standard deviation
        data[nums_ind] = \
            pd.DataFrame(self.scaler.transform(data[nums_ind]), columns=nums_ind)
        #print(len(data[nums_ind].columns), len(data[cat_ind].columns))
        
        # encode categories
        # somehow data[cat_ind] = self.encode(data[cat_ind]) produces NaN everywhere
        # due to type conversion.
        data_cat = self.encode_transform(data[cat_ind])
        # print(x)
        # data[cat_ind] = x
        # print(data[cat_ind])
        # print(labels)
        # print(len(data[nums_ind].columns), len(data_cat.columns))
        
        
        # 1hot encode
        cat_1hot = self.encode_by_1hot_transform(cat_ind, data_cat)
        #print(len(cat_1hot))
        
        # put the numerical dense matrix in sparse format so that we can concatenate it with the 1hot encodings
        data_num = sparse.csr_matrix(data[nums_ind].values)
        #print(data_num.shape)
            
        return sparse.hstack([data_num] + cat_1hot), self.preprocess_label(labels)
        
        
        
    
    def postprocess_label(self, y):
        return np.exp(y)

        
    def predict(self, data):
        X, _ = self.preprocess_data(data)
        
        return self.regressor.predict(X)

        
    def input_data(self, X):
        self.training_data = self.preprocess_training_data(X)

In [6]:
x = HousingRegressor()

In [7]:
x.fit(train)

In [8]:
a, _ = x.transform(test)

ValueError: y contains previously unseen labels: 'CBlock'

In [None]:
a.shape

In [None]:
x.training_data.shape