In [1]:
import os
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
class HousingRegressor(object):
    """Preprocessing helper for a housing-price training table.

    ``preprocess_training_data`` fills missing values, adds zero-indicator
    columns, label-encodes and one-hot-encodes the categorical columns,
    standardises the numeric columns, and stores the results on the instance.
    """

    def __init__(self):
        # All of the attributes below except training_labels are filled in by
        # preprocess_training_data.
        self.training_data = None          # standardised numeric columns
        self.cat_1hot = None               # dict: column name -> one-hot sparse matrix
        self.nums_ind = None               # names of the numeric columns
        self.cat_ind = None                # names of the categorical columns
        self.training_labels = None        # NOTE(review): never assigned in this class
        self.ascending_correlation = None  # result of find_ind_by_increasing_cov
        self.scaler = None                 # StandardScaler fitted on the numeric columns
        # Columns whose zeros are replaced by the median of their positive values.
        self.ZERO_HEAVY_COLS = ['GarageArea', 'TotalBsmtSF', 'MasVnrArea', 'BsmtFinSF1',
                                'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'BsmtUnfSF',
                                'EnclosedPorch', 'ScreenPorch', 'PoolArea', '3SsnPorch',
                                'LowQualFinSF', 'MiscVal', 'BsmtFinSF2', 'YearRemodAdd']
        # Names of the matching 0/1 indicator columns created by add_0_indicators.
        self.ZERO_HEAVY_COLS_zero = [col + '_zero' for col in self.ZERO_HEAVY_COLS]

    def seperate_num_and_cat(self, data):
        """Split the column names of ``data`` into numeric and categorical.

        A column counts as numeric when its dtype is an integer or floating
        dtype of any width (int32, float32, unsigned, ...), not only the
        64-bit ones. Everything else, including bool and object columns, is
        treated as categorical.

        Parameters
        ----------
        data : pd.DataFrame

        Returns
        -------
        (list, list)
            ``(nums, cats)``: numeric column names and categorical column
            names, each in the order they appear in ``data``.
        """
        nums = []
        cats = []

        for col in data.columns:
            dtype = data[col].dtype
            if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
                nums.append(col)
            else:
                cats.append(col)

        return nums, cats

    def encode(self, cats):
        """Label-encode every column of ``cats`` with its own LabelEncoder.

        Parameters
        ----------
        cats : pd.DataFrame
            Columns to encode. Missing values must be filled beforehand;
            LabelEncoder won't work if a column has missing values.

        Returns
        -------
        pd.DataFrame
            Integer codes with the same columns and the same index as
            ``cats``. Keeping the index matters because the caller assigns
            the result back into the source frame, and pandas aligns on it.
        """
        encoded_cats = {}

        for cat in cats.columns:
            encoded_cats[cat] = LabelEncoder().fit_transform(cats[cat])

        return pd.DataFrame(encoded_cats, index=cats.index)

    def find_ind_by_increasing_cov(self, data):
        """Rank the columns of ``data`` by their association with 'SalePrice'.

        Every column is standardised, then the absolute covariance with
        'SalePrice' is taken.

        Note that, despite the method name, the result is sorted from the
        strongest association to the weakest (descending).

        Parameters
        ----------
        data : pd.DataFrame
            All-numeric frame that contains a 'SalePrice' column.

        Returns
        -------
        pd.Series
            Indexed by column name; values are the absolute covariances
            divided by their sum.
        """
        standardized = pd.DataFrame(StandardScaler().fit_transform(data),
                                    columns=data.columns)

        strength = np.abs(standardized.cov()['SalePrice'])
        return strength.sort_values(ascending=False) / np.sum(strength)

    def encode_by_1hot(self, cat_ind, data):
        """One-hot encode each column named in ``cat_ind``.

        Parameters
        ----------
        cat_ind : list of str
            Column names to encode.
        data : pd.DataFrame

        Returns
        -------
        dict
            Maps each column name to the output of a fresh
            ``OneHotEncoder().fit_transform`` on that single column.
        """
        cat_1hot = {}

        for col in cat_ind:
            column_values = data[col].values.reshape(-1, 1)
            cat_1hot[col] = OneHotEncoder().fit_transform(column_values)

        return cat_1hot

    def add_0_indicators(self, data):
        """Replace zeros in the zero-heavy columns and flag where they were.

        For every column in ``ZERO_HEAVY_COLS`` a new ``<col>_zero`` column is
        added (1 where the entry was 0, otherwise 0) and the zeros are
        replaced by the median of the strictly positive entries. This shifts
        the zero-heavy data towards the median in an attempt to correct
        skewness.

        ``data`` is modified in place; nothing is returned.
        """
        for col in self.ZERO_HEAVY_COLS:
            is_zero = data[col] == 0
            data[col + '_zero'] = is_zero.astype(np.int64)

            median = data.loc[data[col] > 0, col].median()
            if pd.isna(median):
                # No positive entries: there is nothing sensible to shift the
                # zeros to, so leave them alone instead of filling with NaN.
                continue

            # mask() returns a new, upcast-if-needed Series, so a fractional
            # median can be stored even when the column holds integers.
            data[col] = data[col].mask(is_zero, median)

    def order_cat_ind(self, cat_ind, ascending_correlation):
        """Return the names in ``cat_ind`` ordered as in ``ascending_correlation``.

        Parameters
        ----------
        cat_ind : list of str
        ascending_correlation : pd.Series
            Only its index (column names) is used.

        Returns
        -------
        list of str
            Entries of ``ascending_correlation.index`` that occur in
            ``cat_ind``, in index order.
        """
        wanted = set(cat_ind)
        return [ind for ind in ascending_correlation.index if ind in wanted]

    def preprocess_training_data(self, data):
        """Run the full preprocessing pipeline and store the results on self.

        ``data`` is modified in place and must contain a 'SalePrice' column
        and every column listed in ``ZERO_HEAVY_COLS``.

        Sets ``ascending_correlation``, ``scaler``, ``cat_1hot``,
        ``training_data``, ``nums_ind`` and ``cat_ind``.

        NOTE(review): ``training_data`` holds every numeric column, so a
        numeric target such as 'SalePrice' is included; ``training_labels``
        is not set here.
        """
        # find numeric and categorical columns of data
        nums_ind, cat_ind = self.seperate_num_and_cat(data)

        # in numerical data: fill nan by mean
        data[nums_ind] = data[nums_ind].fillna(data[nums_ind].mean())

        # in categorical data: fill nan by 'nan'
        data[cat_ind] = data[cat_ind].fillna('nan').astype(str)

        # change all the zeros to median and make a new column to indicate
        # the appearance of 0; the indicators are treated as categorical
        self.add_0_indicators(data)
        cat_ind = cat_ind + self.ZERO_HEAVY_COLS_zero

        # label-encode the categorical columns
        data[cat_ind] = self.encode(data[cat_ind])

        # rank every column by its association with 'SalePrice'
        self.ascending_correlation = self.find_ind_by_increasing_cov(data)

        # order cat_ind according to that ranking
        cat_ind = self.order_cat_ind(cat_ind, self.ascending_correlation)

        # scale the numerical columns to 0 mean and 1 standard deviation;
        # the index is passed through so the assignment aligns row by row
        self.scaler = StandardScaler()
        data[nums_ind] = pd.DataFrame(self.scaler.fit_transform(data[nums_ind]),
                                      columns=nums_ind, index=data.index)

        # 1hot encode
        self.cat_1hot = self.encode_by_1hot(cat_ind, data)

        self.training_data = data[nums_ind]
        self.nums_ind = nums_ind
        self.cat_ind = cat_ind

In [3]:
# Create the preprocessing helper; its attributes stay None until
# preprocess_training_data is called on it.
x = HousingRegressor()

In [4]:
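# 1
# Load the data
# The data is expected to have been downloaded into DATA_PATH beforehand.
# NOTE(review): the URL originally cited here,
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt
# is a banknote-authentication dataset and does not match the housing columns
# (e.g. 'SalePrice', 'GarageArea') used in this notebook -- confirm the real
# source of train.csv.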

DATA_PATH = 'data'
FILE_NAME = 'train.csv'


def load_data(data_path=DATA_PATH, file_name=FILE_NAME):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that holds the file (defaults to DATA_PATH).
    file_name : str
        Name of the CSV file inside ``data_path`` (defaults to FILE_NAME).

    Returns
    -------
    pd.DataFrame
        Contents of ``os.path.join(data_path, file_name)`` as parsed by
        ``pd.read_csv`` with its default options.
    """
    return pd.read_csv(os.path.join(data_path, file_name))

# Read data/train.csv (the load_data defaults) into a DataFrame.
data = load_data()

In [5]:
# Fit the preprocessing state on `data`. This call modifies `data` in place
# (NaN filling, zero replacement, label encoding, scaling), so re-running this
# cell without reloading `data` first would process already-transformed values.
x.preprocess_training_data(data)

In [12]:
# Show the first row of the standardised numeric training columns.
x.training_data.iloc[0]

Id              -1.730865
MSSubClass       0.073375
LotFrontage     -0.229372
LotArea         -0.207142
OverallQual      0.651479
OverallCond     -0.517200
YearBuilt        1.050994
YearRemodAdd     0.878668
MasVnrArea      -0.192095
BsmtFinSF1       0.203219
BsmtFinSF2      -0.062105
BsmtUnfSF       -1.125970
TotalBsmtSF     -0.561219
1stFlrSF        -0.793434
2ndFlrSF         0.368718
LowQualFinSF     0.038464
GrLivArea        0.370333
BsmtFullBath     1.107810
BsmtHalfBath    -0.241061
FullBath         0.789741
HalfBath         1.227585
BedroomAbvGr     0.163779
KitchenAbvGr    -0.211454
TotRmsAbvGrd     0.912210
Fireplaces      -0.951226
GarageYrBlt      1.021157
GarageCars       0.311725
GarageArea       0.266959
WoodDeckSF      -0.157219
OpenPorchSF     -0.267449
EnclosedPorch   -0.047107
3SsnPorch       -0.034021
ScreenPorch     -0.034480
PoolArea        -0.016518
MiscVal         -0.055694
MoSold          -1.599111
YrSold           0.138777
SalePrice        0.347273
Name: 0, dty