In [232]:
# IMPORT PACKAGES
import pandas as pd
import numpy as np

In [233]:
def read_data_csv(filename):
    """Load a CSV into a DataFrame using pandas' default parsing options.

    Parameters
    ----------
    filename : str, path object or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    frame = pd.read_csv(filename)
    return frame

In [234]:
def drop_useless_columns(df, columns=('Id',)):
    """Return a new DataFrame without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified.
    columns : str or iterable of str, default ('Id',)
        Column label(s) to remove. The default keeps the historical
        behaviour of dropping only ``'Id'``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``df`` lacking ``columns``.

    Raises
    ------
    KeyError
        If any requested column is absent from ``df``.
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(columns, str):
        columns = [columns]
    # Copy after dropping so only the surviving columns are duplicated, while
    # still guaranteeing the result shares no data with the caller's frame.
    return df.drop(columns=list(columns)).copy()

In [235]:
# SPLITS FEATURES BETWEEN NUMERICAL AND CATEGORICAL
def split_numerical_categorical(df):
    """Partition the column labels of ``df`` by dtype.

    Columns whose dtype is object, string or pandas ``category`` are reported
    as categorical; every other column (ints, floats, bools, datetimes, ...)
    is reported as numerical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is only read, never modified.

    Returns
    -------
    tuple[list, list]
        ``(feature_numbers, feature_objects)`` with labels in column order.
    """
    feature_numbers = []
    feature_objects = []

    for column, dtype in df.dtypes.to_dict().items():
        # Use the pandas dtype predicates rather than ``dtype.char``: only
        # NumPy dtypes expose ``char``, so extension dtypes such as
        # ``category`` or ``string`` would raise AttributeError.
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            feature_objects.append(column)
        else:
            feature_numbers.append(column)

    return feature_numbers, feature_objects

In [236]:
# NA FILLER FOR CATEGORICAL AND NUMERICAL DATA
def na_filler_function(df):
    """Return a copy of ``df`` with every missing value replaced.

    Columns reported as categorical by ``split_numerical_categorical`` get the
    literal string ``'None'``; all remaining columns get ``0``. The input
    frame is left untouched.
    """
    filled = df.copy()
    numeric_cols, object_cols = split_numerical_categorical(filled)

    # An empty column list is skipped, so the frame is not indexed with it.
    if object_cols:
        filled[object_cols] = filled[object_cols].fillna('None')
    if numeric_cols:
        filled[numeric_cols] = filled[numeric_cols].fillna(0)

    return filled

In [237]:
# ONE INSTANCE OF OHE
class OHE:
    """One-hot encoder for a single DataFrame column.

    ``fit`` records the distinct values of ``column_name``; ``transform`` adds
    one 0/1 integer column per recorded value, named
    ``"<column_name>_<value>"``. Values not seen during ``fit`` produce no new
    column, so their rows are 0 in every indicator column.

    Parameters
    ----------
    column_name : str
        Label of the column to encode.
    drop : bool, default False
        When true, ``transform`` removes the original column from its output.
    """

    def __init__(self, column_name, drop=False):
        self.column_name = column_name
        self.drop = drop
        # Distinct values learned by fit(); empty until fit() has been called.
        self.new_cols = []

    def fit(self, X_, y=None):
        """Learn the distinct values of the target column.

        ``y`` is accepted but ignored. Returns ``self`` so calls can be
        chained.
        """
        self.new_cols = X_[self.column_name].unique()
        return self

    def transform(self, X_, y=None):
        """Return a copy of ``X_`` with the indicator columns appended.

        ``y`` is accepted but ignored. ``X_`` itself is not modified.
        """
        X = X_.copy()
        source = X[self.column_name]

        for category in self.new_cols:
            new_label = self.column_name + '_' + str(category)
            # Build the indicator from a positional boolean mask. Selecting
            # rows by index *label* would also flag unrelated rows whenever
            # the index contains duplicate labels. Missing comparisons
            # (nullable dtypes) count as "no match".
            matches = source.eq(category).fillna(False)
            X[new_label] = matches.astype('int64')

        if self.drop:
            return X.drop(columns=[self.column_name])
        return X

    def fit_transform(self, X_, y=None):
        """Fit on ``X_`` and return its encoded copy."""
        return self.fit(X_).transform(X_)

In [238]:
# CONVERTS ALL CATEGORICAL DATA INTO OHE REPRESENTATION
class OHE_Categorical:
    """One-hot encode every categorical column of a DataFrame.

    ``fit`` builds one ``OHE`` encoder (with ``drop=True``) per column that
    ``split_numerical_categorical`` reports as categorical; ``transform``
    applies them in order, replacing each such column with its indicators.
    """

    def __init__(self):
        # One fitted OHE per categorical column, in column order.
        self.ohe = []

    def fit(self, df, y=None):
        """Fit one encoder per categorical column of ``df``.

        ``y`` is accepted but ignored. Returns ``self`` so calls can be
        chained.
        """
        _, feature_objects = split_numerical_categorical(df)

        # Rebuild the encoder list from scratch. Appending to the previous
        # list would leave two encoders for the same column after a refit,
        # and the second one would fail in transform() because the first has
        # already dropped that column.
        encoders = []
        for column in feature_objects:
            encoder = OHE(column, drop=True)
            encoder.fit(df)
            encoders.append(encoder)
        self.ohe = encoders
        return self

    def transform(self, df, y=None):
        """Return an encoded copy of ``df``; the input is not modified."""
        df_ = df.copy()
        for encoder in self.ohe:
            df_ = encoder.transform(df_)

        return df_

    def fit_transform(self, df, y=None):
        """Fit on ``df`` and return its encoded copy."""
        self.fit(df)
        return self.transform(df)

### LOAD DATA

In [239]:
# Read train.csv into `df`, the raw frame the pre-processing below starts from.
df = read_data_csv('train.csv')

### PRE-PROCESS DATA

In [241]:
# Fill missing values on a copy so the raw frame `df` stays untouched.
df1 = na_filler_function(df.copy())
# Casting MSSubClass to object makes the encoder below treat it as a
# categorical column, since it selects columns by dtype.
df1 = df1.astype({'MSSubClass': 'object'})
ohe = OHE_Categorical()
df1 = ohe.fit_transform(df1)