# Converting Dataset into machine readable numerical format
### Label Encoder
Label encoder is a class that converts categorical data into a machine-understandable (i.e., numerical) format. <br>
It replaces the existing data with encoded data. <br>
Link: <a href="https://bit.ly/2F2Jc60">sklearn LabelEncoder</a>

In [1]:
# Helping libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
class MultiColumn_LabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on that column alone.

    Parameters
    ----------
    columns : list of column labels or None, default None
        Names of the columns to encode. When None, every column of the
        input frame is encoded.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        Encoders are fitted per column inside ``transform``; ``y`` is
        accepted (and optional, like in the other methods) but ignored.
        """
        return self

    def transform(self, X, y=None):
        """Return an encoded copy of ``X``; the input frame is not modified.

        When ``columns`` was given, only those columns are encoded and each
        is then cast to the ``category`` dtype. When ``columns`` is None,
        all columns are encoded and left as plain integer codes.
        """
        output = X.copy()
        if self.columns is not None:
            for column in self.columns:
                output[column] = LabelEncoder().fit_transform(output[column])
                output[column] = output[column].astype('category')
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent (name, Series) iterator and exists in older releases.
            for column_name, column in output.items():
                output[column_name] = LabelEncoder().fit_transform(column)
        return output

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X, y).transform(X)

In [3]:
class Preprocess:
    """Split a DataFrame into features and labels and one-hot encode the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column holds the labels and whose remaining
        columns hold the features.
    columns : list
        Stored on the instance; ``one_hot_encode`` does not read it.
    """

    def __init__(self, data, columns):
        self.data, self.columns = data, columns

    def one_hot_encode(self):
        """Return ``(features, labels)`` with categorical features one-hot encoded.

        Feature columns of dtype ``category`` or ``object`` are expanded into
        dummy columns, dropping the first level of each; all other feature
        columns pass through unchanged. The labels are the last column of
        the stored frame, returned as-is.
        """
        # Every column except the last is a feature; the last is the label.
        features = self.data.iloc[:, :-1]
        labels = self.data.iloc[:, -1]

        # Names of the feature columns holding categorical values.
        categorical = features.select_dtypes(include=['category', object]).columns.tolist()

        encoded = pd.get_dummies(features, columns=categorical, drop_first=True)
        return encoded, labels

# Check best features
### SelectKBest
We use the <b>SelectKBest</b> class from sklearn to check the score of each attribute and then decide which ones to eliminate when an attribute's contribution to learning is low.
Link: <a href="https://bit.ly/2w0hp1Q">sklearn SelectKBest</a>

In [4]:
def main():
    """Load the adult training set, one-hot encode it and print chi2 feature scores.

    Reads ``datasets/adult_training.csv`` (treating ``' ?'`` as missing),
    drops rows with missing values, one-hot encodes the categorical feature
    columns, and prints the resulting column names, their count, and the
    chi-squared score of every feature.
    """
    # Load the data and discard every row that contains a missing value.
    cleaned = pd.read_csv('datasets/adult_training.csv', na_values=' ?').dropna()

    # Names of the columns whose dtype is object (i.e. not integer or float).
    categorical_names = list(cleaned.select_dtypes(include='object'))

    # NOTE(review): the label-encoded frame is not used below; the one-hot
    # encoding step runs on the un-encoded data.
    encoder = MultiColumn_LabelEncoder(columns=categorical_names)
    label_encoded = encoder.fit_transform(cleaned)

    preprocessor = Preprocess(data=cleaned, columns=categorical_names)
    features, labels = preprocessor.one_hot_encode()
    feature_names = list(features.columns)
    print("Column names after one hot encoding :\n", feature_names)
    print("\nTotal number of columns: ", len(feature_names))

    # Plain numpy arrays for the features and the labels.
    feature_matrix = np.array(features.values)
    label_vector = np.array(labels.values)

    # k='all' keeps every feature; only the per-feature scores are of interest.
    selector = SelectKBest(score_func=chi2, k='all')
    selector.fit_transform(feature_matrix, label_vector)
    print("\nFeature scores: ", selector.scores_)

In [5]:
# Run main() only when this code is executed directly rather than imported.
if __name__ == '__main__':
    main()

Column names after one hot encoding :
 ['Age', 'Fnlwgt', 'Capital_Gain', 'Capital_Loss', 'Hours_Per_Week', 'Workclass_ Local-gov', 'Workclass_ Private', 'Workclass_ Self-emp-inc', 'Workclass_ Self-emp-not-inc', 'Workclass_ State-gov', 'Workclass_ Without-pay', 'Education_ 11th', 'Education_ 12th', 'Education_ 1st-4th', 'Education_ 5th-6th', 'Education_ 7th-8th', 'Education_ 9th', 'Education_ Assoc-acdm', 'Education_ Assoc-voc', 'Education_ Bachelors', 'Education_ Doctorate', 'Education_ HS-grad', 'Education_ Masters', 'Education_ Preschool', 'Education_ Prof-school', 'Education_ Some-college', 'Marital_Status_ Married-AF-spouse', 'Marital_Status_ Married-civ-spouse', 'Marital_Status_ Married-spouse-absent', 'Marital_Status_ Never-married', 'Marital_Status_ Separated', 'Marital_Status_ Widowed', 'Occupation_ Armed-Forces', 'Occupation_ Craft-repair', 'Occupation_ Exec-managerial', 'Occupation_ Farming-fishing', 'Occupation_ Handlers-cleaners', 'Occupation_ Machine-op-inspct', 'Occupatio