## Lab 2: Classification
Group 2


## Data Preparation

In [None]:
#Loading Libraries

import datetime
import itertools
import os
import pathlib
import sklearn

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Column labels for the census-income CSV files, listed in file order.
# The files are read with header=None, so these names are applied positionally.
header_names = [
    'age',
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'wage_per_hour',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'capital_gains',
    'capital_losses',
    'stock_dividends',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'instance_weight', ## this field is not used as a feature
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'num_emp',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'weeks_worked',
    'year',
    'income_50k',
]


# Load the two labelled extracts; neither file has a header row, so
# header_names supplies the column labels.
df = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.data.csv', header=None, names=header_names)
df_test = pd.read_csv('/Users/password1234/Documents/Machine Learning/census-income.test.csv', header=None, names=header_names)
# The test file is also labelled, so the two can be merged into one frame.
# ignore_index gives the merged frame unique row labels instead of two
# overlapping 0..n ranges.
df = pd.concat([df, df_test], ignore_index=True)
# DataFrame.drop returns a new frame; assign it back so instance_weight is
# really removed and cannot be picked up later as a numeric feature.
df = df.drop(columns=['instance_weight'])  ## not used for our analysis

# Columns to store with the pandas 'category' dtype.
categorical_features = [
    'class_worker',
    'det_ind_code',
    'det_occ_code',
    'education',
    'hs_college',
    'marital_stat',
    'major_ind_code',
    'major_occ_code',
    'race',
    'hisp_origin',
    'sex',
    'union_member',
    'unemp_reason',
    'full_or_part_emp',
    'tax_filer_stat',
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'fam_under_18',
    'country_father',
    'country_mother',
    'country_self',
    'citizenship',
    'own_or_self',
    'vet_question',
    'vet_benefits',
    'year',
]
# Cast every listed column in a single astype call via a column -> dtype mapping.
df = df.astype({column: 'category' for column in categorical_features})

In [None]:
### Drop columns not used in modelling
unused_columns = [
    'region_prev_res',
    'state_prev_res',
    'det_hh_fam_stat',
    'det_hh_summ',
    'mig_chg_msa',
    'mig_chg_reg',
    'mig_move_reg',
    'mig_same',
    'mig_prev_sunbelt',
    'country_father',
    'country_mother',
    'country_self',
    'year',
]
df = df.drop(columns=unused_columns)

In [None]:
# Section one of the data preparation: this uses the same data as the mini-lab.
# Print a summary of the remaining columns and their dtypes.
df.info()

### Data Preparation Part 2

In [None]:
# Sampling the dataset: keep 0.5% of the rows.
# A fixed seed makes the sample, and every result derived from it, repeatable
# across runs; the value matches the random_state used for train_test_split.
selection_df = df.sample(frac=.005, random_state=1)

In [None]:
# shape is a (rows, columns) tuple attribute, not a method; calling it raises TypeError.
selection_df.shape

In [None]:
def preprocess_pipeline(numeric_features, categorical_features):
    """Build the preprocessing pipeline: scale numeric columns, one-hot encode categoricals.

    Parameters
    ----------
    numeric_features
        Column labels routed to the StandardScaler branch.
    categorical_features
        Column labels routed to the OneHotEncoder branch.

    Returns
    -------
    Pipeline
        An unfitted pipeline with a single step named 'preprocessor' that wraps
        a ColumnTransformer with the branches 'num' and 'cat'.
    """
    # No imputation step is active; the imputers below were left disabled.
    #   ('imputer', SimpleImputer(strategy='median'))
    #   ('imputer', SimpleImputer(strategy='constant', fill_value='missing'))
    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    column_transformer = ColumnTransformer(
        transformers=[
            ('num', scale_numeric, numeric_features),
            ('cat', encode_categorical, categorical_features),
        ]
    )

    return Pipeline(steps=[('preprocessor', column_transformer)])

In [None]:
### Preprocess the data
target = 'income_50k'

# Split the sample into the label column and the feature columns.
y_selection = selection_df[target]
X_selection = selection_df.drop(columns=[target])

# Column groups are derived from the dtypes of the full frame.
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
non_numeric_df = df.select_dtypes(include=['object', 'bool', 'category'])
categorical_features = non_numeric_df.drop(columns=[target]).columns

# Fit the scaler/encoder on the sample and transform it in the same call.
preprocessor = preprocess_pipeline(numeric_features, categorical_features)
X_selection_preprocessed = preprocessor.fit_transform(X_selection)

In [None]:
### Make a list of all the columns after one hot encoding
ohe = preprocessor['preprocessor'].named_transformers_['cat']['onehot']
categorical_input_cols = X_selection[categorical_features].columns
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2; use its
# replacement when available and fall back for older installations.
if hasattr(ohe, 'get_feature_names_out'):
    cat_processed = ohe.get_feature_names_out(categorical_input_cols)
else:
    cat_processed = ohe.get_feature_names(categorical_input_cols)
# Numeric columns come first, then the one-hot columns, matching the
# transformer order ('num', 'cat') used in preprocess_pipeline.
all_processed_cols = np.concatenate((numeric_features, cat_processed), axis=0)

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including convergence and deprecation warnings.
warnings.filterwarnings('ignore')

### RFE CV to find best features
clf = LogisticRegression(n_jobs=-1)
rfecv = RFECV(clf, step=1, cv=5)
rfecv.fit(X_selection_preprocessed, y_selection)
print("Optimal number of features : %d" % rfecv.n_features_)

# RFECV.grid_scores_ was removed in scikit-learn 1.2; the mean score per
# feature count now lives in cv_results_. Fall back for older installations.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (% of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
### Show all selected features
# rfecv.support_ is used as a mask over the post-encoding column names.
support_mask = rfecv.support_
selected_features = all_processed_cols[support_mask]
print(selected_features)

In [None]:
### Log-transform selected numeric columns, then re-run the preprocessing
log_columns = ['wage_per_hour', 'capital_gains', 'capital_losses', 'stock_dividends']

X_selection_log = X_selection.copy()
# log10(x + 1) keeps zero values at zero.
X_selection_log[log_columns] = np.log10(X_selection_log[log_columns] + 1)

# Refit the same preprocessor object on the transformed sample.
X_selection_log_preprocessed = preprocessor.fit_transform(X_selection_log)


In [None]:
# Repeat the recursive feature elimination on the log-transformed sample.
clf = LogisticRegression(n_jobs=-1)
rfecv_log = RFECV(clf, step=1, cv=5)
rfecv_log.fit(X_selection_log_preprocessed, y_selection)

# RFECV.grid_scores_ was removed in scikit-learn 1.2; the mean score per
# feature count now lives in cv_results_. Fall back for older installations.
if hasattr(rfecv_log, 'cv_results_'):
    cv_scores_log = rfecv_log.cv_results_['mean_test_score']
else:
    cv_scores_log = rfecv_log.grid_scores_

print("Optimal number of features : %d" % rfecv_log.n_features_)
print("Max Score :", max(cv_scores_log))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (% of correct classifications)")
plt.plot(range(1, len(cv_scores_log) + 1), cv_scores_log)
plt.show()

In [None]:
### Show all selected features
# Overwrites the earlier selection with the one from the log-transformed run.
log_support_mask = rfecv_log.support_
selected_features = all_processed_cols[log_support_mask]
print(selected_features)

## Modeling and Evaluation

In [None]:
# selected_features holds post-encoding column names (numeric columns plus
# one-hot outputs). The one-hot names do not exist in selection_df, so
# indexing it raises KeyError; take the columns from the preprocessed matrix.
selected_matrix = X_selection_log_preprocessed[:, rfecv_log.support_]
if hasattr(selected_matrix, 'toarray'):
    # The preprocessor may return a sparse matrix; DataFrame needs a dense array.
    selected_matrix = selected_matrix.toarray()
finalDF = pd.DataFrame(
    selected_matrix,
    columns=selected_features,
    index=selection_df.index,
)

finalDF.info()

In [None]:
#Splitting data into train and test
# X_selected and y were never defined in this notebook (NameError); build them
# from the columns RFECV kept on the log-transformed sample and its labels.
X_selected = X_selection_log_preprocessed[:, rfecv_log.support_]
y = y_selection
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics as mt

In [None]:
#### Grid search over Decision Tree hyperparameters
classifier = DecisionTreeClassifier()
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2, None],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
}

CV = GridSearchCV(classifier, param_grid, scoring='accuracy', cv=10, n_jobs=-1)
# Tune on the training split only: searching over all rows would let the
# held-out test rows influence the chosen hyperparameters.
CV.fit(X_train, y_train)

print('Best Score: {s}'.format(s=CV.best_score_))
print('Best Parameters: {p}'.format(p=CV.best_params_))

In [None]:
#getting prediction accuracy for Decision Tree

dt_clf = DecisionTreeClassifier(criterion = 'gini', 
                                splitter = 'random', 
                                min_samples_split = 3, 
                                min_samples_leaf = 3)


# train the decision tree algorithm
%time dt_clf.fit(X_train,y_train)
yhat = dt_clf.predict(X_test)
print ('accuracy:', mt.accuracy_score(y_test,yhat))

In [None]:
# Feature importance graph: one bar per input column of the fitted tree
imp = dt_clf.feature_importances_
bar_positions = range(len(imp))

# Plot
plt.bar(bar_positions, imp)

### Model and Evaluation 1

### Model and Evaluation 2

### Model and Evaluation 3

### Model and Evaluation 4

### Model and Evaluation 5

### Model and Evaluation 6

### Deployment

### Exceptional Work