<a href="https://colab.research.google.com/github/narcilla/118A/blob/master/118A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df_adult = pd.read_csv('adult.data')
df_covtype = pd.read_csv('covtype.data')
df_letter = pd.read_csv('letter-recognition.data')

# **Adults**

In [3]:
df_adult.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
df_adult.shape

(32560, 15)

In [5]:
df_adult.columns = df_adult.columns.str.replace(' ', '')

In [6]:
df_adult.columns

Index(['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
       'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40',
       'United-States', '<=50K'],
      dtype='object')

In [7]:
df_adult['<=50K'].unique()

array([' <=50K', ' >50K'], dtype=object)

### Feature engineering (adult)

In [8]:
# replace target, <=50K, with 0/1 for binary classification
df_adult['<=50K'].replace({' <=50K':0, ' >50K':1}, inplace=True)

In [9]:
# use one-hot encoding for categorical variables
categorical = [i for i in df_adult.dtypes.index if df_adult.dtypes[i]=='object']

df_adult = pd.get_dummies(df_adult, columns=categorical)

# Covtype

In [10]:
df_covtype.head()

Unnamed: 0,2596,51,3,258,0,510,221,232,148,6279,1,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.20,0.21,0.22,0.23,0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31,1.1,0.32,0.33,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,5
0,2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
1,2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
2,2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
3,2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
4,2579,132,6,300,-15,67,230,237,140,6031,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2


In [11]:
df_covtype.shape

(581011, 55)

Select 30000 random data points to use for analysis since that is what Caruana uses in the paper. Also, 581011 data points is too much

In [12]:
df_ctype = df_covtype.sample(frac=1)
df_ctype = df_ctype.iloc[0:30000]

In [13]:
df_ctype['5'].value_counts()

2    14750
1    10826
3     1864
7     1061
6      906
5      474
4      119
Name: 5, dtype: int64

Last column is target. A value of 2 is converted to 1, the rest are converted to 0 for binary classification

In [14]:
df_ctype['5'].replace({1:0, 3:0, 4:0, 5:0, 6:0, 7:0, 2:1}, inplace=True)

# Letter

In [15]:
df_letter.head()

Unnamed: 0,T,2,8,3,5,1,8.1,13,0,6,6.1,10,8.2,0.1,8.3,0.2,8.4
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [16]:
df_letter.shape

(19999, 17)

In [17]:
len(df_letter['T'].value_counts())

26

'T' column is target. According to the paper, replace letters A-M as 1 and the rest as 0

In [18]:
alphabet = list(string.ascii_uppercase)
df_letter['T'].replace(alphabet[0:13], 1, inplace=True)
df_letter['T'].replace(alphabet[13:], 0, inplace=True)

### Split data points into inputs/outputs

In [19]:
df_adult_X = df_adult.drop(['<=50K'], axis=1)
df_adult_Y = df_adult['<=50K']

In [20]:
df_ctype_X = df_ctype.drop(['5'], axis=1)
df_ctype_Y = df_ctype['5']

In [21]:
df_letter_X = df_letter.drop(['T'], axis=1)
df_letter_Y = df_letter['T']

### Training/Validation

Setting up all classifiers, pipelines, parameter grids, and GridSearchCV objects

In [22]:
# Define classifiers
clf1 = KNeighborsClassifier()
clf2 = LogisticRegression(random_state=1, max_iter=1000)
clf3 = RandomForestClassifier(random_state=1)


# pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids
grid1 = [{'classifier__n_neighbors': np.linspace(1, 500, 25).astype(int),
                'classifier__weights': ['uniform', 'distance']}]

grid2 = [{'classifier__C': np.power(10., np.arange(-8, 5))}]

grid3 = [{'classifier__max_features': [1, 2, 4, 6, 8, 12, 16],
          'classifier__n_estimators': [1024]}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((grid1, grid2, grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'Logistic', 'RF')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=StratifiedKFold(n_splits=5),
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [23]:
def train_test(df_X, df_Y):
    '''For dataset, randomly choose 5000 data samples for 5-fold CV, select best hyperparameters for each
       algorithm, and find training/test accuracy scores. Do this for 3 trials'''
    num_trials = 3
    for i in range(num_trials):
    
        # randomly choose 5000 data samples for 5-fold CV
        X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, train_size=5000, stratify=df_Y)
    
        # grid search to select hyperparameters
        #acc_scores_trial = {name: 1 for name, gs_est in gridcvs.items()}
        for name, gsearch in sorted(gridcvs.items()):
            # fit grid search to training, evaluate on test set
            gsearch.fit(X_train, y_train)
            gsearch.best_estimator_.fit(X_train, y_train) # fit to entire train set
            train_acc = accuracy_score(y_true=y_train, y_pred=gsearch.best_estimator_.predict(X_train)) # accuracy on train
            test_acc = accuracy_score(y_true=y_test, y_pred=gsearch.best_estimator_.predict(X_test)) # accuracy on test
            print('Trial',i+1,'Training Accuracy for',name,': %.3f' %train_acc) # print accuracy for training
            print('Trial',i+1,'Test Accuracy for',name,': %.3f' %test_acc) # print accuracy for test

### Accuracy scores

In [24]:
print('ADULT')
train_test(df_adult_X, df_adult_Y)

ADULT
Trial 1 Training Accuracy for KNN : 0.837
Trial 1 Test Accuracy for KNN : 0.830
Trial 1 Training Accuracy for Logistic : 0.858
Trial 1 Test Accuracy for Logistic : 0.849
Trial 1 Training Accuracy for RF : 1.000
Trial 1 Test Accuracy for RF : 0.853
Trial 2 Training Accuracy for KNN : 0.839
Trial 2 Test Accuracy for KNN : 0.830
Trial 2 Training Accuracy for Logistic : 0.860
Trial 2 Test Accuracy for Logistic : 0.848
Trial 2 Training Accuracy for RF : 1.000
Trial 2 Test Accuracy for RF : 0.853
Trial 3 Training Accuracy for KNN : 1.000
Trial 3 Test Accuracy for KNN : 0.830
Trial 3 Training Accuracy for Logistic : 0.863
Trial 3 Test Accuracy for Logistic : 0.848
Trial 3 Training Accuracy for RF : 1.000
Trial 3 Test Accuracy for RF : 0.853


In [25]:
print('COV_TYPE')
train_test(df_ctype_X, df_ctype_Y)

COV_TYPE
Trial 1 Training Accuracy for KNN : 1.000
Trial 1 Test Accuracy for KNN : 0.774
Trial 1 Training Accuracy for Logistic : 0.754
Trial 1 Test Accuracy for Logistic : 0.752
Trial 1 Training Accuracy for RF : 1.000
Trial 1 Test Accuracy for RF : 0.824
Trial 2 Training Accuracy for KNN : 1.000
Trial 2 Test Accuracy for KNN : 0.773
Trial 2 Training Accuracy for Logistic : 0.759
Trial 2 Test Accuracy for Logistic : 0.754
Trial 2 Training Accuracy for RF : 1.000
Trial 2 Test Accuracy for RF : 0.823
Trial 3 Training Accuracy for KNN : 1.000
Trial 3 Test Accuracy for KNN : 0.770
Trial 3 Training Accuracy for Logistic : 0.748
Trial 3 Test Accuracy for Logistic : 0.757
Trial 3 Training Accuracy for RF : 1.000
Trial 3 Test Accuracy for RF : 0.830


In [26]:
print('LETTER')
train_test(df_letter_X, df_letter_Y)

LETTER
Trial 1 Training Accuracy for KNN : 1.000
Trial 1 Test Accuracy for KNN : 0.952
Trial 1 Training Accuracy for Logistic : 0.721
Trial 1 Test Accuracy for Logistic : 0.725
Trial 1 Training Accuracy for RF : 1.000
Trial 1 Test Accuracy for RF : 0.947
Trial 2 Training Accuracy for KNN : 1.000
Trial 2 Test Accuracy for KNN : 0.955
Trial 2 Training Accuracy for Logistic : 0.730
Trial 2 Test Accuracy for Logistic : 0.725
Trial 2 Training Accuracy for RF : 1.000
Trial 2 Test Accuracy for RF : 0.948
Trial 3 Training Accuracy for KNN : 1.000
Trial 3 Test Accuracy for KNN : 0.955
Trial 3 Training Accuracy for Logistic : 0.724
Trial 3 Test Accuracy for Logistic : 0.722
Trial 3 Training Accuracy for RF : 1.000
Trial 3 Test Accuracy for RF : 0.947
