## Libraries

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the three raw data files from the current working directory.
# NOTE(review): read_csv is used with its default header handling, so the first
# record of each file is promoted to the column labels (the outputs below show
# labels such as '39', 'State-gov' and '<=50K'). Later cells index columns by
# those labels ('<=50K', '5', 'T'), so switching to header=None would require
# updating them as well.
df_adult = pd.read_csv('adult.data')
df_covtype = pd.read_csv('covtype.data')
df_letter = pd.read_csv('letter-recognition.data')

# **Adults**

In [3]:
df_adult.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
df_adult.shape

(32560, 15)

In [5]:
# drop the blanks embedded in the column labels (the raw file pads fields with a space)
df_adult.columns = [label.replace(' ', '') for label in df_adult.columns]

In [6]:
df_adult.columns

Index(['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
       'Adm-clerical', 'Not-in-family', 'White', 'Male', '2174', '0', '40',
       'United-States', '<=50K'],
      dtype='object')

In [7]:
df_adult['<=50K'].unique()

array([' <=50K', ' >50K'], dtype=object)

### Feature engineering (adult)

In [8]:
# small throwaway frame used to try out shuffling and slicing
p = pd.DataFrame(
    data={
        'a': [1, 55, 456, 223, 789, 43],
        'b': [90, 24, 4325, 4675, 2314, 43],
        'c': [123, 324, 12334, 543, 345, 129],
        'd': [3214, 1234, 325, 453, 213, 567],
    }
)
p

Unnamed: 0,a,b,c,d
0,1,90,123,3214
1,55,24,324,1234
2,456,4325,12334,325
3,223,4675,543,453
4,789,2314,345,213
5,43,43,129,567


In [9]:
shuffle = p.sample(frac=1)
shuffle

Unnamed: 0,a,b,c,d
3,223,4675,543,453
4,789,2314,345,213
2,456,4325,12334,325
5,43,43,129,567
0,1,90,123,3214
1,55,24,324,1234


In [10]:
shuffle.iloc[0:2]

Unnamed: 0,a,b,c,d
3,223,4675,543,453
4,789,2314,345,213


In [11]:
df_adult.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [12]:
# Binarise the income label for binary classification: ' <=50K' -> 0, ' >50K' -> 1
# (the raw labels carry a leading space, see the unique() output above).
# The result is assigned back to the frame instead of calling
# `df_adult['<=50K'].replace(..., inplace=True)`: under pandas Copy-on-Write the
# chained in-place call only modifies a temporary Series and leaves df_adult untouched.
# The explicit astype(int) guarantees a numeric column, so the object-dtype
# detection used for one-hot encoding does not pick up the target.
df_adult['<=50K'] = df_adult['<=50K'].replace({' <=50K': 0, ' >50K': 1}).astype(int)

In [13]:
# one-hot encode every column whose dtype is object (the categorical features)
categorical = [name for name, dtype in df_adult.dtypes.items() if dtype == 'object']

df_adult = pd.get_dummies(df_adult, columns=categorical)

# Covtype

In [14]:
df_covtype.head()

Unnamed: 0,2596,51,3,258,0,510,221,232,148,6279,...,0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41,0.42,5
0,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
1,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
2,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
3,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
4,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,2


In [15]:
df_covtype.shape

(581011, 55)

Select 30,000 random data points for the analysis, since that is the sample size Caruana uses in the paper; working with all 581,011 data points would also be too expensive.

In [16]:
# Draw a random subset of (at most) 30000 rows for the analysis.
# Sampling n rows directly avoids shuffling the whole frame just to keep the
# first 30000, and the fixed random_state makes the subset — and every number
# derived from it — reproducible across kernel restarts.
df_ctype = df_covtype.sample(n=min(30000, len(df_covtype)), random_state=12345)

In [17]:
df_ctype['5'].value_counts()

2    14627
1    10955
3     1808
7     1060
6      944
5      475
4      131
Name: 5, dtype: int64

Last column is target. A value of 2 is converted to 1, the rest are converted to 0 for binary classification

In [18]:
# Last column ('5') is the target. Class 2 becomes 1 and every other class
# becomes 0 for binary classification.
# Assigned back to the frame rather than using a chained
# `df_ctype['5'].replace(..., inplace=True)`, which under pandas Copy-on-Write
# only changes a temporary Series and leaves df_ctype unmodified.
df_ctype['5'] = (df_ctype['5'] == 2).astype(int)

In [19]:
df_ctype['5'].value_counts()

0    15373
1    14627
Name: 5, dtype: int64

# Letter

In [26]:
df_letter.head()

Unnamed: 0,T,2,8,3,5,1,8.1,13,0,6,6.1,10,8.2,0.1,8.3,0.2,8.4
0,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
1,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
2,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
3,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10
4,S,4,11,5,8,3,8,8,6,9,5,6,6,0,8,9,7


In [27]:
df_letter.shape

(19999, 17)

In [57]:
len(df_letter['T'].value_counts())

26

The 'T' column is the target. Following the paper, the letters A-M are relabelled as 1 and the remaining letters (N-Z) as 0.

In [50]:
# Relabel the target: the first 13 letters (A-M) become 1, the rest (N-Z) become 0.
alphabet = list(string.ascii_uppercase)
letter_to_label = {letter: int(pos < 13) for pos, letter in enumerate(alphabet)}
# A single mapping assigned back to the frame replaces the two chained
# `df_letter['T'].replace(..., inplace=True)` calls: under pandas Copy-on-Write
# those act on a temporary Series and never reach df_letter. astype(int) makes
# the label dtype explicit instead of relying on replace() to downcast.
df_letter['T'] = df_letter['T'].replace(letter_to_label).astype(int)



#df.replace([0, 1, 2, 3], 4)
#df_ctype['5'].replace({1:0, 3:0, 4:0, 5:0, 6:0, 7:0, 2:1}, inplace=True)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

### Split data points into inputs/outputs

In [22]:
# randomly shuffle data points
#df_adult = df_adult.sample(frac=1)

In [20]:
# separate into training/validation and testing
#cv = df_adult.iloc[0:5000]
#test = df_adult.iloc[5000:]

# separate inputs/outputs
#cv_X = cv.drop(['<=50K'], axis=1)
#cv_Y = cv['<=50K']
#test_X = test.drop(['<=50K'], axis=1)
#test_Y = test['<=50K']

# target: binarised income label; inputs: every remaining column
df_adult_Y = df_adult['<=50K']
df_adult_X = df_adult.drop(columns=['<=50K'])
#dfa_X_train, dfa_X_test, dfa_y_train, dfa_y_test = train_test_split(df_adult_X, df_adult_Y, random_state=12345,
                                                                    #train_size=5000, stratify=df_adult_Y)

In [21]:
# target: binarised cover-type label in column '5'; inputs: every remaining column
df_ctype_Y = df_ctype['5']
df_ctype_X = df_ctype.drop(columns=['5'])

In [58]:
# target: binarised letter label in column 'T'; inputs: every remaining column
df_letter_Y = df_letter['T']
df_letter_X = df_letter.drop(columns=['T'])

**use [52] instead of [47] and [49]**

In [None]:
#def rand_split(df):
    #'''randomly shuffle data points and split into training/validation and testing'''
    
    #df = df.sample(frac=1)
    
    # separate into training/validation and testing
    #cv = df.iloc[0:5000]
    #test = df.iloc[5000:]
    
    # separate inputs/outputs
    #X = df.drop(['<=50K'], axis=1)
    #Y = cv['<=50K']
    #test_X = test.drop(['<=50K'], axis=1)
    #test_Y = test['<=50K']
    
    #X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=12345, train_size=5000, stratify=Y)
    

#def standardize_df(df):
   # '''standardize each feature of the data'''
    #scaler = StandardScaler()
   # return scaler.fit_transform(df)

### Training/Validation

In [None]:
np.logspace(0,1,4)

In [None]:
np.linspace(0,10,4)

Setting up all classifiers, pipelines, parameter grids, and GridSearchCV objects

In [None]:
a = np.array([5.3242,24.54,90.36,29.54])
np.array([5.3242,24.54,90.36,29.54]).astype(int)

In [23]:
# Define classifiers (seeds fixed where the estimator accepts one)
clf1 = KNeighborsClassifier()
clf2 = LogisticRegression(random_state=1)
clf3 = RandomForestClassifier(random_state=1)


# Pipelines: the scaler is part of the estimator, so during cross-validation it
# is re-fitted on each training fold rather than on the full data.
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids
# KNN: 25 neighbourhood sizes spread evenly over 1..500, both weighting schemes
# later use np.logspace(0, 2.69897, 25) for n_neighbors
grid1 = [{'classifier__n_neighbors': np.linspace(1, 500, 25).astype(int),
          'classifier__weights': ['uniform', 'distance']}]

# Logistic regression: C is the inverse regularisation strength and must be
# strictly positive, so the former extra candidate C=0 was not a valid way to
# request an unregularised model and has been removed. The grid now covers
# 1e-8 .. 1e4 in factors of 10; the largest values are close to unregularised.
grid2 = [{'classifier__C': np.power(10., np.arange(-8, 5))}]

# Random forest: vary the number of features tried per split, fixed forest size.
# NOTE(review): 20 exceeds the 16 input features of the letter data; confirm how
# the installed scikit-learn handles max_features > n_features for that dataset.
grid3 = [{'classifier__max_features': [1, 2, 4, 6, 8, 12, 16, 20],
          'classifier__n_estimators': [1024]}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm.
# refit=True means that after the search each object holds a best_estimator_
# already re-trained on the complete data passed to fit().
gridcvs = {}

for pgrid, est, name in zip((grid1, grid2, grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'Logistic', 'RF')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=StratifiedKFold(n_splits=5),
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv

In [None]:
#sorted(gridcvs.items())

In [24]:
# array of array of scores for each algorithm, each array length 3 since there are 3 trials
#acc_scores = {name: [] for name, gs_est in gridcvs.items()}

#skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
#c = 1
#for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    #for name, gs_est in sorted(gridcvs.items()): # name refers to name of algorithm, gs_est refers to gridsearch
        #print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        #gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        #y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        #acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        #print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
         #     (gs_est.best_score_ * 100, acc * 100))
        #cv_scores[name].append(acc)

    #c += 1

    
# START HERE
# input is a dataset, we're going to call this method 3 times (for 3 datasets)

# array of array of scores for each algorithm, each array length 3 since there are 3 trials
#acc_scores = {name: [] for name, gs_est in gridcvs.items()}

#skfold = StratifiedKFold( n_splits=5, shuffle=True, random_state=1)

def train_test(df_X, df_Y, num_trials=3, train_size=5000, random_state=12345):
    '''Tune and evaluate every algorithm in ``gridcvs`` on one dataset over several trials.

    In each trial a stratified random subset of ``train_size`` samples is drawn
    for training and all remaining samples form the test set. Every GridSearchCV
    object in the module-level ``gridcvs`` dict is fitted on the training subset
    to select hyperparameters, and the resulting best estimator is scored on the
    training subset and on the test set. Both accuracies are printed.

    Parameters
    ----------
    df_X : input features of the dataset.
    df_Y : target labels; also used to stratify the split.
    num_trials : number of independent trials to run (default 3).
    train_size : number of samples drawn for training/cross-validation (default 5000).
    random_state : base seed for the split. Trial ``i`` (0-based) uses
        ``random_state + i`` so that each trial sees a different, yet
        reproducible, split. Pass ``None`` for unseeded splits.

    Returns
    -------
    None. Results are reported through ``print`` only.
    '''
    for trial in range(num_trials):
        # A constant seed would reproduce the very same split in every trial,
        # making the repeated trials redundant, so the seed is offset per trial.
        split_seed = None if random_state is None else random_state + trial

        # randomly choose the training samples; everything else is the test set
        X_train, X_test, y_train, y_test = train_test_split(
            df_X, df_Y,
            train_size=train_size,
            stratify=df_Y,
            random_state=split_seed)

        for name, gsearch in sorted(gridcvs.items()):
            # grid search with cross-validation to select hyperparameters
            gsearch.fit(X_train, y_train)

            # The searches are created with refit=True, so best_estimator_ has
            # already been trained on the whole training subset; fitting it
            # again would only repeat that work.
            best_model = gsearch.best_estimator_

            train_acc = accuracy_score(y_true=y_train, y_pred=best_model.predict(X_train))
            test_acc = accuracy_score(y_true=y_test, y_pred=best_model.predict(X_test))
            print('Trial',trial+1,'Training Accuracy for',name,': %.3f' %train_acc) # print accuracy for training
            print('Trial',trial+1,'Test Accuracy for',name,': %.3f' %test_acc) # print accuracy for test

### Accuracy scores

In [None]:
print('ADULT')
train_test(df_adult_X, df_adult_Y)
print('\n')

In [None]:
print('COV_TYPE')
train_test(df_ctype_X, df_ctype_Y)

In [None]:
print('LETTER')
train_test(df_letter_X, df_letter_Y)

In [None]:
a = 0.9
l = 'LOG'
print('Trial',1,'Training Accuracy for',l,': %.2f' %a)
print('la')

In [None]:
a = 5
a+=1
a

In [None]:
test = {'a':40000, 'b':500, 'c':9000}
#max(test[i] for i in test)
max(test, key=test.get)

In [None]:
a = [100,200,300,49,50000]
a.append(1000000)
max(a)

In [None]:
len(gridcvs.items()) # array of tuples - (name of algorithm, GridSearchCV)

In [None]:
np.concatenate((np.array(0), np.power(10., np.arange(-8, 5))), axis=None)

In [None]:
np.power(10., np.arange(-4, 4))

In [None]:
np.arange(-4,4)