# Testing Multiple Classifiers

The objective of this study is to compare the performance of several classifiers and of an ensemble that averages their predicted probabilities.

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Rebind the module attribute so that later ``warnings.warn(...)`` calls made
# through the module hit the no-op above.
warnings.warn = warn

%matplotlib inline

# from sklearn.preprocessing import LabelEncoder
# from sklearn.cross_validation import StratifiedShuffleSplit

# Training table, read without its 'id' column.
train = pd.read_csv('../input/train.csv').drop('id', axis=1)

# Test table: pop() removes the 'id' column from the frame and returns it,
# so the ids are kept in test_ids while the frame is left without them.
test = pd.read_csv('../input/test.csv')
test_ids = test.pop('id')

There are no null values in either the train or the test data.

In [2]:
# Print, for each table in turn, whether any cell in it is null.
for frame in (train, test):
    print(frame.isnull().any().any())

False
False


# Label Encoding the Target

In [3]:
from sklearn.preprocessing import LabelEncoder
# Encoder for the species labels; its `classes_` attribute is reused later as
# the column names of the submission frames.
le = LabelEncoder()

In [4]:
# Take the 'species' column out of the feature frame (pop removes it in place)
# and encode it with the label encoder to obtain the training targets.
species = train.pop('species')
y_train = le.fit_transform(species)

# Normalize the Sparse Features

In [5]:
from sklearn.preprocessing import MaxAbsScaler

In [6]:
# Stack the training rows on top of the test rows and scale the stacked matrix
# with a single MaxAbsScaler.
# NOTE(review): the scaler is therefore fitted on test rows as well as
# training rows — confirm this is intended.
x_data = np.vstack((train, test))
mas = MaxAbsScaler()
n_x_data = mas.fit(x_data).transform(x_data)
print(n_x_data.shape)
n_x_data

(1584, 192)


array([[ 0.08888282,  0.11428711,  0.13953682, ...,  0.01298739,
         0.        ,  0.16994177],
       [ 0.06666212,  0.        ,  0.18604513, ...,  0.00259854,
         0.44943277,  0.1503313 ],
       [ 0.06666212,  0.04762044,  0.11627672, ...,  0.        ,
         0.23595738,  0.01961047],
       ..., 
       [ 0.19999772,  0.14285645,  0.09302256, ...,  0.        ,
         0.49438525,  0.0457533 ],
       [ 0.15555631,  0.04762044,  0.36046318, ...,  0.        ,
         0.13483443,  0.12418847],
       [ 0.        ,  0.57143554,  0.        , ...,  0.04155806,
         0.        ,  0.11764942]])

# Split the dataset - raw features

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# The first len(species) rows of the scaled matrix are the training rows;
# everything after them is the test set.
n_train_rows = len(species)
x_train = n_x_data[:n_train_rows, :]
n_x_test = n_x_data[n_train_rows:, :]

sss = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=0)

# Two splits are generated but only the last one is used for the
# train/validation partition.
train_index, test_index = list(sss.split(x_train, y_train))[-1]
n_x_train, n_y_train = x_train[train_index], y_train[train_index]
n_x_val, n_y_val = x_train[test_index], y_train[test_index]

In [8]:
# Shapes of the train / validation / test arrays, one per line.
for arr in (n_x_train, n_y_train, n_x_val, n_y_val, n_x_test):
    print(arr.shape)

(891, 192)
(891,)
(99, 192)
(99,)
(594, 192)


In [9]:
# Check whether any encoded training label is NaN.
np.any(np.isnan(y_train))

False

# Setting up models and grid search

In [10]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection (the module this
# notebook already uses for StratifiedShuffleSplit); the older
# sklearn.grid_search module is not available in current scikit-learn.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, log_loss

In [11]:
seed = 1

# Each short name is paired with the estimator it refers to; the two parallel
# lists used by the rest of the notebook are derived from these pairs.
named_estimators = [
    ('ADB', AdaBoostClassifier(random_state=seed)),
    ('GBC', GradientBoostingClassifier(random_state=seed)),
    ('RFC', RandomForestClassifier(random_state=seed, n_jobs=-1)),
    ('KNC', KNeighborsClassifier(n_jobs=-1)),
    # probability=True so that the SVC exposes predict_proba like the others.
    ('SVC', SVC(random_state=seed, probability=True)),
    ('logisticRegression',
     LogisticRegression(solver='newton-cg', multi_class='multinomial')),
]
models = [label for label, _ in named_estimators]
clfs = [model for _, model in named_estimators]

In [12]:
# One parameter grid per entry of `models`, listed in the same order. Every
# grid holds a single value per parameter.
param_grids = [
    # ADB
    {'learning_rate': [0.01], 'n_estimators': [150]},
    # GBC
    {'learning_rate': [0.01], 'n_estimators': [100], 'max_depth': [3],
     'min_samples_split': [2], 'min_samples_leaf': [2]},
    # RFC
    {'n_estimators': [100], 'criterion': ['gini'], 'min_samples_split': [2],
     'min_samples_leaf': [4]},
    # KNC
    {'n_neighbors': [5], 'weights': ['distance'], 'leaf_size': [15]},
    # SVC
    {'C': [100], 'tol': [0.005], 'kernel': ['sigmoid']},
    # logisticRegression
    {'C': [2000], 'tol': [0.0001]},
]
params = dict(zip(models, param_grids))

In [13]:
# Running sum of the per-model test-set probability estimates; starts as the
# scalar 0 and is added to inside the model loop.
y_test = 0
# Collects one (validation accuracy, best CV score) tuple per model.
test_scores = []

In [14]:
# Reset the accumulators in this cell so that re-running it on its own does
# not add a second round of predictions on top of an earlier run.
y_test = 0
test_scores = []

for name, estimator in zip(models, clfs):
    print(name)
    # 'neg_log_loss' is the scorer name accepted by current scikit-learn (the
    # old 'log_loss' alias was removed); values are negated log loss, so
    # higher is better. refit takes the boolean True, not the string 'True'.
    clf = GridSearchCV(estimator, params[name], scoring='neg_log_loss',
                       refit=True, n_jobs=-1, cv=5)
    clf.fit(n_x_train, n_y_train)

    print("best params: " + str(clf.best_params_))
    print("best scores: " + str(clf.best_score_))

    # Class probabilities for the test set, added to the running sum that is
    # later averaged over all models.
    estimates = clf.predict_proba(n_x_test)
    y_test += estimates

    # Accuracy on the held-out validation split, using the refitted best model.
    acc = accuracy_score(n_y_val, clf.predict(n_x_val))
    print("Accuracy: {:.4%}".format(acc))

    test_scores.append((acc, clf.best_score_))

    # Write this model's own submission file: one row per test id, one column
    # per class name.
    submission = pd.DataFrame(estimates, index=test_ids, columns=le.classes_)
    submission.to_csv('./'+name+'.csv')

ADB
best params: {'n_estimators': 150, 'learning_rate': 0.01}
best scores: -2.271160197186102
Accuracy: 63.6364%
GBC
best params: {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 2, 'learning_rate': 0.01, 'min_samples_leaf': 2}
best scores: -2.2416002434828592
Accuracy: 67.6768%
RFC
best params: {'min_samples_split': 2, 'criterion': 'gini', 'min_samples_leaf': 4, 'n_estimators': 100}
best scores: -1.0429103612341104
Accuracy: 98.9899%
KNC
best params: {'leaf_size': 15, 'weights': 'distance', 'n_neighbors': 5}
best scores: -0.173115044295944
Accuracy: 97.9798%
SVC
best params: {'C': 100, 'tol': 0.005, 'kernel': 'sigmoid'}
best scores: -2.409326500304996
Accuracy: 98.9899%
logisticRegression
best params: {'C': 2000, 'tol': 0.0001}
best scores: -0.03842124714330611
Accuracy: 98.9899%


In [15]:
# Turn the summed probability estimates into their mean over all models and
# write the averaged predictions as one more submission file.
n_models = len(models)
y_test = y_test / n_models
submission = pd.DataFrame(data=y_test, columns=le.classes_, index=test_ids)
submission.to_csv('./avgEnsembles.csv')