# Test Classifiers and Regressors on train / test data set

In [11]:
import numpy as np
import math
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score,cross_val_predict, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC,SVR,LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score,roc_curve, auc, get_scorer, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy.stats import spearmanr, pearsonr
from matplotlib import pyplot 
from sklearn import linear_model,neighbors,tree,gaussian_process,ensemble,neural_network, manifold

## Parameters

In [12]:
rel = "./"
train_data_file = rel+"features.csv"
cv_fold = 10

## Classifiers to test

In [13]:
classifiers = [
    ['KNN',KNeighborsClassifier(3)],
    ['DTC',DecisionTreeClassifier()],
    ['RFC',RandomForestClassifier()],
    ['ADA',AdaBoostClassifier()],
    ['GNB',GaussianNB()],
    ['LDA',LinearDiscriminantAnalysis()],
    ['QDA',QuadraticDiscriminantAnalysis()]
]

## Load data

In [14]:
import csv, json

import io
import csv

def parse(data_file, sep):
    d = pd.read_csv(data_file, header=0).as_matrix()
    x = d[:,:-1]
    y = d[:,-1]
    return x,y


In [15]:
def multiclass(y_class):
    y = np.zeros([len(y_class),len(np.unique(y_class))])
    for idx, val in enumerate(y_class):
        y[idx,int(val)]=1
    return y

In [16]:
x_train,y_train = parse(train_data_file,sep=',')

In [17]:
x_train.shape

(489L, 16L)

In [18]:
y_train_mul = multiclass(y_train)

In [19]:
pd.DataFrame(x_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,130.0,1154.0,0.0,41.196853,274.99829,0.0,5.885265,54.999658,0.0,2.026329,1.939921,0.0,0.289476,0.387984,0.0
1,1.0,218.0,94.0,0.0,62.127774,28.952765,0.0,6.212777,5.790553,0.0,2.445318,1.360861,0.0,0.244532,0.272172,0.0
2,2.0,69.0,48.0,8.0,19.645238,11.21565,3.5,1.964524,1.121565,1.75,3.569048,2.705349,0.5,0.356905,0.270535,0.25
3,3.0,99.0,8.0,0.0,35.043599,0.0,0.0,3.893733,0.0,0.0,3.527346,0.0,0.0,0.391927,0.0,0.0
4,4.0,298.0,79.0,0.0,76.303046,22.552548,0.0,5.869465,2.255255,0.0,4.398763,3.306795,0.0,0.338366,0.330679,0.0


## Score classifiers

In [22]:
import sklearn.pipeline as pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import BaggingClassifier
import sklearn.metrics as metrics

classifiers_scores = []
# iterate over classifiers
for name,clf in classifiers:
    #clf =  ensemble.VotingClassifier(classifiers,voting='soft')
    y_score_clf = cross_val_predict(clf, x_train, y_train, cv=cv_fold,method='predict_proba')
    #score_max = f1_score(y_train_class, y_score_clf[:,1]>0.5)
    classifiers_scores.append(pearsonr(y_train,y_score_clf[:,1])[0])

In [23]:
pd.DataFrame({"Classifier":[name for name,classifier in classifiers],\
              "Pearson":classifiers_scores},\
             columns=["Classifier","Pearson"]\
            )

Unnamed: 0,Classifier,Pearson
0,KNN,0.086189
1,DTC,-0.096073
2,RFC,-0.017703
3,ADA,0.024885
4,GNB,-0.034307
5,LDA,0.03803
6,QDA,-0.047862


## The best classifier

In [31]:
max_cls = np.where(classifiers_scores == max(classifiers_scores))[0][0]
clf = classifiers[max_cls][1]
print(clf)
y_score_clf = cross_val_predict(clf, x_train, y_train, cv=cv_fold,method='predict_proba')
y_score = y_score_clf[:,1]
print("Pearson:\t%0.2f"%pearsonr(y_train,y_score)[0])
plot_roc_curve(y_train_class,y_score)
    



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
Pearson:	0.09


ValueError: Mix type of y not allowed, got types set(['continuous', 'multiclass'])

In [25]:
out_regr_err = 'class.res.csv'
columns = ['Plag','Truth','Classification','Diff','Str1','Str2']
pd.DataFrame(np.array([y_test,y_train_class,y_score_class,y_test_class-y_score_class,pairs_test[:,0],pairs_test[:,1]]).T,columns=columns).to_csv(out_regr_err)

NameError: name 'y_test' is not defined