# Classification

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from ipywidgets import FloatProgress
from matplotlib.colors import ListedColormap
from sklearn import ensemble, cross_validation, learning_curve, metrics, multiclass, preprocessing
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the three-class dataset; the CSV's unnamed first column is used as the index.
data = pd.read_csv('threeclassdata.csv', index_col='Unnamed: 0')

# Feature table and multiclass target.
X = data.drop('Label', axis = 1)
y = data['Label']

# Positions of the last row labelled 0 and the last row labelled 1.
# Retained because they were defined here before; the binary targets below
# no longer depend on them.
pia_border = np.where(data['Label'] == 0)[0][-1]
vio_border = np.where(data['Label'] == 1)[0][-1]


def make_one_vs_rest_frame(is_positive):
    """Return a copy of the features with a binary 'Label' column.

    Parameters
    ----------
    is_positive : boolean sequence aligned row-by-row with ``data``; True
        marks the rows that belong to the positive class.

    Returns
    -------
    DataFrame with the feature columns of ``data`` and a float 'Label'
    column that is 1.0 for the positive rows and 0.0 for the rest.
    """
    frame = data.drop('Label', axis = 1)
    frame['Label'] = np.asarray(is_positive, dtype = float)
    return frame


# Class membership is taken from the labels themselves. The previous version
# used the border positions above as element counts, which shifted every
# class boundary by one row and only worked at all for a CSV sorted by label.
is_pia = y == 0
is_vio = y == 1
# The third class is every row that is neither 0 nor 1, matching the
# original "everything after the class-1 rows" definition.
is_gac = ~(is_pia | is_vio)

pia_data = make_one_vs_rest_frame(is_pia)
X_pia = pia_data.drop('Label', axis = 1)
y_pia = pia_data['Label']

vio_data = make_one_vs_rest_frame(is_vio)
X_vio = vio_data.drop('Label', axis = 1)
y_vio = vio_data['Label']

gac_data = make_one_vs_rest_frame(is_gac)
X_gac = gac_data.drop('Label', axis = 1)
y_gac = gac_data['Label']

In [3]:
def split_and_scale(features, labels, test_size = 0.3):
    """Make a stratified train/test split and standardise the features.

    A new StandardScaler is fitted on the training part only and then applied
    to both parts, so no statistics of the test part enter the scaling.

    Parameters
    ----------
    features : feature table to split.
    labels : targets aligned with ``features``; also used for stratification.
    test_size : fraction of the samples assigned to the test part.

    Returns
    -------
    (features_train, features_test, labels_train, labels_test, scaler)
        The scaled feature arrays, the matching label splits and the scaler
        fitted on the training features.
    """
    (features_train,
     features_test,
     labels_train,
     labels_test) = cross_validation.train_test_split(features, labels,
                                                      test_size = test_size,
                                                      stratify = labels)

    scaler = preprocessing.StandardScaler()
    features_train = scaler.fit_transform(features_train)
    features_test = scaler.transform(features_test)
    return features_train, features_test, labels_train, labels_test, scaler


# The splits are drawn in the same order as before (multiclass, pia, vio, gac).
# Each split keeps its own scaler; `sc` is now the one fitted on X_train
# instead of a shared instance that ended up fitted on the gac split.
X_train, X_test, y_train, y_test, sc = split_and_scale(X, y)

(X_pia_train,
 X_pia_test,
 y_pia_train,
 y_pia_test,
 sc_pia) = split_and_scale(X_pia, y_pia)

(X_vio_train,
 X_vio_test,
 y_vio_train,
 y_vio_test,
 sc_vio) = split_and_scale(X_vio, y_vio)

(X_gac_train,
 X_gac_test,
 y_gac_train,
 y_gac_test,
 sc_gac) = split_and_scale(X_gac, y_gac)

In [4]:
def get_score(model, X, y, cv = 10, n_jobs = 1):
    """Return the mean cross-validation score of ``model`` on ``(X, y)``.

    ``cv`` and ``n_jobs`` are forwarded unchanged to
    ``cross_validation.cross_val_score``; the per-fold scores it returns
    are averaged into a single number.
    """
    fold_scores = cross_validation.cross_val_score(model, X, y, cv = cv, n_jobs = n_jobs)
    return fold_scores.mean()

## RandomForest

Direct multiclass classification, without the multi-label formulation

In [5]:
# Random forest of 50 trees for the direct three-class problem.
rf = ensemble.RandomForestClassifier(
    n_estimators=50,
)

In [6]:
# Train the forest on the scaled multiclass training split. As the last
# expression of the cell, the fitted estimator's repr is shown as output.
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
# Score of the multiclass forest on the held-out test split.
score = rf.score(X_test, y_test)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(score)

0.632302405498


Multi-label variant (not strictly necessary): a separate binary classifier for each class

In [8]:
# Binary "is pia" forest: fitted on the pia training split, scored on its test split.
rf_ispia = ensemble.RandomForestClassifier(n_estimators = 50)
rf_ispia.fit(X_pia_train, y_pia_train)

score = rf_ispia.score(X_pia_test, y_pia_test)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(score)

0.714776632302


In [9]:
# Binary "is vio" forest: fitted on the vio training split, scored on its test split.
rf_isvio = ensemble.RandomForestClassifier(n_estimators = 50)
rf_isvio.fit(X_vio_train, y_vio_train)

score = rf_isvio.score(X_vio_test, y_vio_test)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(score)

0.819587628866


In [10]:
# Binary "is gac" forest: fitted on the gac training split, scored on its test split.
rf_isgac = ensemble.RandomForestClassifier(n_estimators = 50)
rf_isgac.fit(X_gac_train, y_gac_train)

score = rf_isgac.score(X_gac_test, y_gac_test)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(score)

0.723367697595


**TODO:** a combined "supertrain" dataset should have been prepared in advance; this is still to be done.

Unsurprisingly, `sklearn` already provides a multi-label classifier.

In [11]:
# Wrap every single class label in a one-element list so MultiLabelBinarizer
# produces one indicator column per class.
# The loop variable must not be called `y`: under Python 2 a list
# comprehension leaks its variable into the enclosing scope, which would
# replace the label Series `y` with a scalar.
mlb = preprocessing.MultiLabelBinarizer()
Y_train = mlb.fit_transform([[label] for label in y_train])
# Encode the test labels with the binarizer fitted on the training labels so
# both indicator matrices share the same column layout.
Y_test = mlb.transform([[label] for label in y_test])

# One binary random forest per indicator column.
clf = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(n_estimators = 50))
clf.fit(X_train, Y_train)

score = clf.score(X_test, Y_test)

# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(score)



0.501718213058
0.501718213058


Poor quality: either because the problem is not really multi-label, or because the metric is unsuitable. Come up with another metric?