In [1]:
import numpy as np

<h2>Leitura de dataset</h2>

In [2]:
X_train = np.genfromtxt('Datasets/train/X_train.txt')
print(X_train.shape) 

y_train = np.genfromtxt('Datasets/train/y_train.txt')
print(y_train.shape)

X_test = np.genfromtxt('Datasets/test/X_test.txt')
print(X_test.shape)

y_test = np.genfromtxt('Datasets/test/y_test.txt')
print(y_test.shape)

(7352, 561)
(7352,)
(2947, 561)
(2947,)


<h2>Normalização</h2>

In [3]:
print('Valor mínimo de X_train {}'.format(X_train.min()))
print('Valor máximo de X_train {}'.format(X_train.max()))

print('Valor mínimo de X_test {}'.format(X_test.min()))
print('Valor máximo de X_test {}'.format(X_test.max()))

Valor mínimo de X_train -1.0
Valor máximo de X_train 1.0
Valor mínimo de X_test -1.0
Valor máximo de X_test 1.0


<h2>Balanceamento</h2>

In [4]:
unicos = np.unique(y_train)

for u in unicos:
    print('valore {}, quantidade {}'.format(u, np.count_nonzero(y_train == u)))

valore 1.0, quantidade 1226
valore 2.0, quantidade 1073
valore 3.0, quantidade 986
valore 4.0, quantidade 1286
valore 5.0, quantidade 1374
valore 6.0, quantidade 1407


<h2>Util</h2>

In [5]:
from sklearn.metrics import *

def printValues(label, y_pred):
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, average='micro')
    pre = precision_score(y_test, y_pred, average='micro')
    
    print('\033[1m{}\033[0m:\nAccuracy: {}\nRecall: {}\nPrecision: {}\n'.format(label, acc, rec, pre))

<h2>Feature selection</h2>

<h3>F-test</h3>

In [6]:
from sklearn.feature_selection import *

F, pval = f_classif(X_train, y_train)
ft_columns = np.argwhere(pval <= 0.01).reshape(-1)

ft_train = X_train[:, ft_columns]
ft_test = X_test[:, ft_columns]
ft_train.shape, ft_test.shape

((7352, 555), (2947, 555))

<h3>Mutual Information</h3>

In [7]:
mi = mutual_info_classif(X_train, y_train)
mi_colulmns = np.argwhere(mi <= 0.05).reshape(-1)

mi_train = X_train[:, mi_colulmns]
mi_test = X_test[:, mi_colulmns]
mi_train.shape, mi_test.shape

((7352, 24), (2947, 24))

<h3>Variance threshold</h3>

In [8]:
vt = VarianceThreshold(threshold=0.1)
vt.fit(X_train)

vt_train = vt.transform(X_train)
vt_test = vt.transform(X_test)
vt_train.shape, vt_test.shape

((7352, 189), (2947, 189))

<h3>Naive Bayes</h3>

In [9]:
from sklearn.naive_bayes import GaussianNB

def runNaiveBayes(x_tr, x_te, label):
    gnb = GaussianNB()
    y_pred = gnb.fit(x_tr, y_train).predict(x_te)

    printValues(label, y_pred)

In [10]:
runNaiveBayes(X_train, X_test, 'Original')

runNaiveBayes(mi_train, mi_test, 'Mutual Information')

runNaiveBayes(ft_train, ft_test, 'F-test')

runNaiveBayes(vt_train, vt_test, 'Variance Threshold')

[1mOriginal[0m:
Accuracy: 0.770274855786
Recall: 0.770274855786
Precision: 0.770274855786

[1mMutual Information[0m:
Accuracy: 0.423142178487
Recall: 0.423142178487
Precision: 0.423142178487

[1mF-test[0m:
Accuracy: 0.770614183916
Recall: 0.770614183916
Precision: 0.770614183916

[1mVariance Threshold[0m:
Accuracy: 0.830335934849
Recall: 0.830335934849
Precision: 0.830335934849



<h3>SVC</h3>

In [11]:
from sklearn.svm import SVC

def runSVC(x_tr, x_te, label):
    svc = SVC(C=10., gamma=0.1)
    y_pred = svc.fit(x_tr, y_train).predict(x_te)

    printValues(label, y_pred)

In [12]:
runSVC(X_train, X_test, 'Original')

runSVC(mi_train, mi_test, 'Mutual Information')

runSVC(ft_train, ft_test, 'F-test')

runSVC(vt_train, vt_test, 'Variance Threshold')

[1mOriginal[0m:
Accuracy: 0.956226671191
Recall: 0.956226671191
Precision: 0.956226671191

[1mMutual Information[0m:
Accuracy: 0.617237869019
Recall: 0.617237869019
Precision: 0.617237869019

[1mF-test[0m:
Accuracy: 0.956905327452
Recall: 0.956905327452
Precision: 0.956905327452

[1mVariance Threshold[0m:
Accuracy: 0.954190702409
Recall: 0.954190702409
Precision: 0.954190702409



<h3>Árvore de decisão</h3>

In [13]:
from sklearn.tree import DecisionTreeClassifier

def runTree(x_tr, x_te, label):
    decision_tree = DecisionTreeClassifier(criterion='entropy')
    y_pred= decision_tree.fit(x_tr, y_train).predict(x_te)
    
    printValues(label, y_pred)

In [14]:
runTree(X_train, X_test, 'Original')

runTree(mi_train, mi_test, 'Mutual Information')

runTree(ft_train, ft_test, 'F-test')

runTree(vt_train, vt_test, 'Variance Threshold')

[1mOriginal[0m:
Accuracy: 0.851034950797
Recall: 0.851034950797
Precision: 0.851034950797

[1mMutual Information[0m:
Accuracy: 0.315235833051
Recall: 0.315235833051
Precision: 0.315235833051

[1mF-test[0m:
Accuracy: 0.862232779097
Recall: 0.862232779097
Precision: 0.862232779097

[1mVariance Threshold[0m:
Accuracy: 0.822192059722
Recall: 0.822192059722
Precision: 0.822192059722



<h3>MLP</h3>

In [15]:
from sklearn.neural_network import MLPClassifier

def runMlp(x_tr, x_te, label):
    mlp = MLPClassifier(hidden_layer_sizes=(100,),
                                            activation='logistic',
                                            max_iter=200,
                                            verbose=False,
                                            early_stopping=True)
    
    y_pred = mlp.fit(x_tr, y_train).predict(x_te)
    
    printValues(label, y_pred) 

In [16]:
runMlp(X_train, X_test, 'Original')

runMlp(mi_train, mi_test, 'Mutual Information')

runMlp(ft_train, ft_test, 'F-test')

runMlp(vt_train, vt_test, 'Variance Threshold')

[1mOriginal[0m:
Accuracy: 0.930437733288
Recall: 0.930437733288
Precision: 0.930437733288

[1mMutual Information[0m:
Accuracy: 0.523243976926
Recall: 0.523243976926
Precision: 0.523243976926

[1mF-test[0m:
Accuracy: 0.946046827282
Recall: 0.946046827282
Precision: 0.946046827282

[1mVariance Threshold[0m:
Accuracy: 0.930437733288
Recall: 0.930437733288
Precision: 0.930437733288

