In [None]:
import os
import pandas as pd
import numpy as np

from IPython.display import Image
from subprocess import call

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the input CSV into the notebook-wide `dataset` frame and preview
# its first rows as the cell output.
DATASET_PATH = 'dataset_final_treat.csv'
dataset = pd.read_csv(DATASET_PATH)
dataset.head()

In [None]:
# Names of columns to remove from `dataset` before modelling.
# An empty list leaves the frame untouched.
columns_drop = []
if columns_drop:
    dataset = dataset.drop(columns=columns_drop)

In [None]:
if 'stimul' in dataset.columns:
    # Show each distinct stimulus label once (first occurrence, original index).
    print(dataset['stimul'].drop_duplicates())

    # Integer code assigned to each stimulus label.
    new_stimuls = {'GREEN': 0, 'WHITE': 1, 'RED': 2, 'BLUE': 3}

    # Keep failing loudly on labels without a code (including missing values),
    # exactly as the per-row dictionary lookup did.
    unknown_stimuls = set(dataset['stimul'].unique()) - set(new_stimuls)
    if unknown_stimuls:
        raise KeyError("Unmapped 'stimul' values: %r"
                       % (sorted(unknown_stimuls, key=str),))

    # Replace the whole column in one vectorised step. Chained assignment
    # (dataset['stimul'][index] = ...) inside iterrows() is slow, raises
    # SettingWithCopyWarning and does not write back under pandas
    # Copy-on-Write; .map also yields an integer column instead of an
    # object column holding Python ints.
    dataset['stimul'] = dataset['stimul'].map(new_stimuls)

In [None]:
if 'classify' in dataset.columns:
    # Show each distinct class label once (first occurrence, original index).
    print(dataset['classify'].drop_duplicates())

    # Integer code assigned to each class label.
    new_classify = {'Alterado': 0, 'Atermo': 1}

    # Keep failing loudly on labels without a code (including missing values),
    # exactly as the per-row dictionary lookup did.
    unknown_classes = set(dataset['classify'].unique()) - set(new_classify)
    if unknown_classes:
        raise KeyError("Unmapped 'classify' values: %r"
                       % (sorted(unknown_classes, key=str),))

    # Replace the whole column in one vectorised step. Chained assignment
    # (dataset['classify'][index] = ...) inside iterrows() is slow, raises
    # SettingWithCopyWarning and does not write back under pandas
    # Copy-on-Write; .map also yields an integer column instead of an
    # object column holding Python ints.
    dataset['classify'] = dataset['classify'].map(new_classify)

In [None]:
def normalize_column(column_name):
    """Min-max scale one column of the global ``dataset`` in place.

    The column is fitted and transformed with a default
    ``preprocessing.MinMaxScaler`` and written back under the same name,
    so it keeps its position among the other columns.

    Parameters
    ----------
    column_name : str
        Name of the column of ``dataset`` to rescale.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``dataset``.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    # The scaler expects 2-D input, hence the double brackets.
    x_scaled = min_max_scaler.fit_transform(dataset[[column_name]])
    # Assign the raw array rather than inserting a freshly indexed frame:
    # pandas aligns frames/series on index, which would produce NaN whenever
    # `dataset` does not carry the default 0..n-1 index.
    dataset[column_name] = x_scaled[:, 0]

In [None]:
# normalize_column('3_seconds_before')
# normalize_column('size_instantly_before_stimul')
# normalize_column('size_instantly_after_stimul')
# normalize_column('3_seconds_after')
# normalize_column('5_seconds_after')
# normalize_column('6_seconds_after')
# normalize_column('10_seconds_after')
# normalize_column('min_value1')
# normalize_column('min_value2')
# normalize_column('min_value3')
# normalize_column('max_value1')
# normalize_column('max_value2')
# normalize_column('max_value3')

In [None]:
# Preview the encoded frame. A bare last expression uses the notebook's rich
# table rendering, which print() bypasses.
dataset.head()

In [None]:
# Feature matrix: every column of `dataset` except the target.
X = dataset.drop(columns='classify')
# Target vector.
y = dataset['classify']

# Missing values are replaced with 0 in both features and target.
# (Mean imputation, X.fillna(X.mean()), was the alternative tried here.)
# NOTE(review): for the target this turns a missing label into class 0,
# the code assigned to 'Alterado' -- confirm this is intended rather than
# dropping those rows.
X = X.fillna(0)
y = y.fillna(0)

# Summarise instead of dumping both objects as plain text.
print("X shape:", X.shape, "| y shape:", y.shape)
X.head()

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# (and every score derived from it) reproducible across runs; stratify=y keeps
# the class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Random forest of 100 trees. A fixed random_state makes training, and
# therefore the reported scores and feature importances, reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predicted class for every held-out row.
y_pred = clf.predict(X_test)

In [None]:
# Accuracy of the held-out predictions against the true test labels.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

In [None]:
# Pair each importance with the column it was computed for. The names come
# from X_train, the frame the model was fitted on, so they line up with
# clf.feature_importances_ without copying the whole dataset again.
feature_imp = pd.Series(
    clf.feature_importances_, index=list(X_train.columns)
).sort_values(ascending=False)
feature_imp

In [None]:
# Horizontal bar chart of the importances, most important feature first.
fig, ax = plt.subplots()
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)

ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
# No legend: the bars carry no labels, so plt.legend() only emitted a
# "no artists with labels" warning and drew nothing.
plt.show()

In [None]:
# 5-fold cross-validation of `clf` on the full X, y with the default scorer;
# report the mean and twice the standard deviation of the fold scores.
scores = cross_val_score(clf, X, y, cv=5)
mean_score = scores.mean()
double_std = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, double_std))

In [None]:
# 10-fold cross-validation of `clf` on the full X, y scored with macro F1;
# report the mean and twice the standard deviation of the fold scores.
scores = cross_val_score(clf, X, y, cv=10, scoring='f1_macro')
mean_score = scores.mean()
double_std = scores.std() * 2
print("f1_macro: %0.2f (+/- %0.2f)" % (mean_score, double_std))

In [None]:
# Set to True to export the first tree of the forest as an image.
# Requires the Graphviz `dot` executable on PATH.
EXPORT_TREE = False

if EXPORT_TREE:
    from IPython.display import display

    # First fitted tree of the forest.
    estimator = clf.estimators_[0]

    # Feature names: every dataset column except the target.
    tree_feature_names = [col for col in dataset.columns if col != 'classify']

    # Display names for classes 0 and 1 ('Alterado' and 'Atermo' codes).
    target = ['Altered', 'Aterm']

    export_graphviz(estimator, out_file='tree.dot', feature_names=tree_feature_names,
                    class_names=target, rounded=True, special_characters=True,
                    proportion=False, precision=2, filled=True)

    # Convert to png using system command (requires Graphviz). The exit status
    # is checked so a failed conversion is not mistaken for a stale tree.png.
    status = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
    if status != 0:
        raise RuntimeError("Graphviz 'dot' exited with status %d" % status)

    # Display in jupyter notebook. An expression inside an `if` body is not
    # the cell's output value, so the image must be displayed explicitly.
    display(Image(filename='tree.png'))