In [3]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score,confusion_matrix,recall_score,accuracy_score
from sklearn.metrics import roc_curve, auc,precision_recall_curve, roc_curve
import matplotlib.pylab as plt
import matplotlib.patheffects as pe
import seaborn as sns
from tsne import *

import plotly.graph_objects as go

from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
Data = pd.read_excel('Data.xlsx')
Data = Data.set_index('Date')
Y = Data['Class']
X = Data.drop('Class', axis = 1)

In [5]:
T = tsne(X = X, perplexity=60)
X = pd.DataFrame(T)

Preprocessing the data using PCA...
Computing pairwise distances...
Computing P-values for point 0 of 4931...
Computing P-values for point 500 of 4931...
Computing P-values for point 1000 of 4931...
Computing P-values for point 1500 of 4931...
Computing P-values for point 2000 of 4931...
Computing P-values for point 2500 of 4931...
Computing P-values for point 3000 of 4931...
Computing P-values for point 3500 of 4931...
Computing P-values for point 4000 of 4931...
Computing P-values for point 4500 of 4931...
Mean value of sigma: 0.685436
Iteration 10: error is 22.609606
Iteration 20: error is 21.721687
Iteration 30: error is 18.802471
Iteration 40: error is 17.836693
Iteration 50: error is 17.386727
Iteration 60: error is 17.134444
Iteration 70: error is 17.079267
Iteration 80: error is 17.017483
Iteration 90: error is 16.910267
Iteration 100: error is 16.842879
Iteration 110: error is 2.491703
Iteration 120: error is 2.194294
Iteration 130: error is 2.012103
Iteration 140: error is 1.

### Parameters

In [8]:
step = 100
ts_prop = 0.1
va_prop = 0.2

Hyper_Parameters = [[i for i in range(50,500,50)], # n_estimators -> nro de arboles en el bosque
                    ['gini', 'entropy'],           # criterion -> criterio de separacion
                    [i for i in range(1,4)],       # max_depth
                    [i for i in range(1,10)]]      # min_samples_leaf
set_parameters = [i for i in it.product(*Hyper_Parameters)]

#### Error real y de entrenamiento deseados

In [9]:
Epsilon = 0.4
delta = 0.15

#### Garantia probable de aprendizaje ($\eta_{estimado}$)

In [10]:
n_est = int(np.ceil((1/Epsilon)*(np.log(len(set_parameters)) + np.log(1/delta))))

#### Division del conjunto: entrenamiento (70%), validacion (20%) y prueba (10%)

In [11]:
X1, X1_ts, Y1,Y1_ts = train_test_split(X, Y, train_size = 1-ts_prop, random_state=19)
X1_tr, X1_va, Y1_tr, Y1_va = train_test_split(X1, Y1, train_size = 1-va_prop, random_state=19)

#### Discretizacion para explorar el aumento de $\eta$

In [12]:
N = range(n_est, round(X1_tr.shape[0]*(1-va_prop)), step)

### Exploracion de modelos partiendo de $\eta_{estimado}$

In [13]:
n_est_mod_results = {
    'Model': [None for i in range(len(set_parameters))],
    'Accuracy': np.zeros((len(N),len(set_parameters))),
    'Precision': np.zeros((len(N),len(set_parameters))),
    'Recall': np.zeros((len(N),len(set_parameters)))
}

In [14]:
for j in range(len(set_parameters)):
    rfc = RandomForestClassifier(n_estimators = set_parameters[j][0],
                                 criterion = set_parameters[j][1],
                                 max_depth = set_parameters[j][2], 
                                 min_samples_leaf = set_parameters[j][3],
                                 random_state=19)
    for i in range(len(N)):
        np.random.seed(seed = 19)
        ind1 = np.random.randint(0, X1_tr.shape[0], N[i])
        rfc.fit(X1_tr.iloc[ind1,], Y1_tr[ind1])
        y_pred = rfc.predict(X1_va)
        n_est_mod_results['Accuracy'][i,j] = accuracy_score(Y1_va, y_pred)
        n_est_mod_results['Precision'][i,j] = precision_score(Y1_va, y_pred, average = 'macro')
        n_est_mod_results['Recall'][i,j] = recall_score(Y1_va, y_pred, average = 'macro')
    n_est_mod_results['Model'][j] = rfc
    
ind_bst = np.where(n_est_mod_results['Accuracy'] == np.amax(n_est_mod_results['Accuracy']))[1][0]
n_est_mod_results['Best'] = set_parameters[ind_bst]

#### Grid search

In [15]:
Hyper_Parameters = {
    'n_estimators': [i for i in range(50,500,50)],
    'criterion': ['gini', 'entropy'],
    'max_depth': [i for i in range(1,4)],
    'min_samples_leaf': [i for i in range(1,10)]
}
clf = GridSearchCV(RandomForestClassifier(), Hyper_Parameters, cv = 5)
clf.fit(X1_tr.values, Y1_tr)
clf.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 8,
 'n_estimators': 150}

In [19]:
fig1 = go.Figure(data=[go.Surface(z=n_est_mod_results['Accuracy'])])

fig1.update_layout(title='Accuracy Surface', autosize=True,
                   scene = dict(xaxis_title='Modelos',
                                yaxis_title='Tamaño de N',
                                zaxis_title='Accuracy'))

fig1.show()
fig1.write_html('Acc_Forest_Dim.html')

In [20]:
fig2 = go.Figure(data=[go.Surface(z=n_est_mod_results['Precision'])])

fig2.update_layout(title='Precision Surface', autosize=True,
                   scene = dict(xaxis_title='Modelos',
                                yaxis_title='Tamaño de N',
                                zaxis_title='Precision'))

fig2.show()
fig2.write_html('Pre_Forest_Dim.html')

In [21]:
fig3 = go.Figure(data=[go.Surface(z=n_est_mod_results['Recall'])])

fig3.update_layout(title='Recall Surface', autosize=True,
                   scene = dict(xaxis_title='Modelos',
                                yaxis_title='Tamaño de N',
                                zaxis_title='Recall'))

fig3.show()
fig3.write_html('Rec_Forest_Dim.html')