# Avaliação de algoritmos de Machine Learning

In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.stats import stats
import math
import statistics

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
df_credit = pd.read_csv('Bases de dados/credit_data.csv')
print(df_credit.shape)
df_credit.head()

(2000, 5)


Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [5]:
df_credit.dropna(inplace=True)
df_credit.shape

(1997, 5)

In [6]:
X = df_credit.iloc[:, 1:4].values
y = df_credit.iloc[:, 4].values
X.shape, y.shape

((1997, 3), (1997,))

In [7]:
resultados_naive_bayes = []
resultados_logistic = []
resultados_forest = []
for i in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=i)
    
    # Naive Bayes
    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    resultados_naive_bayes.append(accuracy_score(y_test, naive_bayes.predict(X_test)))
    
    # Logisti Regression
    logistic = LogisticRegression()
    logistic.fit(X_train, y_train)
    resultados_logistic.append(accuracy_score(y_test, logistic.predict(X_test)))
    
    # Random Forest
    forest = RandomForestClassifier()
    forest.fit(X_train, y_train)
    resultados_forest.append(accuracy_score(y_test, forest.predict(X_test)))

In [8]:
print(resultados_naive_bayes)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [9]:
print(resultados_logistic)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [10]:
print(resultados_forest)

[0.9775, 0.99, 0.9825, 0.99, 0.98, 0.9875, 0.99, 0.9825, 0.98, 0.9825, 0.9875, 0.9825, 0.9725, 0.985, 0.98, 0.9725, 0.9875, 0.9825, 0.99, 0.9825, 0.9825, 0.975, 0.98, 0.9825, 0.985, 0.995, 0.995, 0.9825, 0.9925, 0.9875]


## Média

In [11]:
resultados_naive_bayes = np.array(resultados_naive_bayes)
resultados_logistic = np.array(resultados_logistic)
resultados_forest = np.array(resultados_forest)

In [12]:
print(f'Naive Bayes: {resultados_naive_bayes.mean()}')
print(f'Logistic Regression: {resultados_logistic.mean()}')
print(f'Random Forest:{resultados_forest.mean()}')

Naive Bayes: 0.92425
Logistic Regression: 0.9145
Random Forest:0.9840833333333335


## Moda

In [13]:
print(f'Naive Bayes: {statistics.mode(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.mode(resultados_logistic)}')
print(f'Random Forest:{statistics.mode(resultados_forest)}')

Naive Bayes: 0.925
Logistic Regression: 0.91
Random Forest:0.9825


## Mediana

In [14]:
print(f'Naive Bayes: {np.median(resultados_naive_bayes)}')
print(f'Logistic Regression: {np.median(resultados_logistic)}')
print(f'Random Forest:{np.median(resultados_forest)}')

Naive Bayes: 0.925
Logistic Regression: 0.9125
Random Forest:0.9825


## Variância

In [15]:
print(f'Naive Bayes: {statistics.variance(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.variance(resultados_logistic)}')
print(f'Random Forest:{statistics.variance(resultados_forest)}')

Naive Bayes: 9.058189655172414e-05
Logistic Regression: 0.00021655172413793106
Random Forest:3.382902298850572e-05


## Desvio Padrão

In [16]:
print(f'Naive Bayes: {statistics.stdev(resultados_naive_bayes)}')
print(f'Logistic Regression: {statistics.stdev(resultados_logistic)}')
print(f'Random Forest:{statistics.stdev(resultados_forest)}')

Naive Bayes: 0.009517452209059111
Logistic Regression: 0.014715696522350923
Random Forest:0.005816272258801656


## Coeficiente de variação

In [17]:
print(f'Naive Bayes: {stats.variation(resultados_naive_bayes)*100} %')
print(f'Logistic Regression: {stats.variation(resultados_logistic)*100} %')
print(f'Random Forest:{stats.variation(resultados_forest)*100} %')

Naive Bayes: 1.0124407504578252 %
Logistic Regression: 1.5821056616884057 %
Random Forest:0.5811004645371411 %


## EXERCICIO - Validação cruzada

In [18]:
resultados_naive_bayes_cv = []
resultados_logistic_cv = []
resultados_forest_cv = []
for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)
    
    # Naive Bayes
    naive_bayes = GaussianNB()
    scores = cross_val_score(naive_bayes, X, y, cv=kfold)
    resultados_naive_bayes_cv.append(scores.mean())
    
    # Logisti Regression
    logistic = LogisticRegression()
    scores = cross_val_score(logistic, X, y, cv=kfold)
    resultados_logistic_cv.append(scores.mean())
    
    # Random Forest
    forest = RandomForestClassifier()
    scores = cross_val_score(forest, X, y, cv=kfold)
    resultados_forest_cv.append(scores.mean())

## Média

In [19]:
resultados_naive_bayes_cv = np.array(resultados_naive_bayes_cv)
resultados_logistic_cv = np.array(resultados_logistic_cv)
resultados_forest_cv = np.array(resultados_forest_cv)

In [20]:
print(f'Naive Bayes: {resultados_naive_bayes_cv.mean()}')
print(f'Logistic Regression: {resultados_logistic_cv.mean()}')
print(f'Random Forest:{resultados_forest_cv.mean()}')

Naive Bayes: 0.9249359296482411
Logistic Regression: 0.9137817420435511
Random Forest:0.9867469011725292


## Moda

In [21]:
print(f'Naive Bayes: {statistics.mode(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.mode(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.mode(resultados_forest_cv)}')

Naive Bayes: 0.9248894472361808
Logistic Regression: 0.9113718592964826
Random Forest:0.9874849246231155


## Mediana

In [22]:
print(f'Naive Bayes: {np.median(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {np.median(resultados_logistic_cv)}')
print(f'Random Forest:{np.median(resultados_forest_cv)}')

Naive Bayes: 0.9248919597989949
Logistic Regression: 0.9130979899497487
Random Forest:0.9869786432160805


## Variância

In [23]:
print(f'Naive Bayes: {statistics.variance(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.variance(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.variance(resultados_forest_cv)}')

Naive Bayes: 6.608178433320306e-07
Logistic Regression: 1.3004529114700778e-05
Random Forest:1.876804719026264e-06


## Desvio Padrão

In [24]:
print(f'Naive Bayes: {statistics.stdev(resultados_naive_bayes_cv)}')
print(f'Logistic Regression: {statistics.stdev(resultados_logistic_cv)}')
print(f'Random Forest:{statistics.stdev(resultados_forest_cv)}')

Naive Bayes: 0.0008129070324040939
Logistic Regression: 0.003606179295972509
Random Forest:0.0013699652254806557


## Coeficiente de variação

In [25]:
print(f'Naive Bayes: {stats.variation(resultados_naive_bayes_cv)*100} %')
print(f'Logistic Regression: {stats.variation(resultados_logistic_cv)*100} %')
print(f'Random Forest:{stats.variation(resultados_forest_cv)*100} %')

Naive Bayes: 0.08641071566366061 %
Logistic Regression: 0.38801026116292653 %
Random Forest:0.1365029835219334 %


## Seleção de atributos utilizando variância

In [30]:
base_selecao = {'a': np.random.rand(20), 
                'b': np.array([0.5] * 20), 
                'classe': np.random.randint(0, 2, size=20)}
base_selecao = pd.DataFrame(base_selecao)
base_selecao.head()

Unnamed: 0,a,b,classe
0,0.680176,0.5,0
1,0.908085,0.5,1
2,0.800027,0.5,0
3,0.72705,0.5,1
4,0.769841,0.5,0


In [31]:
base_selecao.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
a,20.0,0.590994,0.233039,0.103769,0.435988,0.635407,0.773248,0.908085
b,20.0,0.5,0.0,0.5,0.5,0.5,0.5,0.5
classe,20.0,0.45,0.510418,0.0,0.0,0.0,1.0,1.0


In [32]:
np.var(base_selecao['a']), np.var(base_selecao['b'])

(0.05159174818668065, 0.0)

In [35]:
math.sqrt(np.var(base_selecao['a'])), math.sqrt(np.var(base_selecao['b']))

(0.22713816981450002, 0.0)

In [36]:
X = base_selecao.iloc[:, 0:2].values
y = base_selecao.iloc[:, 2].values
X.shape, y.shape

((20, 2), (20,))