# Parte 1 - Classificação binária

In [None]:
# Enable the autoreload extension in mode 2 so that imported project modules
# are re-imported before each cell execution (edits to .py files take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

# Show the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pdb;

In [None]:
from src.efc2 import csv_reader
#%pycat efc2/csv_reader.py

In [None]:
# Load the raw dataset through the project's CSV helper.
data = csv_reader.load_csv()

# Drop the "Unnamed: 0" column (presumably an index that was serialized into
# the CSV). The keyword form is required: passing `axis` positionally to
# DataFrame.drop was deprecated in pandas 1.3 and removed in pandas 2.0.
data = data.drop(columns="Unnamed: 0")

# Preview a few rows instead of rendering the entire frame.
data.head()

## Item a

In [None]:
# Histogram of every numeric attribute, laid out on a two-column grid.
# DataFrame.hist only plots numeric columns and requires the number of axes
# it receives to match the number of plotted columns exactly, so the grid is
# sized from the numeric columns and only the needed axes are handed over.
columns = list(data.select_dtypes(include="number").columns)

grid_cols = 2
plot_rows = -(-len(columns) // grid_cols)  # ceiling division

fig1, axs = plt.subplots(
    plot_rows, grid_cols, constrained_layout=True, figsize=(30, 30), squeeze=False
)
flat_axes = axs.ravel()

data.hist(column=columns, ax=flat_axes[:len(columns)]);

# With an odd number of columns the last grid cell is unused: hide it.
for unused_ax in flat_axes[len(columns):]:
    unused_ax.set_visible(False)

In [None]:
# Compute the correlation matrix once and derive the tick positions/labels
# from it, so the labels always describe the rows/columns actually drawn
# (DataFrame.corr may return fewer columns than `data` has).
corr = data.corr()

f = plt.figure(figsize=(30, 30))
plt.matshow(corr, fignum=f.number)
plt.xticks(range(corr.shape[1]), corr.columns, fontsize=14, rotation=45)
plt.yticks(range(corr.shape[0]), corr.index, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

## Item b

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import numpy as np

# Hold out 20% of the rows for testing. A fixed seed makes the split (and
# every metric computed downstream) reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
def gradient_descent(matrix_y, matrix_ye, matrix_phi):
    """Return the gradient of the mean cross-entropy cost w.r.t. the weights.

    Args:
        matrix_y: target values, one row per sample.
        matrix_ye: estimated values, same shape as ``matrix_y``.
        matrix_phi: design matrix, one row per sample.

    Returns:
        ``-(matrix_y - matrix_ye).T @ matrix_phi`` divided by the number of
        samples (a row vector when the targets are a column vector).
    """
    n_samples = len(matrix_y)
    residual = matrix_y - matrix_ye
    weighted_sum = residual.T.dot(matrix_phi)
    return -weighted_sum / n_samples

In [None]:
def phi(x):
    """Build the design matrix by prepending a bias column of ones to ``x``.

    Args:
        x: 2-D array with one row per sample.

    Returns:
        Array with one more column than ``x``; the first column is all ones.
    """
    n_rows = x.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, x), axis=1)

In [None]:
# Cost function
def j_cross_entropy(targets, predictions, epsilon=1e-12):
    """Mean binary cross-entropy between ``targets`` and ``predictions``.

    Args:
        targets: array of labels.
        predictions: array of estimated probabilities; values are clipped to
            ``[epsilon, 1 - epsilon]`` before the logarithms are taken.
        epsilon: clipping margin; must be strictly positive.

    Returns:
        Sum of the positive-class and negative-class terms, each divided by
        the number of rows in ``predictions``.

    Raises:
        AssertionError: if ``epsilon`` is not greater than zero.
    """
    # Clipping only keeps log() finite when epsilon is strictly positive.
    assert epsilon > 0

    clipped = np.clip(predictions, epsilon, 1. - epsilon)
    n_samples = clipped.shape[0]

    # Contribution of the samples labelled 1 and of those labelled 0.
    positive_term = np.sum(targets * np.log(clipped)) / n_samples
    negative_term = np.sum((1 - targets) * np.log(1 - clipped)) / n_samples

    return -positive_term - negative_term

### Fase de Treinamento

In [None]:
# TODO: check whether the sigmoid could be saturating.
# Standardize the attributes and prepend the bias column. `columns=` is used
# because the positional `axis` argument of DataFrame.drop was removed in
# pandas 2.0.
matrix_phi = phi(preprocessing.scale(train.drop(columns="label").values))

# Column vector (N x 1) with the training labels.
matrix_y = train["label"].values.reshape(-1, 1)

matrix_phi

In [None]:
# Learning rate of the gradient-descent update.
alpha = 0.1

# Weight column vector: one entry per column of the design matrix (bias plus
# attributes). Drawing it from a seeded generator makes the training run
# reproducible; sizing it from matrix_phi avoids re-deriving the attribute
# count from the DataFrame.
matrix_w = np.random.RandomState(42).rand(matrix_phi.shape[1], 1)

iterations = 1000
matrix_cost = np.zeros((iterations,))

for i in range(iterations):
    # Forward pass: logistic (sigmoid) of the linear combination.
    z = matrix_phi.dot(matrix_w)
    matrix_ye = 1 / (1 + np.exp(-z))

    # Record the cost of the weights that produced matrix_ye, then take one
    # gradient step (the gradient is a row vector, hence the transpose).
    matrix_cost[i] = j_cross_entropy(matrix_y, matrix_ye)
    matrix_w = matrix_w - alpha * gradient_descent(matrix_y, matrix_ye, matrix_phi).T

# Learning curve: cross-entropy per iteration.
df_cost = pd.DataFrame(matrix_cost, columns=['cost'])
df_cost.plot()

### Fase de Teste


In [None]:
# TODO: check whether the sigmoid could be saturating.
# Standardize the test attributes with the mean/std of the TRAINING set.
# Scaling the test set with its own statistics would feed the model features
# in a different space from the one the weights were learned in.
# StandardScaler uses the same statistics as preprocessing.scale.
feature_scaler = preprocessing.StandardScaler().fit(train.drop(columns="label").values)
matrix_phi_test = phi(feature_scaler.transform(test.drop(columns="label").values))

# Column vector (N x 1) with the test labels.
matrix_y_test = test["label"].values.reshape(-1, 1)

# Estimated probabilities for every test sample using the trained weights.
z = matrix_phi_test.dot(matrix_w)
matrix_ye_test = 1 / (1 + np.exp(-z))

In [None]:
# Confusion-matrix counts for a sweep of decision thresholds.
# The counts are computed with vectorized comparisons and collected as plain
# records; the DataFrame is built once at the end (DataFrame.append was
# removed in pandas 2.0 and growing a frame row by row is quadratic).
is_positive = (matrix_y_test == 1)

confusion_records = []
for threshold in np.linspace(0, 1, num=21):
    # Decision: predict 1 when the estimate exceeds the threshold, else 0.
    matrix_ye_test_decided = (matrix_ye_test > threshold).astype(int)
    is_correct = (matrix_y_test == matrix_ye_test_decided)

    confusion_records.append({
        'threshold': threshold,
        # correct decision, label is 1
        'tp': int(np.sum(is_correct & is_positive)),
        # wrong decision, label is not 1 (decided 1, label 0)
        'fp': int(np.sum(~is_correct & ~is_positive)),
        # correct decision, label is not 1
        'tn': int(np.sum(is_correct & ~is_positive)),
        # wrong decision, label is 1 (decided 0, label 1)
        'fn': int(np.sum(~is_correct & is_positive)),
    })

matrix_confusion_df = pd.DataFrame(
    confusion_records, columns=['threshold', 'tp', 'fp', 'tn', 'fn']
)

matrix_confusion_df

In [None]:
# ROC curve
# x - false positive rate: fp / (tn + fp) = fp / N-
# y - recall (sensitivity, true positive rate): tp / (tp + fn)
# Cast the counts to float so an empty class yields NaN (point skipped)
# instead of an integer division error on object-dtype columns.
roc_counts = matrix_confusion_df[['tp', 'fp', 'tn', 'fn']].astype(float)
x = roc_counts['fp'] / (roc_counts['tn'] + roc_counts['fp'])
y = roc_counts['tp'] / (roc_counts['tp'] + roc_counts['fn'])

fig_roc, ax_roc = plt.subplots()
roc_line = ax_roc.plot(x, y, marker='o', label='classifier')
# Diagonal = performance of a random classifier, for reference.
chance_line = ax_roc.plot([0, 1], [0, 1], linestyle='--', color='gray', label='chance')
roc_legend = ax_roc.legend()
ax_roc.set(xlabel='False positive rate', ylabel='True positive rate (recall)', title='ROC curve');
    