<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preparação" data-toc-modified-id="Preparação-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preparação</a></span><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#Funções" data-toc-modified-id="Funções-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Funções</a></span></li><li><span><a href="#Criando-dados-e-aplicando-modelo" data-toc-modified-id="Criando-dados-e-aplicando-modelo-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Criando dados e aplicando modelo</a></span></li></ul></li><li><span><a href="#Análises-Gráficas" data-toc-modified-id="Análises-Gráficas-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Análises Gráficas</a></span><ul class="toc-item"><li><span><a href="#Precisão-por-faixa---Barras-empilhadas" data-toc-modified-id="Precisão-por-faixa---Barras-empilhadas-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Precisão por faixa - Barras empilhadas</a></span></li><li><span><a href="#Métricas-com-variação-de-ponto-de-corte-ou-quantis" data-toc-modified-id="Métricas-com-variação-de-ponto-de-corte-ou-quantis-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Métricas com variação de ponto de corte ou quantis</a></span><ul class="toc-item"><li><span><a href="#Gráfico-com-Precisão-e-volumes-respectivos-analisados" data-toc-modified-id="Gráfico-com-Precisão-e-volumes-respectivos-analisados-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Gráfico com Precisão e volumes respectivos analisados</a></span></li><li><span><a href="#Gráfico-único-com-Precisão,-Recall,-F1" data-toc-modified-id="Gráfico-único-com-Precisão,-Recall,-F1-2.2.2"><span class="toc-item-num">2.2.2&nbsp;&nbsp;</span>Gráfico único com Precisão, Recall, F1</a></span></li></ul></li></ul></li></ul></div>

# Preparação

## Libraries

In [1]:
import pandas as pd
import numpy as np
from numpy.random import normal
from random import seed, choices
from itertools import product
from tqdm import tqdm_notebook
from datetime import datetime, timedelta
from sklearn.metrics import make_scorer, f1_score, fbeta_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from textwrap import wrap

# Importando funções para plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import plot
from plotly.subplots import make_subplots
import sys
import os
import warnings
if not sys.warnoptions:
    warnings.simplefilter('ignore')
    os.environ['PYTHONWARNINGS'] = 'ignore'

################################################
# Pasta raiz, onde está esse arquivo
#path = os.path.abspath(os.getcwd())

# Setando caminhos
#pathout = os.path.join(path, 'Graficos')
################################################

# Importando pacotes


In [2]:
#dados1 = pd.read_csv('teste1.csv',sep=';',decimal=',')
#dados2 = pd.read_csv('teste2.csv',sep=';',decimal=',')
#print(dados1.head(4))
#print(dados2.head(4))

## Funções

In [3]:
def probability_groups (n_groups, train_probs, y_train, test_probs, y_test):
    '''
    Bucket model scores into quantile groups and tabulate the class mix per group.

    The group edges are the quantiles of the training scores. Before the test
    scores are bucketed, the first edge is replaced by 0 and the last by 1.
    Both sets of edges are printed.

    n_groups: number of groups to split train/test data
    train_probs: train data probability given by models
    y_train: train data real class
    test_probs: test data probability given by models
    y_test: test data real class

    Returns a tuple (train_table, test_table, train_group, test_group):
      - train_table / test_table: one row per group with the columns
        intervalos, N_0, N_1, total, P_1, P_0, P_total. In train_table
        "intervalos" holds the qcut interval; in test_table it holds the
        bucket code plus one.
      - train_group: copy of the qcut result for the training scores.
      - test_group: integer bucket code of each test score (labels=False).
    '''
    def _class_mix(groups, y_true):
        # The two crosstab columns are renamed positionally, so this assumes
        # y_true has exactly two classes, sorted as class 0 then class 1.
        mix = pd.DataFrame(pd.crosstab(groups, y_true))
        mix.columns = ['N_0', 'N_1']
        mix['total'] = mix['N_0'] + mix['N_1']
        for share_col, count_col in (('P_1', 'N_1'), ('P_0', 'N_0')):
            mix[share_col] = np.round(mix[count_col] / mix['total'], 2)
        mix['P_total'] = np.round(mix['total'] / mix['total'].sum(), 2)
        return mix

    quantile_groups, edges = pd.qcut(train_probs, n_groups, retbins = True)

    # Widen the outer edges so test scores outside the training range still get a bucket.
    widened_edges = edges.copy()
    widened_edges[0], widened_edges[n_groups] = 0, 1

    train_table = _class_mix(quantile_groups, y_train)
    train_group = quantile_groups.copy()

    test_group = pd.cut(test_probs, bins=widened_edges, labels=False, include_lowest=True)
    test_table = _class_mix(test_group, y_test)

    print ('original bins: {}' .format(edges))
    print ('modified bins: {}' .format(widened_edges))

    # Move the group identifier out of the index into a column called "intervalos".
    train_table = train_table.reset_index()
    train_table = train_table.rename(columns={train_table.columns[0]: "intervalos"})

    test_table = test_table.reset_index()
    # bucket codes start at 0; shift them so the first group is numbered 1
    test_table.iloc[:, 0] = test_table.iloc[:, 0] + 1
    test_table = test_table.rename(columns={test_table.columns[0]: "intervalos"})

    return (train_table, test_table, train_group, test_group)


def table_percentis_recall_precision (data, name_prob1 , name_true, quantiles_or_thresholds, type_info = 'threshold'):
    '''
    Build a table of classification metrics, one row per cut-off.

    Rows are ranked by score (highest first); for each cut-off the top
    `tam1` rows are predicted as class 1 and the rest as class 0, and
    recall, precision, F1 and accuracy are computed against the real class.
    The input frame is not modified.

    data: dataframe with predicted probabilities and real classes;
    name_prob1: column with the probability given by the model;
    name_true: column with the real class;
    quantiles_or_thresholds: list with values used to split the scores;
    type_info: 'quantiles' treats each value as the fraction of top-ranked
        rows predicted as 1; any other value treats it as a score
        threshold (rows with score >= value are predicted as 1).

    Returns a dataframe indexed by '0', '1', ... with the columns
    percentil, quantidade_preditos_faixa, quantidade_real_faixa,
    ponto_de_corte, recall, precision, f1, accuracy and corte_perc.
    The four metrics are percentages formatted as strings with one decimal.
    In quantile mode ponto_de_corte is the lowest score among the rows
    predicted as 1 (the top score when no row is selected).
    '''
    metric_cols = ['percentil', 'quantidade_preditos_faixa', 'quantidade_real_faixa', 'ponto_de_corte',
                   'recall', 'precision', 'f1', 'accuracy']

    # Rank on a copy so the caller's frame keeps its row order and index.
    ranked = data.sort_values(by = name_prob1, ascending = False).reset_index(drop = True)
    n_rows = ranked.shape[0]
    y_true = ranked[name_true]
    scores = ranked[name_prob1]
    by_quantile = type_info == 'quantiles'

    rows = []
    for cut in quantiles_or_thresholds:
        if by_quantile:
            tam1 = int(round(n_rows * cut))
            percentil = cut * 100
            # Scores are sorted descending, so the last selected row (position tam1 - 1)
            # carries the lowest selected score.
            ponto_de_corte = scores.iloc[max(tam1 - 1, 0)]
        else:
            tam1 = int((scores >= cut).sum())
            percentil = round(tam1 / n_rows * 100)
            ponto_de_corte = cut

        # top tam1 ranked rows are predicted as 1, the remaining ones as 0
        pred = np.concatenate((np.repeat(1, tam1), np.repeat(0, n_rows - tam1)), axis = 0)

        recall = recall_score(y_true, pred)*100
        precision = precision_score(y_true, pred)*100
        f1 = f1_score(y_true, pred)*100
        accuracy = accuracy_score(y_true, pred)*100

        rows.append({
            'percentil' : percentil,
            'quantidade_preditos_faixa' : tam1,
            # true positives recovered from precision * number of predicted positives
            'quantidade_real_faixa' : (precision/100)*tam1 ,
            'ponto_de_corte' : ponto_de_corte,
            'recall' : "%.1f" % recall,
            'precision': "%.1f" % precision,
            'f1': "%.1f" % f1,
            'accuracy' : "%.1f" % accuracy
        })

    # Build the frame once instead of appending row by row.
    results = pd.DataFrame(rows, columns = metric_cols, index = [str(i) for i in range(len(rows))])
    results = results.round(2)

    # x-axis label used by the plotting helpers: "<cut-off> - <percentile>%"
    results['corte_perc'] = [str(ponto) + ' - ' + str(perc) + '%'
                             for ponto, perc in zip(results['ponto_de_corte'], results['percentil'])]
    return results

## Criando dados e aplicando modelo

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Categorical feature: three groups of different sizes
uf = ['a'] * 400 + ['b'] * 800 + ['c'] * 300

# Numeric feature.
# Every random draw in this cell comes from NumPy's global generator, so NumPy is what
# has to be seeded for the data to be reproducible (random.seed does not affect np.random).
np.random.seed(1)
renda = np.random.uniform(low = 1, high = 10, size = len(uf))

# Put both features in a dataframe
df = pd.DataFrame({'uf': uf, 'renda': renda})

# Latent response: group effect minus an income effect, plus unit-variance noise
np.random.seed(80)
df['y'] = 3 + np.where(df['uf'] == 'a', 10, np.where(df['uf'] == 'b', 5, 0)) - 1.5 * df['renda']
df['y'] = df['y'] + np.random.normal(loc = 0, scale = 1, size = len(renda))
# Add noise to the observed income after the response has been generated from the clean value
df['renda'] = df['renda'] + np.random.normal(loc = 0, scale = 4, size = len(renda))

# Model matrix: noisy income plus dummies for groups 'a' and 'b'.
# assign() returns a new frame, so nothing is written onto a slice of df.
X = df[['renda']].assign(
    a = np.where(df['uf'] == 'a', 1, 0),
    b = np.where(df['uf'] == 'b', 1, 0),
)

# Binary target: logistic transform of the latent response, rounded to 0/1
df['y'] = ( 1 / (1 + np.exp(df['y'])) ).round(0)

# Split into train (80%), validation (10%) and out of time (10%)
Xtr, Xte, ytr, yte = train_test_split(X, df['y'], stratify = df['y'], test_size = 0.2, random_state = 42)
Xva, Xou, yva, you = train_test_split(Xte, yte, stratify = yte, test_size = 0.5, random_state = 42)

# Train the model
fit = LogisticRegression(random_state = 0).fit(Xtr, ytr)

# One table per split with real class, predicted class, class-1 probability and split label,
# indexed like the original rows so they can be merged back onto df
scored_splits = []
for tipo, features, target in (('Treino', Xtr, ytr), ('Validacao', Xva, yva), ('OutOfTime', Xou, you)):
    scored_splits.append(pd.DataFrame({
        'y': target,
        'ypred': fit.predict(features),
        'yprob': fit.predict_proba(features)[:, 1],
        'tipo': tipo,
    }, index = features.index))

# Stack the splits and merge with the initial data (brings back 'uf' and 'renda')
fim = pd.concat(scored_splits)

df = df.rename(columns = {'y': 'yold'})
fim = fim.merge(df, how = 'left', left_index = True, right_index = True)
# Sanity check: the target carried through the splits matches the original one
print( (fim['y'] == fim['yold']).value_counts() )
fim = fim.drop(columns = ['yold'])

tr = fim[fim['tipo'] == 'Treino'].reset_index(drop = True)
va = fim[fim['tipo'] == 'Validacao'].reset_index(drop = True)
ou = fim[fim['tipo'] == 'OutOfTime'].reset_index(drop = True)

# Give each split a daily date index starting now (same start for all three)
start = datetime.today()
tr.index = pd.date_range(start, periods = tr.shape[0]).tolist()
va.index = pd.date_range(start, periods = va.shape[0]).tolist()
ou.index = pd.date_range(start, periods = ou.shape[0]).tolist()

True    1500
dtype: int64


# Análises Gráficas

## Precisão por faixa - Barras empilhadas

In [12]:
def stacked_barplot_probs (df,
                           p_geral,
                           name_axis_x = 'intervalos', 
                           name_cols_plot = ['P_1', 'P_0'], 
                           name_cols_N = ['total', 'P_total'],
                           colors = ['rgb(229, 134, 6)', 'rgb(93, 105, 177)'],
                           nomes_legenda = ['% Classe 1', '% Classe 0'],
                           title_text = 'Título'
                           ):
    '''
    Stacked bar chart of class proportions per group, with group volumes written above the bars.

    df: table with one row per group (e.g. a table returned by probability_groups);
    p_geral: overall proportion of class 1 (between 0 and 1), shown as an annotation
        at that height on the bar axis;
    name_axis_x: column with the group labels used on the x axis;
    name_cols_plot: columns with the proportions to stack, one bar series per column;
    name_cols_N: two columns written as text above the bars - the first is formatted
        as a count, the second as a percentage;
    colors: bar colour for each column in name_cols_plot;
    nomes_legenda: legend name for each column in name_cols_plot;
    title_text: figure title.

    Returns the plotly figure.
    '''
    x_labels = [str(j) for j in df[name_axis_x].values.tolist()]

    fig = go.Figure()
    # one stacked bar series per class column chosen in name_cols_plot
    for i in range(len(name_cols_plot)):
        fig.add_trace(
            go.Bar(x=x_labels,
            name = nomes_legenda[i],
            y=df[name_cols_plot[i]],
            text = ['{:,.0f}%'.format(i2*100) for i2 in df[name_cols_plot[i]].values.tolist()],
            textposition='inside',
            textfont_size=16,
            marker=dict(
                color=colors[i],
                line=dict(color='rgb(248, 248, 249)'),
            ))
        )

    # Volume labels above the bars. Each label series is a white line on its own overlaid
    # y axis; where the text lands depends only on that axis' range, set in the layout below.
    volume_labels = (
        (name_cols_N[0], '{:,.0f}', 1, 'y2'),      # absolute count
        (name_cols_N[1], ' {:,.0f}%', 100, 'y3'),  # share of the total, as a percentage
    )
    for col, text_format, scale, axis_name in volume_labels:
        fig.add_trace(go.Scatter(x=x_labels,
                                 y=df[col],
                                 text=[text_format.format(i2*scale) for i2 in df[col].values.tolist()],
                                 textposition='top center',
                                 mode='lines+text',
                                 marker=dict(color='white'),
                                 textfont=dict(
                                    size=13,
                                    color="gray"
                                 ),
                                 showlegend=False, yaxis=axis_name))

    # Overall class balance: annotation placed at the matching height on the bar axis,
    # just past the last bar
    fig.add_annotation(
        x=df.shape[0]-0.6,
        y=p_geral,
        xref="x",
        yref="y",
        text="% classe 1 geral: {:,.0f}%".format(p_geral*100),
        showarrow=True,
        font=dict(
            color="white"
            ),
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=90,
        ay=0,
        bordercolor="#c7c7c7",
        borderwidth=2,
        borderpad=4,
        bgcolor="gray",
        opacity=0.8
        )

    fig.update_layout(
        barmode='stack',
        title_text=title_text,
        plot_bgcolor='white',
        legend_title=' ',
        xaxis=dict(
            # title.font replaces the deprecated titlefont_* shorthand
            title=dict(text='Intervalos de probabilidade', font=dict(size=18)),
            tickangle=0,
            tickfont=dict(size=8),
        ),
        legend=dict(x=0.84, y=0.9),
        # bars: proportions go up to 1, the extra head-room is where the labels are written
        yaxis=dict(
            range=[0,1.3],
            showticklabels=False
        ),
        # counts: range chosen so the count labels sit above the bars
        yaxis2=dict(
            anchor="x",
            overlaying="y",
            range=[-100,(df[name_cols_N[0]].max())*1.45],
            showticklabels=False
        ),
        # shares: range chosen so the percentage labels sit above the count labels
        yaxis3=dict(
            anchor="x",
            overlaying="y",
            range=[-10,2],
            showticklabels=False
        )
    )

    return fig

In [13]:
# Split the training scores into 7 quantile groups and bucket the validation scores with
# the same edges, then plot the class mix per group of the training table together with
# the mean of y on the training data.
train_table, test_table, train_group, test_group = probability_groups(7, tr.yprob, tr.y, va.yprob, va.y)
stacked_barplot_probs(train_table, tr.y.mean())

original bins: [0.00516991 0.09234908 0.26527958 0.44116578 0.58855572 0.74107164
 0.85812314 0.99508469]
modified bins: [0.         0.09234908 0.26527958 0.44116578 0.58855572 0.74107164
 0.85812314 1.        ]


## Métricas com variação de ponto de corte ou quantis

In [7]:
# Metrics on the training scores for the score thresholds 0.0, 0.1, ..., 0.9.
# For a quantile-based table pass type_info = 'quantiles' with fractions such as [0.1, 0.2, 0.3].
df_metrics = table_percentis_recall_precision(tr, 'yprob', 'y', np.arange(0,1,0.1), type_info = 'threshold')
df_metrics

Unnamed: 0,percentil,quantidade_preditos_faixa,quantidade_real_faixa,ponto_de_corte,recall,precision,f1,accuracy,corte_perc
0,100,1200,595.0,0.0,100.0,49.6,66.3,49.6,0.0 - 100%
1,85,1019,584.0,0.1,98.2,57.3,72.4,62.8,0.1 - 85%
2,76,914,567.0,0.2,95.3,62.0,75.1,68.8,0.2 - 76%
3,68,817,546.0,0.3,91.8,66.8,77.3,73.3,0.3 - 68%
4,60,720,519.0,0.4,87.2,72.1,78.9,76.9,0.4 - 60%
5,52,618,479.0,0.5,80.5,77.5,79.0,78.8,0.5 - 52%
6,42,502,414.0,0.6,69.6,82.5,75.5,77.6,0.6 - 42%
7,33,401,344.0,0.7,57.8,85.8,69.1,74.3,0.7 - 33%
8,22,260,237.0,0.8,39.8,91.2,55.4,68.2,0.8 - 22%
9,9,103,99.0,0.9,16.6,96.1,28.4,58.3,0.9 - 9%


### Gráfico com Precisão e volumes respectivos analisados

In [8]:
def PlotPrecision (df, 
                   nomes_legenda = ['precision', '# Preditos', '# Real'],
                   colors = ['rgb(236,112,20)', 'rgb(102, 163, 255)', 'rgb(0, 102, 255)'],
                   yaxis = [None, 'y2', 'y2'],
                   thresholds = [0.5],
                   y2_max = None,
                   title_text = "Insira o título"
                  ):
    '''
    Line chart of precision (left axis) and of predicted / real volumes (right axis)
    per cut-off, with markers and value labels on selected cut-offs.

    df: metrics table with the columns precision, quantidade_preditos_faixa,
        quantidade_real_faixa, ponto_de_corte and corte_perc (as returned by
        table_percentis_recall_precision);
    nomes_legenda: legend names of the three lines;
    colors: colours of the three lines;
    yaxis: y axis of each line (None is the primary axis);
    thresholds: values of ponto_de_corte to highlight;
    y2_max: upper limit of the volume axis; when None the largest plotted volume is used;
    title_text: figure title.

    Returns the plotly figure.
    Raises ValueError when a value in thresholds is not present in df['ponto_de_corte'].
    '''
    cols_plot = ['precision', 'quantidade_preditos_faixa', 'quantidade_real_faixa']
    line_width = 1.5

    if y2_max is None:
        # scale the volume axis to the data instead of a fixed value
        y2_max = df[cols_plot[1:]].max().max()

    fig = go.Figure()

    # x labels wrapped every 5 characters so they fit under the ticks
    eixo_x = ['<br>'.join(wrap(str(j),width=5)) for j in df['corte_perc'].values.tolist()]

    # the three lines
    for i, col in enumerate(cols_plot):
        fig.add_trace(go.Scatter(x = eixo_x,
                             y = df[col],
                             yaxis = yaxis[i],
                             mode = 'lines',
                             name = nomes_legenda[i],
                             line=dict(color=colors[i], width=line_width),
                            ))

    cut_points = df['ponto_de_corte'].to_numpy(dtype=float)
    annotations=[]
    for th in thresholds:
        # Locate the row by position with a float-tolerant comparison; this does not
        # depend on how the table is indexed.
        hits = np.flatnonzero(np.isclose(cut_points, th))
        if hits.size == 0:
            raise ValueError("threshold {} not found in df['ponto_de_corte']".format(th))
        pos = int(hits[0])
        x_mark = eixo_x[pos]

        for i, col in enumerate(cols_plot):
            value = df[col].iloc[pos]

            # marker on the line
            fig.add_trace(go.Scatter(
                        x=[x_mark],
                        y=[value],
                        yaxis = yaxis[i],
                        name = '',
                        mode='markers',
                        marker=dict(color=colors[i], size=12),
                        showlegend=False))

            # value label just above the marker; precision is shown as a percentage
            if col == 'precision':
                texto = '{:,.0f}%'.format(float(value))
            else:
                texto = '{:,.0f}'.format(float(value))
            annotations.append(dict(yref= yaxis[i],
                                    x=x_mark,
                                    y=float(value)+1.0,
                                    font = dict(color = colors[i], size=16),
                                    text = texto,
                                    xanchor='center', yanchor='bottom', showarrow=False))

    fig.update_layout(
            title_text=title_text,
            plot_bgcolor='white',
            legend_title=' ',
            xaxis=dict(
                showline=True,
                linecolor='rgb(204,204,204)',
                # title.font replaces the deprecated titlefont_* shorthand
                title=dict(text='Pontos de corte', font=dict(size=18)),
                tickangle = 0,
                ticks='outside',
                tickfont=dict(
                    size=12,
                    color='rgb(82,82,82)',
                ),
            ),
            # precision axis (left), coloured like the precision line
            yaxis=dict(
                title=dict(text="Precisão (%)", font=dict(color=colors[0], size=16)),
                tickfont=dict(
                    color = colors[0]
                ),
                range=[0,100],
                dtick = 10,
                showticklabels=True
            ),
            # volume axis (right), overlaid on the precision axis
            yaxis2=dict(
                title=dict(font=dict(color=colors[2])),
                tickfont=dict(
                    color= colors[2]
                ),
                overlaying="y",
                side="right",
                range=[0,y2_max],
                showticklabels=True
            ),
            annotations = annotations,
            legend=dict(x = 0.025,y = 0.025,
                        title = '',
                        bgcolor="white",
                        bordercolor="gray",
                        borderwidth=1.5)
            )
    return fig

In [9]:
# Precision and volumes per cut-off, highlighting the cut-offs 0.2, 0.5 and 0.6
PlotPrecision(df_metrics, thresholds = [0.2, 0.5, 0.6])

### Gráfico único com Precisão, Recall, F1

In [10]:
def PlotPrecisionRecallF1 (df, 
                   cols_plot = ['precision', 'recall', 'f1'],
                   nomes_legenda = ['precision', 'recall', 'f1-score'],
                   colors = ['rgb(236,112,20)', 'rgb(102, 163, 255)', 'rgb(128,0,0)'],
                   title_text = "Variação Precision e Recall por pontos de corte"
                   ):
    '''
    Single chart with one labelled line per metric, across the cut-offs of a metrics table.

    df: metrics table with a corte_perc column (x axis labels) and one column per
        metric in cols_plot;
    cols_plot: metric columns to draw; any number of columns works, e.g. only
        precision and recall;
    nomes_legenda: legend name for each column in cols_plot (at least as many entries);
    colors: line colour for each column in cols_plot (at least as many entries);
    title_text: figure title.

    Returns the plotly figure.
    '''
    line_width = 1.5
    fig = go.Figure()

    # x labels wrapped every 5 characters so they fit under the ticks
    eixo_x = ['<br>'.join(wrap(str(j),width=5)) for j in df['corte_perc'].values.tolist()]

    # one line per metric, with each value written above its point
    for i, col in enumerate(cols_plot):
        fig.add_trace(go.Scatter(x = eixo_x,
                             y = df[col],
                             text = df[col],
                             textposition = 'top center',
                             mode = 'lines+markers+text',
                             name = nomes_legenda[i],
                             line=dict(color=colors[i], width=line_width),
                            ))

    fig.update_layout(
            title_text=title_text,
            plot_bgcolor='white',
            xaxis=dict(
                showline=True,
                linecolor='rgb(204,204,204)',
                # title.font replaces the deprecated titlefont_* shorthand
                title=dict(text='Pontos de corte', font=dict(size=18)),
                tickangle = 0,
                ticks='outside',
                tickfont=dict(
                    size=12,
                    color='rgb(82,82,82)',
                ),
            ),
            # values are written on the points, so the axis tick labels are hidden
            yaxis = dict(showticklabels=False),
            legend=dict(x = 0.025,y = 0.025,
                        title = '',
                        bgcolor="white")
            )
    return fig

In [11]:
# Same chart restricted to precision and recall
PlotPrecisionRecallF1(df_metrics, cols_plot = ['precision', 'recall'])