In [1]:
%pylab inline

import pandas as pd
import plotly.express as px

import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics

from sklearn.model_selection import train_test_split

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Mount Google Drive at /content/drive so the files stored there can be read
# from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Fix the random behaviour so runs are repeatable.
# The bare name `random` used here before only resolved to numpy.random because
# `%pylab inline` star-imports NumPy; seeding through `np.random` says the same
# thing explicitly and keeps working if the standard-library `random` module is
# ever imported in this notebook.
np.random.seed(38)

**Objetivo**
- Identificar quais funcionários da empresa têm maior propensão a pedir demissão.

# Leitura dos Dados

In [5]:
# Location of the employee dataset, relative to the working directory.
caminho_dataset = 'drive/MyDrive/ASA Arcelor Mittal 2023/data/dataset empregados - classificação.csv'

# Load the CSV and preview its first 10 rows.
df = pd.read_csv(caminho_dataset)
df.head(n=10)

Unnamed: 0,Escolaridade,AnoIngresso,Cidade,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao
0,graduacao,2017,Bangalore,3,34,masculino,nao,0.0,0
1,graduacao,2013,Pune,1,28,feminino,nao,3.0,1
2,graduacao,2014,New Delhi,3,38,feminino,nao,2.0,0
3,mestrado,2016,Bangalore,3,27,masculino,nao,5.0,1
4,mestrado,2017,Pune,3,24,masculino,sim,2.0,1
5,graduacao,2016,Bangalore,3,22,masculino,nao,0.0,0
6,graduacao,2015,New Delhi,3,38,masculino,nao,0.0,0
7,graduacao,2016,Bangalore,3,34,feminino,nao,2.0,1
8,graduacao,2016,Pune,3,23,masculino,nao,1.0,0
9,mestrado,2017,New Delhi,2,37,masculino,nao,2.0,0


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,AnoIngresso,NivelSalario,Idade,AnosExperiencia,PedidoDemissao
count,4653.0,4653.0,4653.0,4652.0,4653.0
mean,2015.06297,2.698259,29.393295,2.906062,0.343864
std,1.863377,0.561435,4.826087,1.558157,0.475047
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


In [7]:
# Number of distinct values in each column, from fewest to most.
df.nunique().sort_values()

Sexo                2
JaFicouOcioso       2
PedidoDemissao      2
Escolaridade        3
Cidade              3
NivelSalario        3
AnoIngresso         7
AnosExperiencia     8
Idade              20
dtype: int64

É importante analisar a distribuição da *target* para vermos o desbalanceamento das classes

In [8]:
# Absolute frequency of each target class.
df_cont = df['PedidoDemissao'].value_counts()

print('Em valores absolutos:')
display(df_cont)

# The same counts expressed as a percentage of all rows.
print('\nEm percentual:')
display(df_cont / len(df) * 100)

# Imbalance ratio: count of class 0 divided by count of class 1.
taxa_desbalanceamento = df_cont[0] / df_cont[1]
print(f'\n A taxa de desbalanceamento é de {taxa_desbalanceamento:.2f}.')

px.bar(df_cont)

Em valores absolutos:


0    3053
1    1600
Name: PedidoDemissao, dtype: int64


Em percentual:


0    65.613583
1    34.386417
Name: PedidoDemissao, dtype: float64


 A taxa de desbalanceamento é de 1.91.


# Tratamento dos dados
Dependendo do algoritmo utilizado, algumas etapas de tratamento dos dados podem ser necessárias:
+ Tratamento de nulos
+ *Encoding*
+ Normalização

Como vamos usar regressão logística e esse algoritmo não possui metodologia própria para lidar com essas situações, todas as etapas citadas são necessárias.

## Tratamento de Nulos
Para realizar o tratamento de nulos é, mais uma vez, importante conhecer seu dado.

Ele é quantitativo discreto? Quantitativo contínuo? Categórico? Cada tipo tem uma maneira própria de ser tratado.



In [9]:
# Show every row that has at least one missing value.
df[df.isna().any(axis=1)]

Unnamed: 0,Escolaridade,AnoIngresso,Cidade,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao
40,graduacao,2015,Bangalore,3,36,,,,0
2515,graduacao,2014,Bangalore,3,29,,nao,0.0,0
3559,graduacao,2014,,3,32,masculino,nao,1.0,0


In [10]:
# Remove the row labelled 40 (the incomplete-rows listing shows it with three
# missing fields), then count the Sexo values in what remains.
df_clean = df.drop(labels=40, axis='index')
df_clean.loc[:, 'Sexo'].value_counts()

masculino    2776
feminino     1875
Name: Sexo, dtype: int64

In [11]:
# Bar chart of how many rows fall in each JaFicouOcioso category.
contagem_ocioso = df_clean['JaFicouOcioso'].value_counts()
px.bar(contagem_ocioso)

In [41]:
# Frequency of each Cidade value; note this reads the original frame `df`,
# not `df_clean`.
df['Cidade'].value_counts()

Bangalore    2227
Pune         1268
New Delhi    1157
Name: Cidade, dtype: int64

In [12]:
from sklearn.impute import SimpleImputer

# Categorical columns whose gaps will be filled with their most frequent value.
colunas_imputadas = ['Cidade', 'Sexo']

# Learn the most frequent value of each of those columns.
cat_imp = SimpleImputer(strategy='most_frequent')
cat_imp.fit(df_clean[colunas_imputadas])

In [13]:
# Preview the imputed values; the result is a NumPy array, not a DataFrame,
# and nothing is stored yet.
cat_imp.transform(df_clean[['Cidade', 'Sexo']])

array([['Bangalore', 'masculino'],
       ['Pune', 'feminino'],
       ['New Delhi', 'feminino'],
       ...,
       ['New Delhi', 'masculino'],
       ['Bangalore', 'masculino'],
       ['Bangalore', 'masculino']], dtype=object)

In [14]:
# Overwrite both columns with their imputed versions.
colunas_cat = ['Cidade', 'Sexo']
df_clean[colunas_cat] = cat_imp.transform(df_clean[colunas_cat])

# Sanity check: list any row that still holds a missing value (expected: none).
df_clean[df_clean.isna().any(axis=1)]

Unnamed: 0,Escolaridade,AnoIngresso,Cidade,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao


## *Encoding*
Para fazer o encoding das variáveis de texto, precisamos analisar quantas categorias são e qual tipo de codificação vamos usar.

No nosso exemplo, Sexo e JaFicouOcioso possuem apenas 2 categorias, enquanto Cidade e Escolaridade possuem 3.

Isso significa que vamos usar *One Hot Encoding* em Escolaridade?

In [42]:
# Frequency of each Escolaridade level.
df_clean['Escolaridade'].value_counts()

graduacao    3600
mestrado      873
doutorado     179
Name: Escolaridade, dtype: int64

In [15]:
from sklearn.preprocessing import OrdinalEncoder

# Text columns with two categories each, to be turned into integer codes.
colunas_binarias = ['Sexo', 'JaFicouOcioso']

ord_enc = OrdinalEncoder(dtype=int)

# Work on a copy so df_clean keeps the original text labels.
df_enc = df_clean.copy()

# Learn the categories and write the integer codes in one step.
df_enc[colunas_binarias] = ord_enc.fit_transform(df_clean[colunas_binarias])
df_enc

Unnamed: 0,Escolaridade,AnoIngresso,Cidade,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao
0,graduacao,2017,Bangalore,3,34,1,0,0.0,0
1,graduacao,2013,Pune,1,28,0,0,3.0,1
2,graduacao,2014,New Delhi,3,38,0,0,2.0,0
3,mestrado,2016,Bangalore,3,27,1,0,5.0,1
4,mestrado,2017,Pune,3,24,1,1,2.0,1
...,...,...,...,...,...,...,...,...,...
4648,graduacao,2013,Bangalore,3,26,0,0,4.0,0
4649,mestrado,2013,Pune,2,37,1,0,2.0,1
4650,mestrado,2018,New Delhi,3,27,1,0,5.0,1
4651,graduacao,2012,Bangalore,3,30,1,1,2.0,0


In [43]:
# Categories learned by the encoder, one array per encoded column; the position
# of a label inside its array is the integer code it was given.
ord_enc.categories_

[array(['feminino', 'masculino'], dtype=object),
 array(['nao', 'sim'], dtype=object)]

In [16]:
# Escolaridade is encoded with explicit integer ranks
# (graduacao -> 0, mestrado -> 1, doutorado -> 2).
ordem_escolaridade = {
    'graduacao': 0,
    'mestrado': 1,
    'doutorado': 2
}

# Map from the untouched text column in df_clean instead of calling `.replace`
# on df_enc: `.map` yields an integer column directly (str -> int `.replace`
# relies on silent downcasting, which recent pandas versions deprecate), and
# because the source column is never overwritten the cell can be re-run safely.
df_enc['Escolaridade'] = df_clean['Escolaridade'].map(ordem_escolaridade)

# `.map` turns labels missing from the dictionary into NaN; fail loudly rather
# than pass them on to the model.
assert df_enc['Escolaridade'].notna().all(), 'Escolaridade contém categoria não mapeada'
df_enc

Unnamed: 0,Escolaridade,AnoIngresso,Cidade,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao
0,0,2017,Bangalore,3,34,1,0,0.0,0
1,0,2013,Pune,1,28,0,0,3.0,1
2,0,2014,New Delhi,3,38,0,0,2.0,0
3,1,2016,Bangalore,3,27,1,0,5.0,1
4,1,2017,Pune,3,24,1,1,2.0,1
...,...,...,...,...,...,...,...,...,...
4648,0,2013,Bangalore,3,26,0,0,4.0,0
4649,1,2013,Pune,2,37,1,0,2.0,1
4650,1,2018,New Delhi,3,27,1,0,5.0,1
4651,0,2012,Bangalore,3,30,1,1,2.0,0


In [17]:
# One-hot encode Cidade. drop_first=True drops the first category, so only
# Cidade_New Delhi and Cidade_Pune are created and a row with both at 0 stands
# for the dropped city.
# dtype=int keeps the new columns as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise produce True/False columns).
df_enc = pd.get_dummies(df_enc, columns=['Cidade'], drop_first=True, dtype=int)
df_enc

Unnamed: 0,Escolaridade,AnoIngresso,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,PedidoDemissao,Cidade_New Delhi,Cidade_Pune
0,0,2017,3,34,1,0,0.0,0,0,0
1,0,2013,1,28,0,0,3.0,1,0,1
2,0,2014,3,38,0,0,2.0,0,1,0
3,1,2016,3,27,1,0,5.0,1,0,0
4,1,2017,3,24,1,1,2.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
4648,0,2013,3,26,0,0,4.0,0,0,0
4649,1,2013,2,37,1,0,2.0,1,0,1
4650,1,2018,3,27,1,0,5.0,1,1,0
4651,0,2012,3,30,1,1,2.0,0,0,0


## Separação treino-teste

In [18]:
from sklearn.model_selection import StratifiedKFold  # plain KFold is enough when there is no difference between groups

# How many folds the cross-validation uses.
N_folds = 5

# Splitter: shuffles the original row order (seeded, so the split is
# repeatable) and cuts the data into N_folds parts.
splitter = StratifiedKFold(n_splits=N_folds, shuffle=True, random_state=38)

# Split the features (X) from the target (y); y stays a one-column DataFrame.
df_sem_y = df_enc.drop(columns='PedidoDemissao')
df_y = df_enc[['PedidoDemissao']].copy()

# Demonstration only: take the first split to show what a fold looks like.
index_train, index_test = next(iter(splitter.split(df_sem_y, df_y)))
print('Indices de treino:', index_train)
print('Indices de teste:', index_test)

# The splitter returns positions, hence .iloc.
X_train, y_train = df_sem_y.iloc[index_train], df_y.iloc[index_train]
X_test, y_test = df_sem_y.iloc[index_test], df_y.iloc[index_test]

print('\nY Train:')
display(y_train)

print('\nY Test:')
display(y_test)

# Class shares (in %) of each part, to compare how the target is distributed.
print('\nY Train Value Counts (%)')
display(y_train.value_counts() / len(y_train) * 100)

print('\nY Test Value Counts (%)')
display(y_test.value_counts() / len(y_test) * 100)

Indices de treino: [   0    1    2 ... 4648 4649 4650]
Indices de teste: [   3    8   13   21   33   42   49   50   52   55   56   58   59   61
   63   64   74   76   79   85   86   89   96   99  106  107  117  122
  126  134  139  140  141  146  156  158  160  165  174  181  182  186
  193  196  201  202  203  206  223  225  235  236  240  245  247  252
  257  260  263  267  270  273  274  275  277  283  286  289  300  303
  305  307  323  326  333  340  341  345  349  351  352  353  359  372
  375  382  384  388  398  407  409  414  416  422  425  426  428  437
  441  445  452  454  476  477  481  485  489  499  509  510  515  516
  519  521  533  535  543  553  565  571  578  582  590  591  595  605
  607  608  615  616  618  625  639  646  647  657  659  660  663  671
  673  683  692  701  704  708  716  718  730  731  752  763  768  771
  773  783  784  785  787  790  791  797  800  803  806  809  811  816
  821  822  826  834  845  853  858  861  863  871  880  887  890  905
  91

Unnamed: 0,PedidoDemissao
0,0
1,1
2,0
4,1
5,0
...,...
4646,0
4647,0
4649,1
4650,1



Y Test:


Unnamed: 0,PedidoDemissao
3,1
8,0
13,0
21,0
33,0
...,...
4632,1
4637,1
4641,1
4648,0



Y Train Value Counts (%)


PedidoDemissao
0                 65.600645
1                 34.399355
dtype: float64


Y Test Value Counts (%)


PedidoDemissao
0                 65.628357
1                 34.371643
dtype: float64

In [19]:
# One list per piece of the split; position i holds the data of fold i.
X_train_fold, y_train_fold = [], []
X_test_fold, y_test_fold = [], []

for index_train, index_test in splitter.split(df_sem_y, df_y):
  # Slice features and target by the positional indices of this fold.
  X_train, y_train = df_sem_y.iloc[index_train], df_y.iloc[index_train]
  X_test, y_test = df_sem_y.iloc[index_test], df_y.iloc[index_test]

  # Store the four pieces of this fold.
  X_train_fold += [X_train]
  y_train_fold += [y_train]
  X_test_fold += [X_test]
  y_test_fold += [y_test]

In [20]:
# Training features of the first fold.
X_train_fold[0]

Unnamed: 0,Escolaridade,AnoIngresso,NivelSalario,Idade,Sexo,JaFicouOcioso,AnosExperiencia,Cidade_New Delhi,Cidade_Pune
0,0,2017,3,34,1,0,0.0,0,0
1,0,2013,1,28,0,0,3.0,0,1
2,0,2014,3,38,0,0,2.0,1,0
4,1,2017,3,24,1,1,2.0,0,1
5,0,2016,3,22,1,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...
4646,0,2013,3,25,0,0,3.0,0,0
4647,0,2016,3,30,1,0,2.0,0,1
4649,1,2013,2,37,1,0,2.0,0,1
4650,1,2018,3,27,1,0,5.0,1,0


## Rescaling dos dados e treino do modelo

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

# Metric name -> scoring function, in the order the results are stored.
funcoes_metricas = {
    'accuracy': accuracy_score,
    'recall': recall_score,
    'precision': precision_score,
    'f1': f1_score
}

# Per-fold metric values: one list per metric, one entry per fold.
train_metrics = {nome: [] for nome in funcoes_metricas}
test_metrics = {nome: [] for nome in funcoes_metricas}

# Per-fold predictions (classes for train and test, probabilities for test).
y_hat_train_fold = []
y_hat_test_fold = []
y_hat_test_proba_fold = []

# Run once per fold.
dados_por_fold = zip(X_train_fold, X_test_fold, y_train_fold, y_test_fold)
for fold, (X_train, X_test, y_train, y_test) in enumerate(dados_por_fold):

  # The scaler is created inside the loop so that it is fitted on this fold's
  # training rows only; the test rows are merely transformed with it.
  x_scaler = StandardScaler()
  X_train_norm = x_scaler.fit_transform(X_train)
  X_test_norm = x_scaler.transform(X_test)

  # Train the model on the single target column.
  # NOTE(review): fit_intercept=False removes the bias term while the features
  # are centred by StandardScaler — confirm this is intended.
  model = LogisticRegression(fit_intercept=False, random_state=42)
  model.fit(X_train_norm, y_train.iloc[:, 0])

  # Class predictions for both parts, plus the second predict_proba column
  # (used later as the class-1 probability) for the test rows.
  y_hat_train = model.predict(X_train_norm)
  y_hat_test = model.predict(X_test_norm)
  y_hat_proba_test = model.predict_proba(X_test_norm)[:, 1]

  # Keep this fold's predictions (train and test).
  y_hat_train_fold.append(y_hat_train)
  y_hat_test_fold.append(y_hat_test)
  y_hat_test_proba_fold.append(y_hat_proba_test)

  # Score the fold on the training part and on the test part.
  for nome, funcao in funcoes_metricas.items():
    train_metrics[nome].append(funcao(y_train, y_hat_train))
    test_metrics[nome].append(funcao(y_test, y_hat_test))


In [22]:
# Metric values measured on the training part, one entry per fold.
train_metrics

{'accuracy': [0.6535877452297769,
  0.6589626444504165,
  0.6477700161203654,
  0.6560988715744224,
  0.6660397635679742],
 'recall': [0.65859375, 0.65703125, 0.65234375, 0.6546875, 0.6734375],
 'precision': [0.49734513274336284,
  0.5032914422501497,
  0.49088771310993534,
  0.5,
  0.5109662122110255],
 'f1': [0.5667226890756303,
  0.5699762792273807,
  0.5602146930560215,
  0.5669824086603518,
  0.5810583080552747]}

In [23]:
# Metric values measured on the test part, one entry per fold.
test_metrics

{'accuracy': [0.6466165413533834,
  0.6423200859291085,
  0.6559139784946236,
  0.656989247311828,
  0.6473118279569893],
 'recall': [0.64375, 0.65, 0.628125, 0.678125, 0.678125],
 'precision': [0.48931116389548696,
  0.48484848484848486,
  0.5,
  0.5011547344110855,
  0.49095022624434387],
 'f1': [0.5560053981106613,
  0.5554072096128172,
  0.556786703601108,
  0.5763612217795485,
  0.5695538057742782]}

## Avaliação de métricas

In [24]:
# Training metrics as a table: one row per fold, one column per metric.
df_train_metrics = pd.DataFrame.from_dict(train_metrics)
df_train_metrics

Unnamed: 0,accuracy,recall,precision,f1
0,0.653588,0.658594,0.497345,0.566723
1,0.658963,0.657031,0.503291,0.569976
2,0.64777,0.652344,0.490888,0.560215
3,0.656099,0.654687,0.5,0.566982
4,0.66604,0.673438,0.510966,0.581058


In [25]:
# Average each training metric over the folds.
# The average is rebuilt from train_metrics instead of from df_train_metrics
# itself, so the cell can be re-run safely: averaging the already-averaged
# Series would collapse it to a single number.
df_train_metrics = pd.DataFrame(train_metrics).mean(axis=0)
df_train_metrics

accuracy     0.656492
recall       0.659219
precision    0.500498
f1           0.568991
dtype: float64

In [26]:
# Test metrics as a table: one row per fold, one column per metric.
df_test_metrics = pd.DataFrame.from_dict(test_metrics)
df_test_metrics

Unnamed: 0,accuracy,recall,precision,f1
0,0.646617,0.64375,0.489311,0.556005
1,0.64232,0.65,0.484848,0.555407
2,0.655914,0.628125,0.5,0.556787
3,0.656989,0.678125,0.501155,0.576361
4,0.647312,0.678125,0.49095,0.569554


In [27]:
# Average each test metric over the folds.
# The average is rebuilt from test_metrics instead of from df_test_metrics
# itself, so the cell can be re-run safely: averaging the already-averaged
# Series would collapse it to a single number.
df_test_metrics = pd.DataFrame(test_metrics).mean(axis=0)
df_test_metrics

accuracy     0.649830
recall       0.655625
precision    0.493253
f1           0.562823
dtype: float64

In [28]:
# Put the train and test averages side by side; the two columns come out
# labelled 0 and 1.
df_metrics = pd.concat(objs=[df_train_metrics, df_test_metrics], axis='columns')
df_metrics

Unnamed: 0,0,1
accuracy,0.656492,0.64983
recall,0.659219,0.655625
precision,0.500498,0.493253
f1,0.568991,0.562823


In [29]:
# Give the two columns readable labels, then round to 3 decimal places.
nomes_colunas = {0: 'Treino', 1: 'Teste'}
df_metrics = df_metrics.rename(columns=nomes_colunas)
df_metrics = df_metrics.round(decimals=3)
df_metrics

Unnamed: 0,Treino,Teste
accuracy,0.656,0.65
recall,0.659,0.656
precision,0.5,0.493
f1,0.569,0.563


## Matriz de confusão

In [30]:
# Predicted classes for the test rows of the first fold.
y_hat_test_fold[0]

array([0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [31]:
# Flatten the per-fold results into single lists holding every test row,
# one fold after another.

# True target values (the single column of each fold's y frame).
y_test_list = [valor for y_fold in y_test_fold for valor in y_fold.iloc[:, 0]]

# Predicted classes.
y_hat_test_list = [valor for lista_fold in y_hat_test_fold for valor in lista_fold]

# Predicted probabilities.
y_test_hat_proba_list = [valor for lista_fold in y_hat_test_proba_fold for valor in lista_fold]

In [32]:
# First 10 entries of the flattened test predictions.
y_hat_test_list[:10]

[0, 1, 0, 0, 0, 0, 0, 1, 1, 0]

In [33]:
# Raw counts over all test folds: rows are the true classes, columns the
# predicted ones.
contagens = sklearn.metrics.confusion_matrix(y_test_list, y_hat_test_list)

# Wrap the counts in a labelled table.
confusion_matrix = pd.DataFrame(
    data=contagens,
    index=['real_negativo', 'real_positivo'],
    columns=['pred_negativo', 'pred_positivo'],
)

# axis=None: one colour gradient across all four cells.
display(confusion_matrix.style.background_gradient(axis=None))

Unnamed: 0,pred_negativo,pred_positivo
real_negativo,1974,1078
real_positivo,551,1049


Será que essa é a única resposta que eu posso conseguir? Parece meio desbalanceado entre Falsos Positivos e Falsos Negativos... Será que tem algo que eu consigo ajustar para fazer minha escolha?

## Escolha do Threshold

In [34]:
# First 10 predicted probabilities for the test rows.
y_test_hat_proba_list[:10]

[0.45500040054453034,
 0.6106870529035913,
 0.3228509494884076,
 0.49472098706261586,
 0.29384562163680916,
 0.27516126671222435,
 0.47103589707512167,
 0.7850592375362625,
 0.5932235284730085,
 0.32858121086422265]

In [35]:
# Candidate thresholds (cut-offs) to try: start at 0.05 and advance in steps
# of 0.01, stopping before 0.95.
threshold_list = np.arange(start=0.05, stop=0.95, step=0.01)

In [36]:
# Show the candidate thresholds.
threshold_list

array([0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11, 0.12, 0.13, 0.14, 0.15,
       0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26,
       0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37,
       0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48,
       0.49, 0.5 , 0.51, 0.52, 0.53, 0.54, 0.55, 0.56, 0.57, 0.58, 0.59,
       0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68, 0.69, 0.7 ,
       0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8 , 0.81,
       0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9 , 0.91, 0.92,
       0.93, 0.94])

In [51]:
# Macro-averaged F1 obtained at each candidate threshold.
# The probabilities are stored in a plain Python list, so convert them to an
# array once, before the loop. Comparing the list itself with `threshold` only
# worked because each threshold is a NumPy scalar; a plain float threshold
# would raise TypeError, and the list was re-converted on every iteration.
y_test_hat_proba_array = np.asarray(y_test_hat_proba_list)

f1_th_list = []
for threshold in threshold_list:
  # Predict class 1 only when the probability is strictly above the threshold.
  y_pred_th = (y_test_hat_proba_array > threshold).astype(int)
  f1 = f1_score(y_test_list, y_pred_th, average='macro')
  f1_th_list.append(f1)

In [52]:
# Table pairing each threshold with the F1 obtained at it.
colunas_th = {'Threshold': threshold_list, 'F1': f1_th_list}
df_th = pd.DataFrame(colunas_th)
df_th

Unnamed: 0,Threshold,F1
0,0.05,0.255918
1,0.06,0.255918
2,0.07,0.255918
3,0.08,0.255918
4,0.09,0.255918
...,...,...
85,0.90,0.399450
86,0.91,0.398857
87,0.92,0.397509
88,0.93,0.396834


In [53]:
# Line chart of F1 as a function of the threshold, with a marker on each point.
px.line(
    data_frame=df_th,
    x='Threshold',
    y='F1',
    title='F1 por Threshold',
    markers=True,
)

In [54]:
# Threshold picked by hand for manual experiments; change it and re-run the cell.
threshold_manual = 0.6

# Class predictions at that threshold (1 when the probability is strictly above it).
y_test_th = (np.array(y_test_hat_proba_list) > threshold_manual).astype(int)

# Build the confusion matrix table.
# The scorer is called through its module path: importing the bare name
# `confusion_matrix` and then assigning the table to the same name made one
# identifier mean first a function and then a DataFrame within a single cell.
confusion_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y_test_list, y_test_th),
    index=['real_negativo', 'real_positivo'],
    columns=['pred_negativo', 'pred_positivo'],
)

# Show the macro-averaged F1 at this threshold.
f1 = f1_score(
    y_true=y_test_list,
    y_pred=y_test_th,
    average='macro'
    )
print('F1:', f'{f1:.3f}')

# Show the confusion matrix with one colour gradient across all cells.
display(confusion_matrix.style.background_gradient(axis=None))

F1: 0.679


Unnamed: 0,pred_negativo,pred_positivo
real_negativo,2514,538
real_positivo,763,837


## Exercício: Fazer a escolha do Threshold com base em custo
Para isso, utilizar um "custo" que faça sentido para cada quadrante. Por exemplo:
+ Se eu acertar, meu custo é 0;
+ Os falsos positivos me fazem investir 1000 reais para dar um aumento para um funcionário que não estava insatisfeito;
+ Os falsos negativos me fazem gastar 4000 reais na contratação de um novo funcionário.