# Table of contents

## Importing libraries

In [19]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler

## Loading data

In [20]:
df_train = pd.read_csv('./csv/conjunto_de_treinamento.csv')
df_test = pd.read_csv('./csv/conjunto_de_teste.csv')

In [21]:
df_train.shape, df_test.shape

((20000, 42), (5000, 41))

## First 5 rows of training dataset

In [22]:
df_train.head()

Unnamed: 0,id_solicitante,produto_solicitado,dia_vencimento,forma_envio_solicitacao,tipo_endereco,sexo,idade,estado_civil,qtde_dependentes,grau_instrucao,...,possui_telefone_trabalho,codigo_area_telefone_trabalho,meses_no_trabalho,profissao,ocupacao,profissao_companheiro,grau_instrucao_companheiro,local_onde_reside,local_onde_trabalha,inadimplente
0,1,1,10,presencial,1,M,85,2,0,0,...,N,,0,9.0,1.0,0.0,0.0,600.0,600.0,0
1,2,1,25,internet,1,F,38,1,0,0,...,N,,0,2.0,5.0,,,492.0,492.0,0
2,3,1,20,internet,1,F,37,2,0,0,...,N,,0,,,,,450.0,450.0,1
3,4,1,20,internet,1,M,37,1,1,0,...,Y,54.0,0,9.0,2.0,,,932.0,932.0,1
4,5,7,1,internet,1,F,51,1,3,0,...,N,,0,9.0,5.0,,,440.0,440.0,1


## Summary statistics for numerical columns

In [23]:
num_cols = df_train.select_dtypes(include=np.number).columns
df_train[num_cols].describe()

Unnamed: 0,id_solicitante,produto_solicitado,dia_vencimento,tipo_endereco,idade,estado_civil,qtde_dependentes,grau_instrucao,nacionalidade,tipo_residencia,...,valor_patrimonio_pessoal,possui_carro,meses_no_trabalho,profissao,ocupacao,profissao_companheiro,grau_instrucao_companheiro,local_onde_reside,local_onde_trabalha,inadimplente
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,19464.0,...,20000.0,20000.0,20000.0,16903.0,17022.0,8486.0,7140.0,20000.0,20000.0,20000.0
mean,10000.5,1.28435,13.14725,1.00635,42.3525,2.12085,0.6664,0.0,0.9616,1.261303,...,2095.614,0.33905,0.0089,8.045081,2.53331,3.708107,0.288095,581.29525,581.29525,0.5
std,5773.647028,1.008239,6.748507,0.079435,14.930177,1.332004,1.236725,0.0,0.202305,0.88358,...,44033.44,0.473399,0.388881,3.21079,1.532765,5.181241,0.944339,227.369798,227.369798,0.500013
min,1.0,1.0,1.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,105.0,0.0
25%,5000.75,1.0,10.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,9.0,1.0,0.0,0.0,444.0,444.0,0.0
50%,10000.5,1.0,10.0,1.0,40.0,2.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,9.0,2.0,0.0,0.0,596.0,596.0,0.5
75%,15000.25,1.0,20.0,1.0,52.0,2.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,9.0,4.0,11.0,0.0,728.0,728.0,1.0
max,20000.0,7.0,25.0,2.0,106.0,7.0,53.0,0.0,2.0,5.0,...,6000000.0,1.0,32.0,17.0,5.0,17.0,5.0,999.0,999.0,1.0


## Identify categorical variables and print unique values and their counts

In [24]:
cat_cols = df_train.select_dtypes(include='object').columns
for col in cat_cols:
    display(f'{col} unique values and counts:', df_train[col].value_counts())

'forma_envio_solicitacao unique values and counts:'

internet      11264
presencial     7855
correio         881
Name: forma_envio_solicitacao, dtype: int64

'sexo unique values and counts:'

F    12246
M     7722
N       25
         7
Name: sexo, dtype: int64

'estado_onde_nasceu unique values and counts:'

BA    2351
SP    2336
RS    1919
CE    1910
PE    1651
MG    1446
RN     827
       822
PR     764
RJ     720
AL     678
PA     676
PB     608
MA     577
GO     460
MT     375
PI     284
SC     271
ES     251
MS     199
AC     192
SE     153
AM     147
DF     131
AP     102
TO      82
RO      53
RR      15
Name: estado_onde_nasceu, dtype: int64

'estado_onde_reside unique values and counts:'

SP    3578
BA    2045
RS    1995
CE    1865
PE    1484
MG    1187
PA     927
RJ     863
RN     846
GO     682
PR     610
AL     592
MT     537
PB     499
MA     290
DF     285
MS     274
ES     267
SC     246
AP     198
AM     162
PI     157
SE     125
RO     125
AC      93
TO      43
RR      25
Name: estado_onde_reside, dtype: int64

'possui_telefone_residencial unique values and counts:'

Y    16474
N     3526
Name: possui_telefone_residencial, dtype: int64

'codigo_area_telefone_residencial unique values and counts:'

       3534
5      1838
107    1142
97     1142
54      904
       ... 
113       1
36        1
89        1
93        1
99        1
Name: codigo_area_telefone_residencial, Length: 81, dtype: int64

'possui_telefone_celular unique values and counts:'

N    20000
Name: possui_telefone_celular, dtype: int64

'vinculo_formal_com_empresa unique values and counts:'

N    11174
Y     8826
Name: vinculo_formal_com_empresa, dtype: int64

'estado_onde_trabalha unique values and counts:'

      13573
SP     1010
RS      819
CE      588
BA      569
MG      500
PE      369
PA      316
PR      236
RJ      229
MT      224
GO      218
RN      212
AL      151
MS      150
PB      132
DF      115
SC      114
ES       86
AP       77
MA       73
RO       61
AM       54
PI       39
AC       36
SE       25
TO       18
RR        6
Name: estado_onde_trabalha, dtype: int64

'possui_telefone_trabalho unique values and counts:'

N    14519
Y     5481
Name: possui_telefone_trabalho, dtype: int64

'codigo_area_telefone_trabalho unique values and counts:'

       14525
5        631
54       442
107      407
97       264
       ...  
115        1
96         1
53         1
122        1
74         1
Name: codigo_area_telefone_trabalho, Length: 77, dtype: int64

# Preprocess the data

In [26]:
scaler = MinMaxScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])

df_train[num_cols] = scaler.transform(df_train[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])

label_bin = LabelBinarizer()
df_train[cat_cols] = label_bin.fit_transform(df_train[cat_cols])

df_test[cat_cols] = label_bin.transform(df_test[cat_cols])

KeyError: "['inadimplente'] not in index"

## Define high cardinality columns to drop

In [None]:
high_card_cols = ['estado_onde_nasceu', 'estado_onde_reside', 'codigo_area_telefone_residencial',
                  'possui_telefone_celular', 'estado_onde_trabalha', 'codigo_area_telefone_trabalho',
                  'qtde_contas_bancarias_especiais', 'grau_instrucao', 'meses_no_trabalho']

## Drop high cardinality columns

In [None]:
dados.drop(high_card_cols, axis=1, inplace=True)
resposta.drop(high_card_cols, axis=1, inplace=True)

## Apply one-hot encoding to categorical columns with 3 or more categories

In [None]:
cat_cols = ['sexo', 'forma_envio_solicitacao']
dados = pd.get_dummies(dados, columns=cat_cols)
resposta = pd.get_dummies(resposta, columns=cat_cols)

## Apply binary encoding to binary columns

In [None]:
bin_cols = ['possui_telefone_residencial', 'vinculo_formal_com_empresa', 'possui_telefone_trabalho']
binarizador = LabelBinarizer()
for col in bin_cols:
    dados[col] = binarizador.fit_transform(dados[col])
    resposta[col] = binarizador.fit_transform(resposta[col])

## Fill missing values in selected columns with 0 or 'N'

In [None]:
missing_cols = ['profissao_companheiro', 'grau_instrucao_companheiro']
dados[missing_cols] = dados[missing_cols].fillna(0)
resposta[missing_cols] = resposta[missing_cols].fillna(0)
dados['sexo'] = dados['sexo'].fillna('N')
resposta['sexo'] = resposta['sexo'].fillna('N')

## Display preprocessed data

In [None]:
display(dados.head(5).T)

## Check class distribution

In [None]:
print("Class distribution:")
display(dados['inadimplente'].value_counts())

## Check mean values of attributes in each class

In [None]:
print("\nMean values of attributes in each class:")
display(dados.groupby(['inadimplente']).mean().T)