# Table of contents

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler

## Loading data

In [2]:
# Read the training and test splits from the local csv/ directory.
df_train, df_test = (
    pd.read_csv('./csv/conjunto_de_treinamento.csv'),
    pd.read_csv('./csv/conjunto_de_teste.csv'),
)

In [3]:
# Sanity check: (rows, columns) of the training and test frames.
df_train.shape, df_test.shape

((20000, 42), (5000, 41))

## First 5 rows of the training dataset

In [4]:
# Peek at the first rows of the raw training frame (head() defaults to 5 rows).
df_train.head()

Unnamed: 0,id_solicitante,produto_solicitado,dia_vencimento,forma_envio_solicitacao,tipo_endereco,sexo,idade,estado_civil,qtde_dependentes,grau_instrucao,...,possui_telefone_trabalho,codigo_area_telefone_trabalho,meses_no_trabalho,profissao,ocupacao,profissao_companheiro,grau_instrucao_companheiro,local_onde_reside,local_onde_trabalha,inadimplente
0,1,1,10,presencial,1,M,85,2,0,0,...,N,,0,9.0,1.0,0.0,0.0,600.0,600.0,0
1,2,1,25,internet,1,F,38,1,0,0,...,N,,0,2.0,5.0,,,492.0,492.0,0
2,3,1,20,internet,1,F,37,2,0,0,...,N,,0,,,,,450.0,450.0,1
3,4,1,20,internet,1,M,37,1,1,0,...,Y,54.0,0,9.0,2.0,,,932.0,932.0,1
4,5,7,1,internet,1,F,51,1,3,0,...,N,,0,9.0,5.0,,,440.0,440.0,1


## Summary statistics for numerical columns

In [5]:
# Summary statistics (as produced by DataFrame.describe) for every numeric column.
# The result was previously only assigned, so the cell displayed nothing;
# ending the cell with the variable makes the table render.
num_cols = df_train.select_dtypes(include=np.number).describe()
num_cols

## Identify categorical variables and print unique values and their counts

In [6]:
# Object-dtype columns are treated as the categorical variables.
cat_vars = df_train.select_dtypes(include='object').columns

# For each of them, show a caption followed by the frequency of every distinct value.
for cat_var, column_values in df_train[cat_vars].items():
    caption = f'{cat_var} unique values and counts:'
    display(caption, column_values.value_counts())

'forma_envio_solicitacao unique values and counts:'

internet      11264
presencial     7855
correio         881
Name: forma_envio_solicitacao, dtype: int64

'sexo unique values and counts:'

F    12246
M     7722
N       25
         7
Name: sexo, dtype: int64

'estado_onde_nasceu unique values and counts:'

BA    2351
SP    2336
RS    1919
CE    1910
PE    1651
MG    1446
RN     827
       822
PR     764
RJ     720
AL     678
PA     676
PB     608
MA     577
GO     460
MT     375
PI     284
SC     271
ES     251
MS     199
AC     192
SE     153
AM     147
DF     131
AP     102
TO      82
RO      53
RR      15
Name: estado_onde_nasceu, dtype: int64

'estado_onde_reside unique values and counts:'

SP    3578
BA    2045
RS    1995
CE    1865
PE    1484
MG    1187
PA     927
RJ     863
RN     846
GO     682
PR     610
AL     592
MT     537
PB     499
MA     290
DF     285
MS     274
ES     267
SC     246
AP     198
AM     162
PI     157
SE     125
RO     125
AC      93
TO      43
RR      25
Name: estado_onde_reside, dtype: int64

'possui_telefone_residencial unique values and counts:'

Y    16474
N     3526
Name: possui_telefone_residencial, dtype: int64

'codigo_area_telefone_residencial unique values and counts:'

       3534
5      1838
107    1142
97     1142
54      904
       ... 
113       1
36        1
89        1
93        1
99        1
Name: codigo_area_telefone_residencial, Length: 81, dtype: int64

'possui_telefone_celular unique values and counts:'

N    20000
Name: possui_telefone_celular, dtype: int64

'vinculo_formal_com_empresa unique values and counts:'

N    11174
Y     8826
Name: vinculo_formal_com_empresa, dtype: int64

'estado_onde_trabalha unique values and counts:'

      13573
SP     1010
RS      819
CE      588
BA      569
MG      500
PE      369
PA      316
PR      236
RJ      229
MT      224
GO      218
RN      212
AL      151
MS      150
PB      132
DF      115
SC      114
ES       86
AP       77
MA       73
RO       61
AM       54
PI       39
AC       36
SE       25
TO       18
RR        6
Name: estado_onde_trabalha, dtype: int64

'possui_telefone_trabalho unique values and counts:'

N    14519
Y     5481
Name: possui_telefone_trabalho, dtype: int64

'codigo_area_telefone_trabalho unique values and counts:'

       14525
5        631
54       442
107      407
97       264
       ...  
115        1
96         1
53         1
122        1
74         1
Name: codigo_area_telefone_trabalho, Length: 77, dtype: int64

In [7]:
# A categorical column with more distinct values than this is considered high-cardinality.
HIGH_CARDINALITY_THRESHOLD = 25

# Object-dtype columns are the categorical candidates.
categorical_cols = list(df_train.select_dtypes(include=["object"]).columns)

# Keep, in their original order, the candidates whose number of distinct
# (non-null) values exceeds the threshold.
high_cardinality_cols = [
    name
    for name in categorical_cols
    if df_train[name].nunique() > HIGH_CARDINALITY_THRESHOLD
]

high_cardinality_cols

['estado_onde_nasceu',
 'estado_onde_reside',
 'codigo_area_telefone_residencial',
 'estado_onde_trabalha',
 'codigo_area_telefone_trabalho']

In [8]:
# Hand-maintained list of candidate feature columns.
# The commented-out entries are the five columns listed in `high_cardinality_cols`
# plus the target column `inadimplente`.
# NOTE(review): no cell visible in this notebook reads `selected_attributes` —
# confirm whether it is still needed.
selected_attributes = [
    "id_solicitante",
    "produto_solicitado",
    "dia_vencimento",
    "forma_envio_solicitacao",
    "tipo_endereco",
    "sexo",
    "idade",
    "estado_civil",
    "qtde_dependentes",
    "grau_instrucao",
    "nacionalidade",
    # "estado_onde_nasceu",
    # "estado_onde_reside",
    "possui_telefone_residencial",
    # "codigo_area_telefone_residencial",
    "tipo_residencia",
    "meses_na_residencia",
    "possui_telefone_celular",
    "possui_email",
    "renda_mensal_regular",
    "renda_extra",
    "possui_cartao_visa",
    "possui_cartao_mastercard",
    "possui_cartao_diners",
    "possui_cartao_amex",
    "possui_outros_cartoes",
    "qtde_contas_bancarias",
    "qtde_contas_bancarias_especiais",
    "valor_patrimonio_pessoal",
    "possui_carro",
    "vinculo_formal_com_empresa",
    # "estado_onde_trabalha",
    "possui_telefone_trabalho",
    # "codigo_area_telefone_trabalho",
    "meses_no_trabalho",
    "profissao",
    "ocupacao",
    "profissao_companheiro",
    "grau_instrucao_companheiro",
    "local_onde_reside",
    "local_onde_trabalha",
    # "inadimplente"
]


In [9]:
# Remove the high-cardinality categorical columns from both splits.
df_train = df_train.drop(columns=high_cardinality_cols)
df_test = df_test.drop(columns=high_cardinality_cols)

# Set the target aside so that imputation and encoding only touch the features.
inadimplente = df_train['inadimplente']
df_train = df_train.drop(columns=['inadimplente'])

# Empty / whitespace-only strings are missing values in disguise.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_train = df_train.replace(r'^\s*$', np.nan, regex=True)
df_test = df_test.replace(r'^\s*$', np.nan, regex=True)

# Impute both splits with a statistic taken from the training split only:
# the most frequent value for object columns, the mean for everything else.
for col in df_train:
    if df_train[col].dtype == 'O':
        fill_value = df_train[col].mode().iloc[0]
    else:
        fill_value = df_train[col].mean()
    df_train[col] = df_train[col].fillna(fill_value)
    df_test[col] = df_test[col].fillna(fill_value)

cat_vars = df_train.select_dtypes(include=['object', 'bool']).columns.tolist()

for var in cat_vars:
    train_levels = set(df_train[var].unique())
    test_levels = set(df_test[var].unique())

    if len(train_levels) == 2:
        # Exactly two categories: a single 0/1 column, with the mapping fitted on train.
        lb = LabelBinarizer()
        df_train[var] = lb.fit_transform(df_train[var]).ravel()
        df_test[var] = lb.transform(df_test[var]).ravel()
    elif train_levels == test_levels:
        # One-hot encode only when both splits hold exactly the same categories.
        # Comparing just the *number* of categories (as before) lets two different
        # category sets through and yields mismatched dummy columns.
        df_train = pd.get_dummies(df_train, columns=[var], prefix=[var])
        df_test = pd.get_dummies(df_test, columns=[var], prefix=[var])
    else:
        # Categories differ between the splits: the column is discarded.
        df_train = df_train.drop(columns=[var])
        df_test = df_test.drop(columns=[var])

# Every object/bool column has been binarized, one-hot encoded or dropped above,
# so no further get_dummies pass is needed.

# Safety net: give the test frame exactly the training feature columns, in the
# same order, before anything positional (scaler, classifier) consumes it.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

# Re-attach the target (aligned on the row index) for the exploratory cells.
df_train['inadimplente'] = inadimplente

## Display preprocessed data

In [10]:
# Transpose the 5-row preview so that each attribute is shown as a row.
display(df_train.head(5).T)

Unnamed: 0,0,1,2,3,4
id_solicitante,1.0,2.0,3.0,4.0,5.0
produto_solicitado,1.0,1.0,1.0,1.0,7.0
dia_vencimento,10.0,25.0,20.0,20.0,1.0
tipo_endereco,1.0,1.0,1.0,1.0,1.0
idade,85.0,38.0,37.0,37.0,51.0
estado_civil,2.0,1.0,2.0,1.0,1.0
qtde_dependentes,0.0,0.0,0.0,1.0,3.0
grau_instrucao,0.0,0.0,0.0,0.0,0.0
nacionalidade,1.0,1.0,1.0,1.0,1.0
possui_telefone_residencial,1.0,1.0,1.0,1.0,1.0


## Check class distribution

In [11]:
# Count how many training rows fall into each value of the target column.
display(df_train['inadimplente'].value_counts())

0    10000
1    10000
Name: inadimplente, dtype: int64

## Check mean values of attributes in each class

In [12]:
# Mean of every column within each target class, transposed so the classes become columns.
display(df_train.groupby(['inadimplente']).mean().T)

inadimplente,0,1
id_solicitante,9899.1986,10101.8014
produto_solicitado,1.254,1.3147
dia_vencimento,12.6051,13.6894
tipo_endereco,1.0067,1.006
idade,44.1542,40.5508
estado_civil,2.1625,2.0792
qtde_dependentes,0.6446,0.6882
grau_instrucao,0.0,0.0
nacionalidade,0.9615,0.9617
possui_telefone_residencial,0.8545,0.7929


In [13]:
# Shuffle all rows (frac=1) with a fixed seed so the row order is reproducible.
# NOTE: df_train is rebound, so re-running this cell on its own shuffles the
# already-shuffled frame and produces a different order.
df_train = df_train.sample(frac=1,random_state=12345)

In [14]:
# Feature matrix: every column except the target.
is_target = df_train.columns == 'inadimplente'
x = df_train.loc[:, ~is_target].values

# Target kept as a 2-D (n_rows, 1) array; it is flattened later where needed.
y = df_train.loc[:, is_target].values

In [15]:
def report_accuracy(title, y_true, y_pred):
    """Print hit/miss counts and the accuracy of `y_pred` against `y_true`.

    Parameters
    ----------
    title : str
        Heading printed before the figures.
    y_true, y_pred : 1-D arrays of the same length
        Expected and predicted labels.

    Returns
    -------
    float
        Fraction of positions where `y_pred` equals `y_true`.
    """
    total = len(y_true)
    # Vectorised count instead of the Python builtin `sum` over a NumPy array.
    hits = int(np.count_nonzero(y_pred == y_true))
    misses = total - hits
    accuracy = hits / total

    print(title)
    print('Total de amostras: ', total)
    print('Respostas corretas:', hits)
    print('Respostas erradas: ', misses)
    print('Acurácia = %.1f %%' % (100*accuracy))

    return accuracy


# First half of the rows is used for fitting, the second half for validation.
split_index = len(df_train)//2

x_train = x[:split_index, :]
y_train = y[:split_index].ravel()

x_test = x[split_index:, :]
y_test = y[split_index:].ravel()

# Fit the scaler on the training half only and reuse it for every other matrix.
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# The scaler was fitted on a plain array, so it is given a plain array here as
# well; passing the DataFrame makes newer scikit-learn versions warn about
# feature names.
# NOTE: df_test is rebound to the scaled array, so re-running this cell on its
# own would scale the data a second time.
df_test = scaler.transform(np.asarray(df_test))

# Baseline: KNN with scikit-learn's default hyper-parameters.
classifier = KNeighborsClassifier()

classifier.fit(x_train, y_train)

y_answer_train = classifier.predict(x_train)
y_answer_test = classifier.predict(x_test)

accuracy_train = report_accuracy(
    'DESEMPENHO DENTRO DA AMOSTRA DE TREINO', y_train, y_answer_train
)
accuracy_test = report_accuracy(
    'DESEMPENHO FORA DA AMOSTRA DE TREINO', y_test, y_answer_test
)

print('K TREINO TESTE')

# Sweep the number of neighbours, using the Manhattan distance (p=1) and uniform weights.
# NOTE(review): the upper bound of the sweep is the number of feature columns,
# which is unrelated to the number of neighbours — confirm this is intended.
for k in range(1, x_train.shape[1]+1):
    classificador = KNeighborsClassifier(
        n_neighbors=k,
        weights='uniform',
        p=1
    )

    classificador = classificador.fit(x_train, y_train)

    y_answer_train = classificador.predict(x_train)
    y_answer_test = classificador.predict(x_test)

    # Mean of the boolean "prediction is correct" mask == hits / total.
    accuracy_train = np.mean(y_answer_train == y_train)
    accuracy_test = np.mean(y_answer_test == y_test)

    print(
        '%3d' % k,
        '%6.1f' % (100*accuracy_train),
        '%6.1f' % (100*accuracy_test)
    )

# Disabled submission export, kept for reference. It used to be a bare
# triple-quoted string, which the notebook echoed as this cell's output.
# NOTE(review): the ids are generated as len(df_train)+i instead of being read
# from the test file — confirm before enabling.
#
# answer = classifier.predict(df_test)
#
# id = []
# for i in range(1, len(answer)+1):
#     id.append(str(len(df_train)+i))
#
# data = {'id_solicitante': id, 'inadimplente': answer}
#
# df = pd.DataFrame(data)
# df.to_csv('inadimplente.csv', index=False)



DESEMPENHO DENTRO DA AMOSTRA DE TREINO
Total de amostras:  10000
Respostas corretas: 7105
Respostas erradas:  2895
Acurácia = 71.0 %
DESEMPENHO FORA DA AMOSTRA DE TREINO
Total de amostras:  10000
Respostas corretas: 5262
Respostas erradas:  4738
Acurácia = 52.6 %
K TREINO TESTE
  1  100.0   53.0
  2   76.6   52.6
  3   77.1   52.7
  4   71.2   52.5
  5   71.1   53.2
  6   68.1   52.7
  7   68.0   53.0
  8   66.4   52.9
  9   66.2   53.4
 10   65.0   53.1
 11   64.8   53.5
 12   63.8   53.2
 13   64.3   53.5
 14   63.6   53.5
 15   63.6   53.9
 16   63.0   53.9
 17   62.9   54.0
 18   62.5   54.1
 19   62.8   53.9
 20   62.5   54.1
 21   62.5   54.2
 22   62.2   53.6
 23   62.1   54.4
 24   61.8   53.8
 25   61.7   54.2
 26   61.6   53.8
 27   61.4   53.7
 28   61.2   53.9
 29   61.1   53.9
 30   61.1   54.2
 31   61.2   54.5
 32   61.0   54.6
 33   61.1   54.5
 34   60.7   54.6
 35   60.9   54.6
 36   60.9   54.3
 37   60.7   54.7
 38   60.9   54.7
 39   60.5   54.8
 40   60.7   54.8


"\nanswer = classifier.predict(df_test)\n\nid = []\nfor i in range(1, len(answer)+1):\n    id.append(str(len(df_train)+i))\n\ndata = {'id_solicitante': id, 'inadimplente': answer}\n\ndf = pd.DataFrame(data)\ndf.to_csv('inadimplente.csv', index=False)\n"