# **Importação dos dados**

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt

In [22]:
credit_df = pd.read_csv('https://raw.githubusercontent.com/mrafaelbatista/uniesp_mba_machine_learning/refs/heads/main/classwork/credit.csv')
credit_df.head()

Unnamed: 0,Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [23]:
credit_df.columns

Index(['Id', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts',
       'Checking account', 'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [24]:
credit_df.Risk.value_counts()

Unnamed: 0_level_0,count
Risk,Unnamed: 1_level_1
good,700
bad,300



### **-- Precisamos preprocessar os dados -- Tarefa de vcs!!**

In [25]:
credit_df['Risk_Num'] = np.where(credit_df['Risk']=='good', 1, 0)
credit_df.Risk_Num.value_counts()

Unnamed: 0_level_0,count
Risk_Num,Unnamed: 1_level_1
1,700
0,300


In [26]:
# Cria classe nova para NAs
credit_df=credit_df.fillna('Null')

credit_df.head()

Unnamed: 0,Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,Risk_Num
0,0,67,male,2,own,Null,little,1169,6,radio/TV,good,1
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0
2,2,49,male,1,own,little,Null,2096,12,education,good,1
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good,1
4,4,53,male,2,free,little,little,4870,24,car,bad,0


In [27]:
# Entendendo as variaveis categoricas
print(credit_df['Job'].value_counts())
print(credit_df['Housing'].value_counts())
print(credit_df['Saving accounts'].value_counts())
print(credit_df['Checking account'].value_counts())
print(credit_df['Purpose'].value_counts())

Job
2    630
1    200
3    148
0     22
Name: count, dtype: int64
Housing
own     713
rent    179
free    108
Name: count, dtype: int64
Saving accounts
little        603
Null          183
moderate      103
quite rich     63
rich           48
Name: count, dtype: int64
Checking account
Null        394
little      274
moderate    269
rich         63
Name: count, dtype: int64
Purpose
car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
repairs                 22
domestic appliances     12
vacation/others         12
Name: count, dtype: int64


In [28]:
# Converte as variaveis categoricas em dummies
credit_df=pd.get_dummies(credit_df)

credit_df.head()

Unnamed: 0,Id,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good
0,0,67,2,1169,6,1,False,True,False,True,...,False,False,False,False,False,True,False,False,False,True
1,1,22,2,5951,48,0,True,False,False,True,...,False,False,False,False,False,True,False,False,True,False
2,2,49,1,2096,12,1,False,True,False,True,...,False,False,False,True,False,False,False,False,False,True
3,3,45,2,7882,42,1,False,True,True,False,...,False,False,False,False,True,False,False,False,False,True
4,4,53,2,4870,24,0,False,True,True,False,...,False,True,False,False,False,False,False,False,True,False


In [29]:
# As variaveis numericas nao sao convertidas, entao
# vamos primeiro criar um df com as dummies dela, depois adicionamos ao nosso df original
dummy_job = pd.get_dummies(credit_df['Job'])
dummy_job = dummy_job.rename(columns={0: "Job 0", 1: "Job 1", 2: "Job 2", 3: "Job 3"})
dummy_job.head()

Unnamed: 0,Job 0,Job 1,Job 2,Job 3
0,False,False,True,False
1,False,False,True,False
2,False,True,False,False
3,False,False,True,False
4,False,False,True,False


In [30]:
# Passa as variaveis Job para nossa base
credit_df["Job_0"] = dummy_job["Job 0"]
credit_df["Job_1"] = dummy_job["Job 1"]
credit_df["Job_2"] = dummy_job["Job 2"]
credit_df["Job_3"] = dummy_job["Job 3"]
credit_df.head()

Unnamed: 0,Id,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good,Job_0,Job_1,Job_2,Job_3
0,0,67,2,1169,6,1,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False
1,1,22,2,5951,48,0,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
2,2,49,1,2096,12,1,False,True,False,True,...,False,False,False,False,False,True,False,True,False,False
3,3,45,2,7882,42,1,False,True,True,False,...,True,False,False,False,False,True,False,False,True,False
4,4,53,2,4870,24,0,False,True,True,False,...,False,False,False,False,True,False,False,False,True,False


## **Segmentando a amostra**

In [31]:
metrics = ['Age','Sex_female','Job_0','Job_1','Job_2','Credit amount','Duration','Housing_free','Housing_own','Saving accounts_Null','Saving accounts_little','Saving accounts_moderate','Saving accounts_quite rich','Checking account_Null','Checking account_little','Checking account_moderate','Purpose_business','Purpose_car','Purpose_domestic appliances','Purpose_education','Purpose_furniture/equipment','Purpose_radio/TV','Purpose_repairs']
x = credit_df[metrics]
x.head()

Unnamed: 0,Age,Sex_female,Job_0,Job_1,Job_2,Credit amount,Duration,Housing_free,Housing_own,Saving accounts_Null,...,Checking account_Null,Checking account_little,Checking account_moderate,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs
0,67,False,False,False,True,1169,6,False,True,True,...,False,True,False,False,False,False,False,False,True,False
1,22,True,False,False,True,5951,48,False,True,False,...,False,False,True,False,False,False,False,False,True,False
2,49,False,False,True,False,2096,12,False,True,False,...,True,False,False,False,False,False,True,False,False,False
3,45,False,False,False,True,7882,42,True,False,False,...,False,True,False,False,False,False,False,True,False,False
4,53,False,False,False,True,4870,24,True,False,False,...,False,True,False,False,True,False,False,False,False,False


In [32]:
outcome = ["Risk_Num"]
y = credit_df[outcome]
y.head()

Unnamed: 0,Risk_Num
0,1
1,0
2,1
3,1
4,0


In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size =0.30, stratify=y)
print(X_train.shape)
print(X_test.shape)

(700, 23)
(300, 23)


In [34]:
credit_df.corr()

Unnamed: 0,Id,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good,Job_0,Job_1,Job_2,Job_3
Id,1.0,-0.010096,-0.027345,0.013488,0.030788,-0.034606,0.001693,-0.001693,-0.012121,-0.013244,...,-0.003846,-0.017483,-0.012564,-0.028569,0.034606,-0.034606,0.025198,0.011986,-0.005826,-0.015989
Age,-0.010096,1.0,0.015673,0.032716,-0.036136,0.091127,-0.161694,0.161694,0.253058,0.006553,...,-0.127657,-0.046401,0.039567,0.042365,-0.091127,0.091127,0.059954,0.043712,-0.148283,0.127605
Job,-0.027345,0.015673,1.0,0.285385,0.21091,-0.032735,-0.070298,0.070298,0.134972,-0.059393,...,0.013422,-0.027683,-0.092751,0.100544,0.032735,-0.032735,-0.437124,-0.691886,0.191751,0.699226
Credit amount,0.013488,0.032716,0.285385,1.0,0.624984,-0.154739,-0.093482,0.093482,0.201643,-0.117497,...,-0.034037,-0.173203,-0.028875,0.192893,0.154739,-0.154739,-0.027969,-0.161757,-0.092636,0.319715
Duration,0.030788,-0.036136,0.21091,0.624984,1.0,-0.214927,-0.081432,0.081432,0.189117,-0.075169,...,-0.062804,-0.044319,-0.022549,0.104516,0.214927,-0.214927,-0.044043,-0.181203,0.05501,0.147515
Risk_Num,-0.034606,0.091127,-0.032735,-0.154739,-0.214927,1.0,-0.075493,0.075493,-0.081556,0.134589,...,-0.020971,0.106922,-0.020828,-0.028058,-1.0,1.0,-0.005951,0.021822,0.013559,-0.040559
Sex_female,0.001693,-0.161694,-0.070298,-0.093482,-0.081432,-0.075493,1.0,-1.0,-0.100872,-0.119638,...,0.100467,-0.008668,-0.026828,-0.014297,0.075493,-0.075493,0.076356,0.010811,0.007613,-0.05407
Sex_male,-0.001693,0.161694,0.070298,0.093482,0.081432,0.075493,-1.0,1.0,0.100872,0.119638,...,-0.100467,0.008668,0.026828,0.014297,-0.075493,0.075493,-0.076356,-0.010811,-0.007613,0.05407
Housing_free,-0.012121,0.253058,0.134972,0.201643,0.189117,-0.081556,-0.100872,0.100872,1.0,-0.548445,...,-0.07153,-0.109357,0.013706,0.08001,0.081556,-0.081556,0.035671,-0.109543,-0.033633,0.154388
Housing_own,-0.013244,0.006553,-0.059393,-0.117497,-0.075169,0.134589,-0.119638,0.119638,-0.548445,1.0,...,-0.040496,0.134705,0.019803,-0.011288,-0.134589,0.134589,-0.04048,0.063003,0.012866,-0.071741


In [35]:
credit_df

Unnamed: 0,Id,Age,Job,Credit amount,Duration,Risk_Num,Sex_female,Sex_male,Housing_free,Housing_own,...,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk_bad,Risk_good,Job_0,Job_1,Job_2,Job_3
0,0,67,2,1169,6,1,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False
1,1,22,2,5951,48,0,True,False,False,True,...,False,True,False,False,True,False,False,False,True,False
2,2,49,1,2096,12,1,False,True,False,True,...,False,False,False,False,False,True,False,True,False,False
3,3,45,2,7882,42,1,False,True,True,False,...,True,False,False,False,False,True,False,False,True,False
4,4,53,2,4870,24,0,False,True,True,False,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,1,1736,12,1,True,False,False,True,...,True,False,False,False,False,True,False,True,False,False
996,996,40,3,3857,30,1,False,True,False,True,...,False,False,False,False,False,True,False,False,False,True
997,997,38,2,804,12,1,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False
998,998,23,2,1845,45,0,False,True,True,False,...,False,True,False,False,True,False,False,False,True,False


### **Regressão Logística**

In [36]:
lr = LogisticRegression(random_state=42, solver='lbfgs')

In [37]:
#treino do modelo
lr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
y_pred = lr.predict(X_test)
y_pred

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0])

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7533333333333333

### **Respondendo às perguntas**

In [40]:
'''
Age
Sex_female
Job_0	Job_1	Job_2
Credit amount
Duration
Housing_free	Housing_own
Saving accounts_Null Saving accounts_little	Saving accounts_moderate	Saving accounts_quite rich
Checking account_Null	Checking account_little	Checking account_moderate
Purpose_business	Purpose_car	Purpose_domestic appliances	Purpose_education	Purpose_furniture/equipment	Purpose_radio/TV	Purpose_repairs


Id,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
1 - Cliente A - Gertrude Rocha
1001, 60, female,2,own,quite rich,NA,2835,24,furniture/equipment,?

2 - Cliente B - Abelardo Jurema
1002, 53, male,1,own,little,NA,2835,36,furniture/equipment,?

3 - Cliente C - Sérgio
1003,20,male,2,free,moderate,moderate,5866,18,car,?

#23 colunas
1- 60,1, 0,0,1, 2835,24, 0,1, 0,0,0,1, 1,0,0, 0,0,0,0,1,0,0
2- 53,0, 0,1,0, 2835,36, 0,1, 0,1,0,0, 1,0,0, 0,0,1,0,0,0,0
3- 20,0, 0,0,1, 5866,18, 1,0, 0,0,1,0, 0,0,1, 0,1,0,0,0,0,0
'''

cliente1 = [60,1, 0,0,1, 2835,24, 0,1, 0,0,0,1, 1,0,0, 0,0,0,0,1,0,0]
cliente2 = [53,0, 0,1,0, 2835,36, 0,1, 0,1,0,0, 1,0,0, 0,0,1,0,0,0,0]
cliente3 = [20,0, 0,0,1, 5866,18, 1,0, 0,0,1,0, 0,0,1, 0,1,0,0,0,0,0]

clientesAnalise = [cliente1, cliente2, cliente3]

respostas = lr.predict(clientesAnalise)
respostas #1: good e 0:bad



array([1, 1, 1])