Infos iniciais do projeto

## Desenvolvimento do projeto

### Carregando as bibliotecas

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Carregando os dados

In [4]:
df = pd.read_csv("train.csv")

### Descrição dos dados

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [6]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [7]:
df.shape

(150000, 11)

In [8]:
df.dtypes

target                                          int64
TaxaDeUtilizacaoDeLinhasNaoGarantidas         float64
Idade                                           int64
NumeroDeVezes30-59DiasAtrasoNaoPior             int64
TaxaDeEndividamento                           float64
RendaMensal                                   float64
NumeroDeLinhasDeCreditoEEmprestimosAbertos      int64
NumeroDeVezes90DiasAtraso                       int64
NumeroDeEmprestimosOuLinhasImobiliarias         int64
NumeroDeVezes60-89DiasAtrasoNaoPior             int64
NumeroDeDependentes                           float64
dtype: object

In [9]:
df.isna().sum()

target                                            0
TaxaDeUtilizacaoDeLinhasNaoGarantidas             0
Idade                                             0
NumeroDeVezes30-59DiasAtrasoNaoPior               0
TaxaDeEndividamento                               0
RendaMensal                                   29731
NumeroDeLinhasDeCreditoEEmprestimosAbertos        0
NumeroDeVezes90DiasAtraso                         0
NumeroDeEmprestimosOuLinhasImobiliarias           0
NumeroDeVezes60-89DiasAtrasoNaoPior               0
NumeroDeDependentes                            3924
dtype: int64

In [10]:
df.describe()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


### Variável target

In [11]:
df.target.value_counts(normalize = True)

target
0    0.93316
1    0.06684
Name: proportion, dtype: float64

## Separação dos dados em treino e validação antes de continuar com qualquer tipo de processamento

In [12]:
X = df.drop('target', axis = 1)
y = df.target

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)

### Tratamento dos dados

In [13]:
from sklearn.impute import SimpleImputer

In [14]:
imp_mean = SimpleImputer(strategy='mean')
cols_to_impute = ['RendaMensal', 'NumeroDeDependentes']
imp_mean.fit(X_train[cols_to_impute])

In [15]:
X_train[cols_to_impute] = imp_mean.transform(X_train[cols_to_impute]) 
X_val[cols_to_impute] = imp_mean.transform(X_val[cols_to_impute])

In [16]:
X_train.isna().sum()

TaxaDeUtilizacaoDeLinhasNaoGarantidas         0
Idade                                         0
NumeroDeVezes30-59DiasAtrasoNaoPior           0
TaxaDeEndividamento                           0
RendaMensal                                   0
NumeroDeLinhasDeCreditoEEmprestimosAbertos    0
NumeroDeVezes90DiasAtraso                     0
NumeroDeEmprestimosOuLinhasImobiliarias       0
NumeroDeVezes60-89DiasAtrasoNaoPior           0
NumeroDeDependentes                           0
dtype: int64

### Feature selection

In [17]:
from sklearn.feature_selection import SelectPercentile

In [19]:
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)

# Transform training set

X_train_selected = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

X_train.shape: (120000, 10)
X_train_selected.shape: (120000, 5)


In [27]:
selected_features = [X_train.columns[i] for i in range(len(X_train_selected))]
selected_features

IndexError: index 10 is out of bounds for axis 0 with size 10

In [29]:
X_train_selected

array([[43.,  0.,  0.,  0.,  1.],
       [73.,  0.,  0.,  0.,  0.],
       [54.,  1.,  0.,  1.,  2.],
       ...,
       [59.,  1.,  0.,  0.,  0.],
       [51.,  0.,  0.,  0.,  0.],
       [61.,  0.,  0.,  0.,  1.]])

### Treinando modelo

#### Baseline

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X_val_selected = select.transform(X_val)