In [1]:
# Importando as bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import IsolationForest

In [2]:
# Para remover os warnings
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [3]:
# Carregando a base de dados

fraud = pd.read_csv("creditcard.csv", sep = ",")
df = fraud.copy()

fraud.shape[0]

284807

In [4]:
# Visualizando a tabela dos dados
fraud.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Estatisticas da base

fraud.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [6]:
# Existem missing values?

fraud.isnull().values.any()

False

In [7]:
# Distribuiçao entre as classes?

print("Percentual de fraudes: {}".format(round(fraud[fraud["Class"] == 1].shape[0]/fraud.shape[0]*100, 3)))
print("Percentual de não fraudes: {}".format(round(fraud[fraud["Class"] == 0].shape[0]/fraud.shape[0]*100, 3)))

Percentual de fraudes: 0.173
Percentual de não fraudes: 99.827


In [8]:
# Estatisticas para fraudes (1) e nao fraudes (0)

fraud_df = pd.DataFrame(fraud[fraud["Class"] == 1]["Amount"].describe())
fraud_df.columns = ['Valores de Fraude']
fraud_df["Valores sem Fraude"] = pd.Series(fraud[fraud["Class"] == 0]["Amount"].describe())
fraud_df

Unnamed: 0,Valores de Fraude,Valores sem Fraude
count,492.0,284315.0
mean,122.211321,88.291022
std,256.683288,250.105092
min,0.0,0.0
25%,1.0,5.65
50%,9.25,22.0
75%,105.89,77.05
max,2125.87,25691.16


In [9]:

# Separando os dados com as variáveis em x e o alvo em y
x = fraud.drop("Class", axis = 1)
y = fraud["Class"]

# Separando entre treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5, random_state = 42)

In [10]:
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00


In [11]:
# Treinando o modelo
iForest = IsolationForest(n_estimators = 200, max_samples = len(x), random_state = 42) #Informando parâmetros do modelo

iForest.fit(x_train) #Comando que treina o Modelo

IsolationForest(max_samples=284807, n_estimators=200, random_state=42)

In [12]:
# Predições para o treinamento e teste

y_pred_train1 = iForest.predict(x_train)
y_pred_test1 = iForest.predict(x_test)

In [13]:
y_pred_test1

array([-1,  1,  1, ...,  1,  1,  1])

In [16]:
# Reajustando a previsão para ficar alinhado com a Class da base

y_pred_train1 = np.where(y_pred_train1 == -1, 1, 0)
y_pred_test1 = np.where(y_pred_test1 == -1, 1, 0)

In [17]:
print(np.count_nonzero(y_pred_train1 == 1))

print(np.count_nonzero(y_train == 1))

331
246


In [18]:
print(np.count_nonzero(y_pred_test1 == 1))

print(np.count_nonzero(y_test == 1))

320
246


In [19]:
print("Base de treinamento")
print("Percentual de fraudes: {}".format(round(accuracy_score(y_train,y_pred_train1)*100, 2)))
print("------------------------------")
print("Base de teste")
print("Percentual de fraudes: {}".format(round(accuracy_score(y_test,y_pred_test1)*100, 2)))

Base de treinamento
Percentual de fraudes: 99.71
------------------------------
Base de teste
Percentual de fraudes: 99.74


In [20]:
# Treinamento

print(confusion_matrix(y_train, y_pred_train1))
print()
print("--------------------------------------------------------------------------------")
print()
print(classification_report(y_train, y_pred_train1))

[[141906    251]
 [   166     80]]

--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    142157
           1       0.24      0.33      0.28       246

    accuracy                           1.00    142403
   macro avg       0.62      0.66      0.64    142403
weighted avg       1.00      1.00      1.00    142403



In [21]:
# Teste

print(confusion_matrix(y_test, y_pred_test1))
print()
print("--------------------------------------------------------------------------------")
print()
print(classification_report(y_test, y_pred_test1))

[[141938    220]
 [   146    100]]

--------------------------------------------------------------------------------

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    142158
           1       0.31      0.41      0.35       246

    accuracy                           1.00    142404
   macro avg       0.66      0.70      0.68    142404
weighted avg       1.00      1.00      1.00    142404



In [22]:

# Essa biblioteca coloca o número já na moeda local, sem que tenhamos que ficar preenchendo a mão
# de acordo com a localização do seu computador

import locale
locale.setlocale( locale.LC_ALL, '' )

'pt_BR.UTF-8'

In [23]:
print("Transação Normal")
print("Valor médio: {}".format(locale.currency(round(fraud[fraud["Class"] == 0]["Amount"].describe()[1], 2))))
print("___________________________")
print("Transação Fraudulenta")
print("Valor médio: {}".format(locale.currency(round(fraud[fraud["Class"] == 1]["Amount"].describe()[1], 2))))

Transação Normal
Valor médio: R$ 88,29
___________________________
Transação Fraudulenta
Valor médio: R$ 122,21


In [24]:
print("Teriamos bloqueado mediamente: {}".format(locale.currency(
    round(fraud[fraud["Class"] == 1]["Amount"].describe()[1]*99, 2), grouping=True)))
print("------------------------------")
print("Ao custo médio de : {}".format(locale.currency(
    round(fraud[fraud["Class"] == 0]["Amount"].describe()[1]*215, 2), grouping=True)))

Teriamos bloqueado mediamente: R$ 12.098,92
------------------------------
Ao custo médio de : R$ 18.982,57


In [25]:
# Vamos olhar o valor real para a base inteira

# aqui usei o mesmo dataset mas como nome de "df", simplesmente para não impactar o dataset original.

df["IF"] = np.where(iForest.predict(df.iloc[:, 0:30]) == -1, 1, 0)

In [26]:
locale.currency(df[df["Class"] == 1]["Amount"].sum())

'R$ 60127,97'

In [27]:
locale.currency(df[df["IF"] == 1]["Amount"].sum())

'R$ 831237,37'

In [28]:
print("Teríamos bloqueado: {}".format(locale.currency(df[(df["IF"] == 1) & (df["Class"] == 1)]["Amount"].sum()
                                                      , grouping=True)))
print()
print("Que representa % do total: {}".format(round(df[(df["IF"] == 1) & (df["Class"] == 1)]["Amount"].sum()/
                                              df[df["Class"] == 1]["Amount"].sum()*100, 2)))
print()
print("Autorizando indevidamente: {}".format(locale.currency(df[(df["IF"] == 0) & (fraud["Class"] == 1)]["Amount"].sum()
                                                    , grouping=True)))
print("------------------------------")
print("Ao custo de: {}".format(locale.currency(df[(df["IF"] == 1) & (df["Class"] == 0)]["Amount"].sum(), grouping=True)))
print("")
print("Que representa % do total: {}".format(round(df[(df["IF"] == 0) & (df["Class"] == 0)]["Amount"].sum()/
                                              df[df["Class"] == 0]["Amount"].sum()*100, 2)))

Teríamos bloqueado: R$ 15.454,34

Que representa % do total: 25.7

Autorizando indevidamente: R$ 44.673,63
------------------------------
Ao custo de: R$ 815.783,03

Que representa % do total: 96.75


In [29]:

# Separando os dados com as variáveis em x e o alvo em y
x = df.drop(["Class","IF"], axis = 1)
y = df["Class"]

# Separando entre treinamento e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5, random_state = 42)

In [30]:
# Treinando o modelo

iForest2 = IsolationForest(n_estimators = 1, max_samples = 100000, random_state = 42)

iForest2.fit(x_train)

IsolationForest(max_samples=100000, n_estimators=1, random_state=42)

In [31]:
# Prediçoes para o treinamento e teste

y_pred_train2 = iForest2.predict(x_train)
y_pred_test2 = iForest2.predict(x_test)

# Returns -1 for outliers and 1 for inliers.

In [32]:
# Returns -1 for outliers and 1 for inliers.

# Reajustando a previsao para ficar alinhado com a Class da base

y_pred_train2 = np.where(y_pred_train2 == -1, 1, 0)
y_pred_test2 = np.where(y_pred_test2 == -1, 1, 0)

In [33]:
print("Base de treinamento")
print("Percentual de fraudes: {}".format(round(accuracy_score(y_train,y_pred_train2)*100, 2)))
print("------------------------------")
print("Base de teste")
print("Percentual de fraudes: {}".format(round(accuracy_score(y_test,y_pred_test2)*100, 2)))

Base de treinamento
Percentual de fraudes: 99.48
------------------------------
Base de teste
Percentual de fraudes: 99.48


In [34]:
# Mas qual o valor real?

df["IF2"] = np.where(iForest2.predict(df.iloc[:, 0:30]) == -1, 1, 0)

In [35]:
print("Teriamos bloqueado: {}".format(locale.currency(df[(df["IF2"] == 1) & (df["Class"] == 1)]["Amount"].sum()
                                                      , grouping=True)))
print()
print("% do total: {}".format(round(df[(df["IF2"] == 1) & (df["Class"] == 1)]["Amount"].sum()/
                                              df[df["Class"] == 1]["Amount"].sum()*100, 2)))
print()
print("Deixando passar : {}".format(locale.currency(df[(df["IF2"] == 0) & (df["Class"] == 1)]["Amount"].sum()
                                                    , grouping=True)))
print("------------------------------")
print("Ao custo de : {}".format(locale.currency(df[(df["IF2"] == 1) & (df["Class"] == 0)]["Amount"].sum(), grouping=True)))
print("")
print("% do total ainda aceito: {}".format(round(df[(df["IF2"] == 0) & (df["Class"] == 0)]["Amount"].sum()/
                                              df[df["Class"] == 0]["Amount"].sum()*100, 2)))

Teriamos bloqueado: R$ 16.076,07

% do total: 26.74

Deixando passar : R$ 44.051,90
------------------------------
Ao custo de : R$ 681.367,22

% do total ainda aceito: 97.29
