# IEEE Fraud Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1 - Carregando os datasets

In [2]:
df_train_id = pd.read_csv("data/train_identity.csv")
df_train_tr = pd.read_csv("data/train_transaction.csv")
df_test_id = pd.read_csv("data/test_identity.csv")
df_test_tr = pd.read_csv("data/train_transaction.csv")

In [3]:
df_train_id.shape, df_train_tr.shape, df_test_id.shape, df_test_tr.shape

((144233, 41), (590540, 394), (141907, 41), (590540, 394))

In [32]:
df_train = pd.merge(df_train_tr, df_train_id, on='TransactionID', how='left')
df_test = pd.merge(df_test_tr, df_test_id, on='TransactionID', how='left')

In [33]:
# Retirando a coluna isFraud para podermos fazer a predição desta
df_test.drop(columns=['isFraud'], axis=1, inplace=True)

In [34]:
print(df_train.shape)
print(df_test.shape)

(590540, 434)
(590540, 433)


In [35]:
df_train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [36]:
df_test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
0,2987000,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
1,2987001,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2,2987002,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
3,2987003,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
4,2987004,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,


In [37]:
# Variável alvo
target = df_train['isFraud'].value_counts()
target

0    569877
1     20663
Name: isFraud, dtype: int64

## 2 - Análise Exploratória

#### Avaliando o tipo de aparelho em que a transação foi realizada

In [28]:
devicetype_freq = pd.DataFrame.from_dict(df_train['DeviceType'].value_counts(normalize=True, dropna=False))

devicetype_freq.rename(columns={'DeviceType': 'Freq'}, inplace=True)
devicetype_freq

Unnamed: 0,Freq
,0.761557
desktop,0.144215
mobile,0.094227


#### Avaliando o sistema operacional em que a transação foi realizada

In [29]:
dev_freq = pd.DataFrame.from_dict(df_train['DeviceInfo'].value_counts(normalize=True, dropna=False))

dev_freq.rename(columns={'DeviceInfo': 'Freq'}, inplace=True)
dev_freq = dev_freq[dev_freq['Freq'] >= 0.1/100]

dev_freq

Unnamed: 0,Freq
,0.799055
Windows,0.080811
iOS Device,0.033498
MacOS,0.021291
Trident/7.0,0.012599
rv:11.0,0.003219
rv:57.0,0.001629


- Aproximadamente $ 80$% dos dados são nulos

#### Avaliando a bandeira do cartão de crédito

In [30]:
cc_freq = pd.DataFrame.from_dict(df_train['card4'].value_counts(normalize=True, dropna=False))

cc_freq.rename(columns={'card4': 'Freq'}, inplace=True)
cc_freq 

Unnamed: 0,Freq
visa,0.651551
mastercard,0.320414
american express,0.014102
discover,0.011263
,0.00267


In [31]:
df_train[['card4', 'isFraud']].groupby(['card4'], as_index=True).mean()

Unnamed: 0_level_0,isFraud
card4,Unnamed: 1_level_1
american express,0.028698
discover,0.077282
mastercard,0.034331
visa,0.034756


Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,V331,V332,V333,V334,V335,V336,V337,V338,V339,TransacionDT
0,2987004,0.0,70787.0,,,,,,,,...,,,,,,,,,,"(2986999.999, 3090506.4]"
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,,,,,,,,,,"(2986999.999, 3090506.4]"
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,,,,,,,,,,"(2986999.999, 3090506.4]"
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,,,,,,,,,,"(2986999.999, 3090506.4]"
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,,,,,,,,,,"(2986999.999, 3090506.4]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734768,3577535,,,,,,,,,,...,,,,,,,,,,"(3455174.6, 3577539.0]"
734769,3577536,,,,,,,,,,...,,,,,,,,,,"(3455174.6, 3577539.0]"
734770,3577537,,,,,,,,,,...,,,,,,,,,,"(3455174.6, 3577539.0]"
734771,3577538,,,,,,,,,,...,,,,,,,,,,"(3455174.6, 3577539.0]"
