# IEEE FRAUD NOTEBOOK

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
import gc

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Show every column when a wide frame is displayed. The fully qualified key is
# used because the abbreviated 'max_columns' form relies on pattern matching
# and raises OptionError on pandas versions where it matches several options.
pd.set_option('display.max_columns', None)

In [2]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Integer columns are cast to the narrowest signed integer type that holds
    their observed min/max, float columns to the narrowest float type whose
    range covers their min/max, and object/string columns to ``category``.
    Columns of any other dtype (bool, datetime, category, pandas extension
    types, ...) are left untouched, so the function can safely be applied to
    a frame it has already processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow ``float16`` as a target. Only the value *range* is checked, not
        the precision, so pass ``False`` when downstream code needs more
        significant digits than half precision keeps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to test; everything else is skipped instead of being forced
        # through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Bounds are inclusive: a value equal to the type limit still fits.
            # If no candidate fits (or the column is empty) it is left as is.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of range for the narrower types, or min/max is NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

def import_data(file):
    """Read a CSV into a DataFrame and shrink it with ``reduce_mem_usage``.

    Parameters
    ----------
    file : str or path-like
        CSV file to read. It must contain a ``TransactionID`` column, which
        becomes the index of the returned frame.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after downcasting.
    """
    # ``keep_date_col`` only applied when ``parse_dates`` was asked to combine
    # several columns, which this call never does, and pandas has deprecated
    # the keyword; it is therefore no longer passed.
    df = pd.read_csv(file, parse_dates=True, index_col='TransactionID')
    return reduce_mem_usage(df)

In [3]:
%%time

# Read the four CSV files. Each import_data call prints the memory footprint
# of its frame before and after downcasting.
print('Loading data...')

train_identity = import_data('train_identity.csv')
print('\tSuccessfully loaded train_identity!')

train_transaction = import_data('train_transaction.csv')
print('\tSuccessfully loaded train_transaction!')

test_identity = import_data('test_identity.csv')
print('\tSuccessfully loaded test_identity!')

test_transaction = import_data('test_transaction.csv')
print('\tSuccessfully loaded test_transaction!')


# Give the test identity frame the training identity column names, matched
# purely by position.
# NOTE(review): assumes both files list the same columns in the same order —
# confirm before relying on it.
test_identity.columns = train_identity.columns

print('Data was successfully loaded!\n')

Loading data...
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.57 MB
Decreased by 76.6%
	Successfully loaded train_identity!
Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 489.41 MB
Decreased by 72.4%
	Successfully loaded train_transaction!
Memory usage of dataframe is 44.39 MB
Memory usage after optimization is: 10.40 MB
Decreased by 76.6%
	Successfully loaded test_identity!
Memory usage of dataframe is 1519.24 MB
Memory usage after optimization is: 427.17 MB
Decreased by 71.9%
	Successfully loaded test_transaction!
Data was successfully loaded!

Wall time: 9min 53s


In [4]:
print('Merging data...')
# Left-join each identity table onto its transaction table through the index,
# so transactions without an identity row are kept.
train = train_transaction.merge(train_identity, left_index=True, right_index=True, how='left')
test = test_transaction.merge(test_identity, left_index=True, right_index=True, how='left')

# Target: the "isFraud" column of the merged training frame.
y_train = train["isFraud"]
# Predictors: the merged training frame without the target column.
df_train = train.drop(columns="isFraud")

print('Data was successfully merged!\n')

# The four source tables are not referenced again below.
del train_identity, train_transaction, test_identity, test_transaction

n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print(f'Train dataset has {n_train_rows} rows and {n_train_cols} columns.')
print(f'Test dataset has {n_test_rows} rows and {n_test_cols} columns.')

gc.collect()

Merging data...
Data was successfully merged!

Train dataset has 590540 rows and 433 columns.
Test dataset has 506691 rows and 432 columns.


53

### NOT: LÜTFEN YUKARIDAKİ HİÇBİR DEĞERİ VE DEĞİŞKENİ DEĞİŞTİRMEYİN.

Örneğin train veri setinde değişiklik yapacaksanız, aşağıdaki gibi bir kopyasını alarak yapınız.
df_train_copya = df_train.copy()


# 1. Ozan


# 2. Ümit

# 3. Muhammet(Alm)

# 4. Ismail

### 1-) (card1 - card4 - addr1) ile (TransactionAmt - id_02 - D15) arasinda uretilen degiskenler

- https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm

In [None]:
columns_a = ['TransactionAmt', 'id_02', 'D15']
columns_b = ['card1', 'card4', 'addr1']

# Every (value column, grouping column) pair, value column varying slowest.
column_pairs = [(value_col, group_col) for value_col in columns_a for group_col in columns_b]

for col_a, col_b in column_pairs:
    for df in [train, test]:
        # Ratio of each row's value to the mean / std of its group.
        values_by_group = df.groupby([col_b])[col_a]
        df[f'{col_a}_to_mean_{col_b}'] = df[col_a] / values_by_group.transform('mean')
        df[f'{col_a}_to_std_{col_b}'] = df[col_a] / values_by_group.transform('std')

#### card1 - TransactionAmt

- card1 degiskeni onemli bir degisken, kullanici belirlemede ise yarayabilecek bir degisken, burada yapilan her bir harcamanin (transactionID) card1 bilgisi var ama bu card ile baska harcamalarda yapilmis, bu card ile yapilan toplam harcamalarin ortalamasini (std icin de yapiyor) belirliyor ve sadece bu harcamadaki miktar ile oranina bakiyor,

- mesela x card, 3 farkli islemde kullanilmis ve her islemde 5, 10, 15 para birimlik harcama yapilmis, sonucta x card islem basina harcama ortalamasi 10

- 1.islemde gercek harcama 5/ ortalama harcama 10
- 2.islemde 10/10
- 3.islemde 15/10 

- bu sekilde her islemdeki card'in o islemdeki harcamasinin toplam card harcama ortalamasina oranini tespit ediyoruz

- bu islem hem MEAN hem de STD icin ayri ayri yapiliyor

In [None]:
# Preview: transaction amount relative to the mean amount of its card1 group.
train['TransactionAmt'].div(train.groupby('card1')['TransactionAmt'].transform('mean')).head(5)

#### card4 - TransactionAmt

- card4 kartin visa, mastercard, debit vs... olup olmadigini bilgisini tutuyor, burada da ayni yukarida oldugu gibi, her bir islemdeki tutarin genel ortalamaya oranini veriyor

In [None]:
# Preview: transaction amount relative to the mean amount of its card4 group.
train['TransactionAmt'].div(train.groupby('card4')['TransactionAmt'].transform('mean')).head(5)

#### addr1 - TransactionAmt
- addr1 degiskeni zipcode bilgisi veriyor, yani ayni zipcode'dan yapilan islemlerin tamamindaki miktarin ortalamasi ile ayni zipcode'dan yapilan her bir islemin orani hesaplanmis

In [None]:
# Preview: transaction amount relative to the mean amount of its addr1 group.
train['TransactionAmt'].div(train.groupby('addr1')['TransactionAmt'].transform('mean')).head(5)

#### card1 - id_02

- card1'i grupluyor ve id_02 degerlerinin mean ve std'sini cikartiyor, 
- id_02 degeri 1'den baslayip yaklasik 1 milyona kadar artan, ortalamasi 175000 civari olan bir sayisal deger, ne ifade ettigini anlayamadim
- id_02'yi anlayamadigim icin buradaki feature uretmenin de mantigini cozemedim

In [None]:
# Preview: id_02 relative to the mean id_02 of its card1 group (first 30 rows).
train['id_02'].div(train.groupby('card1')['id_02'].transform('mean')).head(30)

#### card4 - id_02  - addr1 - id_02
- benzer sekilde id_02 degiskeni ile card4 ve addr1 arasindaki iliskiyi de anlayamadim

#### (card1 - D15), (card4 - D15), (addr1 - D15)
- D15 degiskeni hakkinda bir kanaat olusmadigindan bu yeni feature olusturmanin mantigi da kavranamadi.

### 2-) TransactionAmt'nin LOGARITMASI
- gerek var mi?

In [None]:
# Natural logarithm of the transaction amount, added to both frames.
for frame in (train, test):
    frame['TransactionAmt_Log'] = np.log(frame['TransactionAmt'])

### 3-) TransactionAmt'nin virgulden sonraki rakamlari
- 5 ve 10'nun kati rakamlarin dolar cinsinden, digerlerinin ise farkli para birimi cinsinden olabilecegine dair uretilmis bir degisken

In [None]:
# Fractional part of the amount, expressed in thousandths.
# The product is rounded before the integer cast: binary floating point
# stores e.g. 0.29 slightly below its decimal value, so a bare cast would
# truncate 289.999... to 289 instead of 290.
for frame in (train, test):
    amount = frame['TransactionAmt'].astype('float64')
    fraction = amount - np.trunc(amount)
    frame['TransactionAmt_decimal'] = (fraction * 1000).round().astype(int)

### 4-) Zaman degiskeni TransactionDT kullanilarak islemin haftanin hangi gunu ve gunun hangi saati yapildigini tespit eden yeni degisken
- zaman degiskeni varken buna gerek var mi?

In [None]:
# Day-of-week index (0-6) derived from TransactionDT, treated as a count of seconds.
seconds_per_day = 3600 * 24
for frame in (train, test):
    days_elapsed = frame['TransactionDT'] / seconds_per_day
    frame['Transaction_day_of_week'] = np.floor((days_elapsed - 1) % 7)

- gunun hangi saati gerceklestigini belirten yeni degisken

In [None]:
# Hour of day (0-23) derived from TransactionDT, treated as a count of seconds.
for frame in (train, test):
    hours_elapsed = np.floor(frame['TransactionDT'] / 3600)
    frame['Transaction_hour'] = hours_elapsed % 24

### 5-) Some arbitrary features interaction olarak belirtilmis yani rasgele, farkli degiskenler arasinda string formata cevirerek birlestirme islemi
- kullaniciyi belirlemeye yonelik isimize yarayacagini dusundugumuz degiskenler arasinda bu sekilde bir iliski kurmak mantikli olur mu? Mesela device bilgisi 'samsung' ile email domain bilgisini birlestirimek kisinin tespitine yonelik fayda saglar mi? Ayni anda hem device hem de domain bilgisi ayni olanlari filtrelemek gibi birsey oluyor aslinda, 
- card, device, addr degiskenleri ile yapilanlar mantikli olabilir mi?

In [8]:
# Pairwise interaction features: "<left column>__<right column>".
interaction_features = [
    'id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2',
    'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1',
]

for feature in interaction_features:
    f1, f2 = feature.split('__')

    # Join the two source columns as text: "<value of f1>_<value of f2>".
    for frame in (train, test):
        frame[feature] = frame[f1].astype(str) + '_' + frame[f2].astype(str)

    # Fit one encoder on the train and test values together so that both
    # frames share the same integer codes.
    train_labels = list(train[feature].astype(str).values)
    test_labels = list(test[feature].astype(str).values)
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    train[feature] = le.transform(train_labels)
    test[feature] = le.transform(test_labels)

### 6-) Encoding - count encoding for both train and test
- ornek verelim, 2987000 ID'nin card1 degeri = 13926
- train setinde card1=13926'dan 43 tane var, test setinde de 13 tane var...
- yeni 43+13 degerini hesaplayip card1=13926 olan butun ID'lerin oldugu satira bu degiskeni 56 olarak ekliyor
- gereksiz bir degisken gibi

In [None]:
# Count encoding: how often each value occurs in train and test combined
# (missing values are counted as well).
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'id_36']:
    combined_values = pd.concat([train[feature], test[feature]], ignore_index=True)
    combined_counts = combined_values.value_counts(dropna=False)
    train[feature + '_count_full'] = train[feature].map(combined_counts)
    test[feature + '_count_full'] = test[feature].map(combined_counts)

### 7-) Hangi ulkeden baglanti yapildigina dair ise yarayabilecek feature, _bin seklinde olana gerek olmayabilir, digeri mantikli gibi

In [30]:
# Lookup table from an e-mail domain to a coarse provider bucket. It is applied
# with Series.map below to build the '<column>_bin' features, so any domain
# that is not a key here ends up missing in those features.
# NOTE(review): some assignments (e.g. 'verizon.net' and 'frontier.com' ->
# 'yahoo') are not obvious from the domain name — confirm they are intended.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 
          'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 
          'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 
          'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 
          'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 
          'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 
          'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

# Last dot-separated segments that the '<column>_suffix' feature relabels as
# 'us'. A bare 'gmail' value contains no dot, so it is its own last segment.
us_emails = ['gmail', 'net', 'edu']

In [31]:
def _last_domain_segment(value):
    """Return the text after the final '.' of ``value`` (all of it if there is no dot)."""
    return str(value).split('.')[-1]


def _relabel_us_suffix(suffix):
    """Return 'us' for suffixes listed in ``us_emails``; otherwise return ``suffix`` unchanged."""
    return 'us' if str(suffix) in us_emails else suffix


for c in ['P_emaildomain', 'R_emaildomain']:
    # Provider bucket looked up in the ``emails`` table.
    for frame in (train, test):
        frame[c + '_bin'] = frame[c].map(emails)

    # Last segment of the domain ...
    for frame in (train, test):
        frame[c + '_suffix'] = frame[c].map(_last_domain_segment)

    # ... with the segments listed in ``us_emails`` merged into 'us'.
    for frame in (train, test):
        frame[c + '_suffix'] = frame[c + '_suffix'].map(_relabel_us_suffix)

In [32]:
# Frequency of each purchaser e-mail suffix in the training frame.
train.loc[:, 'P_emaildomain_suffix'].value_counts()

com    466477
us      25038
mx       2499
es        877
de        506
fr        494
uk        161
jp         32
Name: P_emaildomain_suffix, dtype: int64

In [33]:
# Frequency of each purchaser e-mail provider bucket in the training frame.
train.loc[:, 'P_emaildomain_bin'].value_counts()

google         228851
yahoo          109149
microsoft       59477
other           52868
aol             28604
apple            8225
att              7210
spectrum         1046
centurylink       654
Name: P_emaildomain_bin, dtype: int64

### 8-) TransactionAMT 
- https://www.kaggle.com/kabure/almost-complete-feature-engineering-ieee-data

- amt ile ilgili yukarida da benzer islemler yapildi ancak burada direk amt'nin degerinin ortalama degerinden farki ve std degerine bolunmesi
- bana cok anlamli gelmeyen degiskenler

In [None]:
# Centre the amount on the frame-wide mean, then scale that difference by the
# frame-wide standard deviation. Each frame uses its own statistics.
for frame in (train, test):
    amount = frame['TransactionAmt']
    frame['Trans_min_mean'] = amount - amount.mean()
    frame['Trans_min_std'] = frame['Trans_min_mean'] / amount.std()

### 9-) card'lar ile ilgili yapilanlar
- https://www.kaggle.com/kabure/almost-complete-feature-engineering-ieee-data

In [37]:
import datetime

# Reference date that TransactionDT offsets are added to.
# NOTE(review): the choice of 2017-12-01 is an assumption of this notebook —
# confirm it against the data description.
START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")

# TransactionDT is interpreted as seconds elapsed since START_DATE. The
# conversion is vectorised with pd.to_timedelta rather than a Python-level
# apply over every row.
for frame in (train, test):
    frame["Date"] = pd.Timestamp(startdate) + pd.to_timedelta(frame['TransactionDT'], unit='s')
    frame['_Weekdays'] = frame['Date'].dt.dayofweek
    frame['_Hours'] = frame['Date'].dt.hour
    frame['_Days'] = frame['Date'].dt.day

- card (1,2,3,5) degiskenleri birlestirerek yeni bir degisken uretiyor,
- mantikli olabilir cunku bunlari birlestirip kisilere ulasmak mumkun, bir nevi filtreleme yapip ayni degere sahip olanlari bulmak gibi dusunulebilir

In [38]:
def corret_card_id(x):
    """Normalise a space-separated card identifier string.

    Each space-separated token is cleaned on its own: a trailing ``'.0'`` is
    dropped and a token equal to ``'-999'`` becomes ``'nan'``. Working per
    token (instead of replacing substrings anywhere in the string) keeps
    values such as ``'2.05'`` or ``'-9990'`` intact.

    Parameters
    ----------
    x : str
        Identifier made of tokens joined by single spaces.

    Returns
    -------
    str
        The identifier with every token normalised; spacing is preserved.
    """
    cleaned_tokens = []
    for token in x.split(' '):
        if token.endswith('.0'):
            token = token[:-2]
        if token == '-999':
            token = 'nan'
        cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)

# Build Card_ID by joining the text form of several card columns with spaces.
cards_cols = ['card1', 'card2', 'card3', 'card5']
for card in cards_cols:
    # A column whose name contains '1' (re)starts the identifier; every other
    # column is appended to it.
    starts_identifier = '1' in card
    for frame in (train, test):
        card_text = frame[card].map(str)
        if starts_identifier:
            frame['Card_ID'] = card_text
        else:
            frame['Card_ID'] += ' ' + card_text

# Order each frame by card identifier, then by transaction date.
train = train.sort_values(by=['Card_ID', 'Date'])
test = test.sort_values(by=['Card_ID', 'Date'])

# Clean up the identifier text after sorting.
train['Card_ID'] = train['Card_ID'].map(corret_card_id)
test['Card_ID'] = test['Card_ID'].map(corret_card_id)

In [47]:
# Five most frequent card identifiers in the training frame.
train.loc[:, 'Card_ID'].value_counts().head(5)

9500 321 150 226     14112
15885 545 185 138    10332
17188 321 150 226    10312
7919 194 150 166      8844
15066 170 150 102     7918
Name: Card_ID, dtype: int64

- yeni turetilen Card_ID degiskenine gore groupby islemleri yapiyor, benzerini ust tarafta ayri ayri yapmistik aslinda, rolling fonksiyonunun ne yaptigini tam olarak anlayamadim

In [None]:
# Rolling statistics of TransactionAmt within each Card_ID, over a window of
# up to ROLLING_WINDOW rows (a single row is enough to produce a value).
# Rows are taken in the frame's current order.
ROLLING_WINDOW = 10


def _rolling_amount(grouped_amount, stat):
    """Apply the rolling statistic named ``stat`` to every group of ``grouped_amount``."""
    return grouped_amount.transform(
        lambda x: getattr(x.rolling(window=ROLLING_WINDOW, min_periods=1), stat)()
    )


for frame in (train, test):
    amount_by_card = frame.groupby('Card_ID')['TransactionAmt']

    # Deviation of the amount from the rolling mean, and the same deviation
    # scaled by the rolling standard deviation.
    frame['mean_last'] = frame['TransactionAmt'] - _rolling_amount(amount_by_card, 'mean')
    frame['min_last'] = _rolling_amount(amount_by_card, 'min')
    frame['max_last'] = _rolling_amount(amount_by_card, 'max')
    frame['std_last'] = frame['mean_last'] / _rolling_amount(amount_by_card, 'std')

    # Missing values become 0. The result is assigned back to the column:
    # ``frame[col].fillna(0, inplace=True)`` acts on a temporary Series and
    # leaves the frame unchanged when pandas Copy-on-Write is active.
    frame['mean_last'] = frame['mean_last'].fillna(0)
    frame['std_last'] = frame['std_last'].fillna(0)

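- ayni islemleri bu kez rolling fonksiyonu kullanmadan yapiyor. Fark: rolling(10, 1) her islem icin o kartin (tarihe gore sirali) o isleme kadarki en fazla son 10 islemini kullanir; transform('mean') ise o kartin butun islemlerini kullanir.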

In [51]:
for frame in (train, test):
    amount = frame['TransactionAmt']

    # Deviation from the mean amount of the same Card_ID, and that deviation
    # scaled by the Card_ID standard deviation.
    amount_by_card_id = frame.groupby(['Card_ID'])['TransactionAmt']
    frame['TransactionAmt_to_mean_card_id'] = amount - amount_by_card_id.transform('mean')
    frame['TransactionAmt_to_std_card_id'] = frame['TransactionAmt_to_mean_card_id'] / amount_by_card_id.transform('std')

    # Ratio of the amount to the mean / std amount of its card1 and card4 groups.
    amount_by_card1 = frame.groupby(['card1'])['TransactionAmt']
    amount_by_card4 = frame.groupby(['card4'])['TransactionAmt']
    frame['TransactionAmt_to_mean_card1'] = amount / amount_by_card1.transform('mean')
    frame['TransactionAmt_to_mean_card4'] = amount / amount_by_card4.transform('mean')
    frame['TransactionAmt_to_std_card1'] = amount / amount_by_card1.transform('std')
    frame['TransactionAmt_to_std_card4'] = amount / amount_by_card4.transform('std')

In [48]:
# First rows of the rolling-mean deviation feature.
train.loc[:, 'mean_last'].head(5)

TransactionID
3230924     0.000000
3169988     0.000000
3328484     0.000000
3337343   -14.320312
3337365    -4.000000
Name: mean_last, dtype: float16

In [52]:
# First rows of the per-Card_ID mean deviation feature.
train.loc[:, 'TransactionAmt_to_mean_card_id'].head(5)

TransactionID
3230924     0.000000
3169988     0.000000
3328484    17.312500
3337343   -11.335938
3337365    -3.000000
Name: TransactionAmt_to_mean_card_id, dtype: float16

- card1 ve card2'nin ilk rakami ve ilk iki rakami. Normalde kart numaralarinda ilk rakamlar kart tipini belirtir, ama buradaki numaralar buna uymuyor ve kart tipi bilgisi zaten baska bir degiskende var; bu nedenle bu gereksiz bir uretim.

In [54]:
# Leading character(s) of the text form of card1, read back as numbers.
for frame in (train, test):
    card1_text = frame['card1'].astype(str)
    frame['first_value_card1'] = card1_text.str[0:1].astype(float)
    frame['two_value_card1'] = card1_text.str[0:2].astype(float)

# Same for card2. Note that card2 itself is overwritten here: its missing
# values are replaced by 0 before the text conversion.
for frame in (train, test):
    frame['card2'] = frame['card2'].fillna(0)
    card2_text = frame['card2'].astype(str)
    frame['first_value_card2'] = card2_text.str[0:1].astype(float)
    frame['two_value_card2'] = card2_text.str[0:2].astype(float)

## 10-) addr degiskenleri ile ilgili
- addr1'i zipcode, addr2'yi ulke kodu gibi dusundugumuzde ikisi arasinda toplama-cikarma islemi yapilmasinin bir mantigi olmadigini dusunuyorum
- addr1 yani zipcode degerlerinin ilk rakaminin ve ilk iki rakaminin cikartilmasinin da gerekli olmadigini dusunuyorum

In [None]:
# Features derived from addr1 / addr2.
# They are written to ``train`` and ``test`` like every other feature in this
# section; the previous version referenced ``df_test``, which is never
# defined in this notebook.
for frame in (train, test):
    frame['diff_adrr'] = frame.addr1 - frame.addr2
    frame['diff_adrr_plus'] = frame.addr1 + frame.addr2

    # Leading character(s) of the text form of addr1, read back as numbers.
    # errors='coerce' turns unparsable text (such as the start of 'nan' for a
    # missing addr1) into NaN instead of raising.
    addr1_text = frame['addr1'].astype(str)
    frame['first_value_addr1'] = pd.to_numeric(addr1_text.str[0:1], errors='coerce').astype(float)
    frame['two_value_addr1'] = pd.to_numeric(addr1_text.str[0:2], errors='coerce').astype(float)

# 5. Berkan

# 6. Muhammet (Nor)

# LIGHT GBM

# XGBOOST  MODELI