# IEEE FRAUD NOTEBOOK

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
import gc

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Show every column when displaying wide frames. The fully-qualified key is
# used because the bare 'max_columns' pattern is ambiguous on pandas versions
# that also register 'styler.render.max_columns' (set_option then raises).
pd.set_option('display.max_columns', None)

In [2]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Rules applied per column:

    * object / pandas string columns are converted to ``category``;
    * signed and unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds inclusive);
    * NumPy float columns are cast to ``float16`` (if ``use_float16``) or
      ``float32`` when min and max fit; otherwise they are left unchanged;
    * every other dtype (bool, datetime, category, nullable extension
      types, ...) is left untouched, so calling the function again on its
      own output is safe.

    A column is never widened: only candidates narrower than the current
    dtype are considered.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow ``float16`` as a target. ``float16`` keeps only about three
        significant decimal digits, so pass ``False`` when that precision
        loss matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a meaningful numeric
        # range to test; anything else would break on min()/comparison or be
        # silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
        else:
            candidates, limits = int_candidates[col_type.kind], np.iinfo

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break  # no narrower type left to try
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # which leaves the column as it is.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:  # avoid dividing by zero for a frame that reports no memory
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
%%time

def _read_and_report(name, **read_kwargs):
    """Read ``<name>.csv`` from the working directory and print a confirmation line."""
    frame = pd.read_csv(f'{name}.csv', **read_kwargs)
    print(f'\tSuccessfully loaded {name}!')
    return frame


print('Loading data...')

# The identity and transaction tables are indexed by TransactionID so they
# can later be joined on that index.
train_identity = _read_and_report('train_identity', index_col='TransactionID')
train_transaction = _read_and_report('train_transaction', index_col='TransactionID')
test_identity = _read_and_report('test_identity', index_col='TransactionID')
test_transaction = _read_and_report('test_transaction', index_col='TransactionID')

# The sample submission keeps its default integer index.
sub = _read_and_report('sample_submission')

# Force the test identity table to use the train identity column names.
# NOTE(review): this assumes both files list the same columns in the same
# order - confirm against the data files.
test_identity.columns = train_identity.columns

print('Data was successfully loaded!\n')

Loading data...
	Successfully loaded train_identity!
	Successfully loaded train_transaction!
	Successfully loaded test_identity!
	Successfully loaded test_transaction!
	Successfully loaded sample_submission!
Data was successfully loaded!

Wall time: 1min 23s


In [4]:
print('Merging data...')
# Attach the identity columns to every transaction row; transactions with no
# identity record are kept (left join on the shared index).
train = train_transaction.merge(train_identity, left_index=True, right_index=True, how='left')
test = test_transaction.merge(test_identity, left_index=True, right_index=True, how='left')

# Dependent variable: "isFraud" is assigned to y_train
y_train = train['isFraud']
# Independent variables of the training set
df_train = train.drop(columns='isFraud')

print('Data was successfully merged!\n')

# The source tables are no longer needed once merged; drop the references so
# the collector call at the end of the cell can reclaim them.
del train_identity
del train_transaction
del test_identity
del test_transaction

print('Train dataset has {} rows and {} columns.'.format(*train.shape))
print('Test dataset has {} rows and {} columns.'.format(*test.shape))

gc.collect()

Merging data...
Data was successfully merged!

Train dataset has 590540 rows and 433 columns.
Test dataset has 506691 rows and 432 columns.


82

### NOT: LÜTFEN YUKARIDAKİ HİÇBİR DEĞERİ VE DEĞİŞKENİ DEĞİŞTİRMEYİN.

Örneğin train verisinde değişiklik yapacaksanız aşağıdaki gibi kopyasını alarak yapınız:

`df_train_copya = df_train.copy()`


# 1. Ozan


# 2. Ümit

# 3. Muhammet(Alm)

# 4. Ismail

### 1-) (card1 - card4 - addr1) ile (TransactionAmt - id_02 - D15) arasinda uretilen degiskenler

- https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm

In [None]:
columns_a = ['TransactionAmt', 'id_02', 'D15']
columns_b = ['card1', 'card4', 'addr1']


def add_group_ratio_features(df, value_cols, group_cols):
    """Add "value relative to its group" features to ``df`` in place.

    For every (value column, group column) pair two columns are created:

    * ``{value}_to_mean_{group}`` - value divided by the mean of that value
      over all rows sharing the same group key;
    * ``{value}_to_std_{group}``  - value divided by the standard deviation
      of that value over the same rows.

    Ratios that come out as +/-inf (a group statistic of exactly zero, e.g.
    a group whose values are all identical has std 0) are stored as NaN, the
    same way other degenerate groups already end up (a one-row group has a
    NaN std). This keeps infinities out of the feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the value and the group columns; modified in place.
    value_cols : iterable of str
        Numeric columns to normalise.
    group_cols : iterable of str
        Columns whose values define the groups.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns appended.
    """
    for value_col in value_cols:
        for group_col in group_cols:
            # Build the group-by once and reuse it for both statistics.
            grouped = df.groupby(group_col)[value_col]
            for stat in ('mean', 'std'):
                ratio = df[value_col] / grouped.transform(stat)
                df[f'{value_col}_to_{stat}_{group_col}'] = ratio.replace([np.inf, -np.inf], np.nan)
    return df


# Statistics are computed separately inside each dataset (train statistics
# for train rows, test statistics for test rows).
for df in [train, test]:
    add_group_ratio_features(df, columns_a, columns_b)

#### card1 - TransactionAmt

- card1 önemli bir değişken; kullanıcıyı belirlemede işe yarayabilir. Her bir harcamanın (TransactionID) bir card1 bilgisi var, ancak aynı kart ile başka harcamalar da yapılmış. Burada bu kart ile yapılan tüm harcamaların ortalaması hesaplanıyor (aynı işlem std için de yapılıyor) ve ilgili harcamadaki tutarın bu ortalamaya oranına bakılıyor.

- Mesela x kartı 3 farklı işlemde kullanılmış ve bu işlemlerde sırasıyla 5, 10 ve 15 birimlik harcama yapılmış olsun; bu durumda x kartının işlem başına harcama ortalaması 10 olur.

- 1.islemde gercek harcama 5/ ortalama harcama 10
- 2.islemde 10/10
- 3.islemde 15/10 

- bu sekilde her islemdeki card'in o islemdeki harcamasinin toplam card harcama ortalamasina oranini tespit ediyoruz

- bu islem hem MEAN hem de STD icin ayri ayri yapiliyor

In [5]:
# Each transaction amount relative to the mean amount of its card1 group
train['TransactionAmt'].div(train.groupby('card1')['TransactionAmt'].transform('mean')).head()

TransactionID
2987000    0.194640
2987001    0.123777
2987002    0.608150
2987003    0.405133
2987004    0.515612
Name: TransactionAmt, dtype: float64

#### card4 - TransactionAmt

- card4, kartın türü (visa, mastercard, debit vs.) bilgisini tutuyor. Burada da yukarıdakiyle aynı şekilde, her bir işlemdeki tutarın aynı card4 grubundaki işlemlerin tutar ortalamasına oranı hesaplanıyor.

In [6]:
# Each transaction amount relative to the mean amount of its card4 group
train['TransactionAmt'].div(train.groupby('card4')['TransactionAmt'].transform('mean')).head()

TransactionID
2987000    0.257761
2987001    0.219054
2987002    0.443070
2987003    0.377679
2987004    0.377679
Name: TransactionAmt, dtype: float64

#### addr1 - TransactionAmt
- addr1 değişkeni posta kodu (zipcode) bilgisi veriyor; yani her bir işlemdeki tutarın, aynı posta kodundan yapılan tüm işlemlerin tutar ortalamasına oranı hesaplanmış.

In [7]:
# Each transaction amount relative to the mean amount of its addr1 group
train['TransactionAmt'].div(train.groupby('addr1')['TransactionAmt'].transform('mean')).head()

TransactionID
2987000    0.509556
2987001    0.186032
2987002    0.445765
2987003    0.368830
2987004    0.306863
Name: TransactionAmt, dtype: float64

#### card1 - id_02

- card1'i grupluyor ve id_02 degerlerinin mean ve std'sini cikartiyor, 
- id_02 degeri 1'den baslayip yaklasik 1 milyona kadar artan, ortalamasi 175000 civari olan bir sayisal deger, ne ifade ettigini anlayamadim
- id_02'yi anlayamadigim icin buradaki feature uretmenin de mantigini cozemedim

In [None]:
# Each id_02 value relative to the mean id_02 of its card1 group (first 30 rows)
train['id_02'].div(train.groupby('card1')['id_02'].transform('mean')).head(30)

#### card4 - id_02  - addr1 - id_02
- benzer sekilde id_02 degiskeni ile card4 ve addr1 arasindaki iliskiyi de anlayamadim

#### (card1 - D15), (card4 - D15), (addr1 - D15)
- D15 degiskeni hakkinda bir kanaat olusmadigindan bu yeni feature olusturmanin mantigi da kavranamadi.

# 5. Berkan

# 6. Muhammet (Nor)

# LIGHT GBM

# XGBOOST  MODELI