In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import gc
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


**FUNCTIONS**

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True, index_col='TransactionID')
    df = reduce_mem_usage(df)
    return df

# AMAÇ: seçilen değişkenlerin uids lere göre gruplanıp ortalamaların alıp yeni değişkene atamak.
def aggreg(columns, userid, aggr='mean'):
    
    for col in columns:
        new_col_name = col+'_'+userid+'_'+aggr # sectigi kolon isimlerini ve aggregation degerini birleştirip yeni değişken ismi oluşturmuş
        df_temp = pd.concat([X_train[[userid, col]], X_test[[userid,col]]]) # Train ve Test setten seçtiği değişkenleri alt alta birleştirmiş.
        df_temp.loc[df_temp[col]==-1,col] = np.nan # main_colums dan gelen değişkende -1 olan değerleri nan yapar.
        # col değişkenine göre groupby atmış ve col değişkenine agg_type türüne göre işlem yapmış ve bunu new_col_name isimli yeni değişkene atamış.İNDEX İ DE SIFIRLADI.
        df_temp = df_temp.groupby(userid)[col].agg([aggr]).reset_index().rename(columns={aggr: new_col_name})
        df_temp.index = list(df_temp[userid]) # df_temp nin sıfırlanan indexleri col değişkeninin index leriyle değiştirildi.
        df_temp = df_temp[new_col_name].to_dict()  # Yni oluşturulan değişken sözlük türüne çevirildi.
        # Bu yeni değerler Train ve Test setlerinde ilgili kolona(col değişkeni) karşılık gelecek şekilde  "new_col_name" ismiyle Train ve Test sete eklendi.
        X_train[new_col_name] = X_train[userid].map(df_temp).astype('float32')
        X_test[new_col_name]  = X_test[userid].map(df_temp).astype('float32')
        # Yeni olusturulan degiskenlerdeki nan değerler yerine -1 yazdırır.
        X_train[new_col_name].fillna(-1,inplace=True)
        X_test[new_col_name].fillna(-1,inplace=True)
      

 # AMAÇ: uids lere göre main_colums daki değişkenler gruplanır ve ve bu main_column daki farklı değerler sayılır ve 
# bu toplam sayı Test ve Trainde ilgili "TransactionID" nin karşısına atanır.
def aggreg_uniq(columns, userid):
    for col in columns:  
        df = pd.concat([X_train[[userid,col]],X_test[[userid,col]]],axis=0)
        #Burada uniq aynı col değişkenine denk gelen col değişkenindeki farklı değerlerin toplam sayısıdır.
        uniq = df.groupby(userid)[col].agg(['nunique'])['nunique'].to_dict()

        # "uid_P_emaildomain_ct" şeklinde değişken oluşturulur ve Train ile test setinde col değişkenindeki değerlere "TransactionID" baz alınarak uniq değerleri eşlenir.
        X_train[col+'_count'] = X_train[userid].map(uniq).astype('float32')
        X_test[col+'_count'] = X_test[userid].map(uniq).astype('float32')
        
# Her degeri minumum deger kadar arttiriyoruz, boylece hic eksi deger kalmiyor ve minumum deger 0 oluyor, bunu yapmadaki amac, NAN degere -1
# verdigimizde ayri bir sinif olarak algilayabilmesi, cunku hic negatif deger kalmadi, sadece nan olanlar negatif olmus oldu
def num_positiv(X_train,X_test):
    for f in X_train.columns:  
        # Bütün nümerik değerleri pozitif yap ve NAN değerleri -1 yap. ['TransactionAmt','TransactionDT'] kolonları hariç.
        if f not in ['TransactionAmt','TransactionDT',"isFraud"]: 
            mn = np.min((X_train[f].min(),X_test[f].min()))  # X_train ve X_test deki f kolonunun minimum degerlerini kıyasla ve en küçük olanını "mn" ye ata.

            # Buradaki amaç bütün değerlerden en küçük değeri çıkartarak onları pozitif yaparken aralarındaki değer farkınıda korumaktır.
            X_train[f] -= np.float32(mn)   # X_train deki f kolonundaki değerlerden mn yi yani en küçük değeri çıkarır.
            X_test[f] -= np.float32(mn)    # X_test deki f kolonundaki değerlerden mn yi yani en küçük değeri çıkarır.
            X_train[f].fillna(-1,inplace=True)  # X_train deki NaN değerleri -1 ile doldurur.
            X_test[f].fillna(-1,inplace=True)   # X_test deki NaN değerleri -1 ile doldurur.
            
# AMAÇ: encode_FE fonksiyonu girilen data setlerindeki belirtilen kolonları normalize edip türlerini "float32" ye çevirip _FE uzantılı yeni bir değişken olarak data setlerine ekler.
def class_freq(cols):
    for col in cols:
        df = pd.concat([X_train[col],X_test[col]])
        vc = df.value_counts(dropna=True).to_dict() # col. kolonundaki unique değerleri alıp bunları normalize ediyor ve listeye çevirip vc değişkeninde saklıyor.
        vc[-1] = -1  # vc.  sözlüğüne -1 key adı ile -1 değerini ekliyor.
        nm = col+'_freq' # kolon isimlerine uyguladığı FE encode uzantısını ekliyor.
        X_train[nm] = X_train[col].map(vc)  #vc deki keys değerleri ile df1[col] daki index değerlerini eşleyip karşılığına vc deki values değerlerini atayıp bunu df1'e yeni değişken olarak atar.
        X_test[nm] = X_test[col].map(vc)  #vc deki keys değerleri ile df2[col] daki index değerlerini eşleyip karşılığına vc deki values değerlerini atayıp bunu df2'ye yeni değişken olarak atar.
        del df; x=gc.collect()
        
# butun categoruc (object) degiskenler icin factorize islemi yapildi,
# factorize ile label encoder arasindaki en onemli fark, factorize nan degerleri -1 olarak tutuyor ama labelencoder da nan degerleri onceden doldurmak gerekiyor
def factorize_categoric():    
    for col in X_train.select_dtypes(include=['category','object']).columns:
        df = pd.concat([X_train[col],X_test[col]])
        df,_ = df.factorize(sort=True)
        X_train[col] = df[:len(X_train)].astype('int32')
        X_test[col] = df[len(X_train):].astype('int32')
        del df; x=gc.collect()        
        
# TEK BIR USERID OLUSTURMAK ICIN ONCELIKLE USERID OLUSTURMAK ICIN FAYDALI OLDUGUNU DEGERLENDIRDIGIMIZ FEATURELARI BELIRLEYIP
# O FEATURLER ARASINDAN FARKLI KOMPINASYONLARI DENEYIP BIRDEN COK USERID OLUSTURALIM, DAHA SONRA BUNLARIN KOLERASYONUNA BAKIP
# MANTIKLI OLAN HANGISI ISE ONU USERID OLARAK BELIRLEYELIM

def user_id(col1,col2):
    us_id = col1+'_'+col2
    
    X_train[us_id] = X_train[col1].astype(str)+'_'+X_train[col2].astype(str)   # 12926.0_215.0, 3663.0_230.0  şeklinde çıktı üretir.
    X_test[us_id] = X_test[col1].astype(str)+'_'+X_test[col2].astype(str)
   


In [3]:
%%time

print('Loading data...')

train_id = import_data("../input/ieee-fraud-detection/train_identity.csv")
print('\tSuccessfully loaded train_identity!')

X_train = import_data('../input/ieee-fraud-detection/train_transaction.csv')
print('\tSuccessfully loaded train_transaction!')
X_train = X_train.merge(train_id, how='left', left_index=True, right_index=True) # Train setini kendi içinde merge etmiş

test_id = import_data('../input/ieee-fraud-detection/test_identity.csv')
print('\tSuccessfully loaded test_identity!')

X_test = import_data('../input/ieee-fraud-detection/test_transaction.csv')
print('\tSuccessfully loaded test_transaction!')

test_id.columns = train_id.columns
X_test = X_test.merge(test_id, how='left', left_index=True, right_index=True)  # Test setini kendi içinde merge etmiş

pd.set_option('max_columns', None)

# TARGET
y_train = X_train['isFraud'].copy()  # Train deki bağımlı değişkeni y_train setine atamış.

print('Data was successfully loaded!\n')

Loading data...
Memory usage of dataframe is 45.12 MB
Memory usage after optimization is: 10.57 MB
Decreased by 76.6%
	Successfully loaded train_identity!
Memory usage of dataframe is 1775.15 MB
Memory usage after optimization is: 489.41 MB
Decreased by 72.4%
	Successfully loaded train_transaction!
Memory usage of dataframe is 44.39 MB
Memory usage after optimization is: 10.40 MB
Decreased by 76.6%
	Successfully loaded test_identity!
Memory usage of dataframe is 1519.24 MB
Memory usage after optimization is: 427.17 MB
Decreased by 71.9%
	Successfully loaded test_transaction!
Data was successfully loaded!

CPU times: user 2min 4s, sys: 1min 55s, total: 4min
Wall time: 4min 1s


In [4]:
# Nan değerlere göre değişkenleri gruplama
nan_groups={}
v_cols = ['V'+str(i) for i in range(1,340)]
for i in X_train.columns:
    nan_sum = X_train[i].isna().sum()
    try:
        nan_groups[nan_sum].append(i)
    except:
        nan_groups[nan_sum]=[i]

for i,j in nan_groups.items():
    print('The Sum of the NaN Values =',i)
    print(j)
    
    

non_group_list=list()
for i,j in nan_groups.items():
    if len(j)>5:
        if i != 0:
            non_group_list.append(i)
            
            
# Grupların kendi içinde korelasyon değeri 0.70 ten yüksek olan değişken grupları

# D1 - V11 
grp1 = [[1],[2,3],[4,5],[6,7],[8,9],[10,11]]
# V12 - V34
grp2 = [[12,13],[14],[15,16,17,18,21,22,31,32,33,34],[19,20],[23,24],[25,26],[27,28],[29,30]]
# V35 - V52
grp3 = [[35,36],[37,38],[39,40,42,43,50,51,52],[41],[44,45],[46,47],[48,49]]
# V53 - V74
grp4 = [[53,54],[55,56],[57,58,59,60,63,64,71,72,73,74],[61,62],[65],[66,67],[68],[69,70]]
# V74 - V94
grp5 = [[75,76],[77,78],[79,80,81,84,85,92,93,94],[82,83],[86,87],[88],[89],[90,91]]
# V95 - V107
grp6 = [[95,96,97,101,102,103,105,106],[98],[99,100],[104]]
# V107 - V123
grp7 = [[107],[108,109,110,114],[111,112,113],[115,116],[117,118,119],[120,122],[121],[123]]
# V124 - V137
grp8 = [[124,125],[126,127,128,132,133,134],[129],[130,131],[135,136,137]]
# V138 - V163
grp9 = [[138],[139,140],[141,142],[146,147],[148,149,153,154,156,157,158],[161,162,163]]
# V167 - V183
grp10 = [[167,168,177,178,179],[172,176],[173],[181,182,183]]
# V184 - V216
grp11 = [[186,187,190,191,192,193,196,199],[202,203,204,211,212,213],[205,206],[207],[214,215,216]]
# V217 - V238
grp12 = [[217,218,219,231,232,233,236,237],[223],[224,225],[226],[228],[229,230],[235]]
# V240 - V262
grp13 = [[240,241],[242,243,244,258],[246,257],[247,248,249,253,254],[252],[260],[261,262]]
# V263 - V278
grp14 = [[263,265,264],[266,269],[267,268],[273,274,275],[276,277,278]]
# V220 - V272
grp15 = [[220],[221,222,227,245,255,256,259],[234],[238,239],[250,251],[270,271,272]]
# V279 - V299
grp16 = [[279,280,293,294,295,298,299],[284],[285,287],[286],[290,291,292],[297]]
# V302 - V321
grp17 = [[302,303,304],[305],[306,307,308,316,317,318],[309,311],[310,312],[319,320,321]]
# V281 V315
grp18 = [[281],[282,283],[288,289],[296],[300,301],[313,314,315]]
# V322 - V339
grp19 = [[322,323,324,326,327,328,329,330,331,332,333],[325],[334,335,336],[337,338,339]]

grp_list = [grp1,grp2,grp3,grp4,grp5,grp6,grp7,grp8,grp9,grp10,grp11,grp12,grp13,grp14,grp15,grp16,grp17,grp18,grp19]




# Aynı korelasyondaki değişkenlerden unique değeri fazla olanı seçen fonksiyon
def clip_group(group,df):
    clipped_list = []
    for i in group:
        maximum = 0; 
        V_num = i[0]
        for j in i:
            n = df['V'+str(j)].value_counts().count()
            if n>maximum:
                maximum = n
                V_num = j
            
        clipped_list.append(V_num)
    
        
    print('Variables in the clipped_list: ',clipped_list)
    return clipped_list


# Korelasyon sonucunda modelde kullanılmasına karar verilen  V değişkenleri V_clipped_cols değişkeninde tutulmuştur.
V_clipped_cols = list()
for i in grp_list:
    for j in clip_group(i,X_train):
        V_clipped_cols.append("V"+str(j))
        

for i in range (1, 339):
    name = "V"+str(i)
    if name not in V_clipped_cols:
        X_train.drop("V"+str(i),axis=1, inplace=True)
        X_test.drop("V"+str(i),axis=1, inplace=True)

The Sum of the NaN Values = 0
['isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']
The Sum of the NaN Values = 8933
['card2']
The Sum of the NaN Values = 1565
['card3']
The Sum of the NaN Values = 1577
['card4']
The Sum of the NaN Values = 4259
['card5']
The Sum of the NaN Values = 1571
['card6']
The Sum of the NaN Values = 65706
['addr1', 'addr2']
The Sum of the NaN Values = 352271
['dist1']
The Sum of the NaN Values = 552913
['dist2']
The Sum of the NaN Values = 94456
['P_emaildomain']
The Sum of the NaN Values = 453249
['R_emaildomain']
The Sum of the NaN Values = 1269
['D1', 'V281', 'V282', 'V283', 'V288', 'V289', 'V296', 'V300', 'V301', 'V313', 'V314', 'V315']
The Sum of the NaN Values = 280797
['D2']
The Sum of the NaN Values = 262878
['D3']
The Sum of the NaN Values = 168922
['D4']
The Sum of the NaN Values = 309841
['D5']
The Sum of the NaN Values = 517353
['D6']
The Sum of 

In [5]:
# frekans sayısı 2 den az olan kartlar INVALID, çok olanlar VALID kart olarak tanimlaniyor, sonra train ve testin birinde bulunmuyorsa nan yapmış sonra invalid olanları nan yapmış

valid_card = pd.concat([X_train[['card1']], X_test[['card1']]])
valid_card = valid_card['card1'].value_counts()
valid_card_std = valid_card.values.std()

invalid_cards = valid_card[valid_card<=2]

valid_card = valid_card[valid_card>2]
valid_card = list(valid_card.index)

X_train['card1'] = np.where(X_train['card1'].isin(X_test['card1']), X_train['card1'], np.nan)
X_test['card1']  = np.where(X_test['card1'].isin(X_train['card1']), X_test['card1'], np.nan)

X_train['card1'] = np.where(X_train['card1'].isin(valid_card), X_train['card1'], np.nan)
X_test['card1']   = np.where(X_test['card1'].isin(valid_card), X_test['card1'], np.nan)

# burda frekans sayısı 2 den az olan kartlara invalid çok olanlara valid kart demiş
# sonra train ve testin ikisinde de bulunanları almış eğer birinde bulunmuyorsa nan yapmış
# sonra invalid olanları nan yapmış

for col in ['card2','card3','card4','card5','card6']: 
    X_train[col] = np.where(X_train[col].isin(X_test[col]), X_train[col], np.nan)
    X_test[col]  = np.where(X_test[col].isin(X_train[col]), X_test[col], np.nan)

# train ve testin ikisinde de bulunanları almış eğer birinde bulunmuyorsa nan yapmış

In [6]:
# USERID belirliyoruz
col_1 = 'card1'
col_2 = 'P_emaildomain'
col_3 = 'addr1'


user_id(col_1,col_2)
user_id(col_1+'_'+col_2,col_3)
X_train.drop(col_1+'_'+col_2, axis = 1, inplace=True)
X_test.drop(col_1+'_'+col_2, axis = 1, inplace=True)

us_id = col_1 + '_' + col_2 + '_' + col_3
X_train.rename(columns={us_id: 'userid'}, inplace=True)
X_test.rename(columns={us_id: 'userid'}, inplace=True)

In [7]:
# cihaz ve browser tespitinin onemli oldugu varsayimiyla yapildi...

for df in [X_train,X_test]:

    df['OS_id_30'] = df['id_30'].str.split(' ', expand=True)[0]
    df['version_id_30'] = df['id_30'].str.split(' ', expand=True)[1]

    df['browser_id_31'] = df['id_31'].str.split(' ', expand=True)[0]
    df['version_id_31'] = df['id_31'].str.split(' ', expand=True)[1]

In [8]:
# amt ilk halinde float16, bu sekilde std ve mean NAN oluyor, float32 yapmamiz lazim

for df in [X_train,X_test]:

    df['TransactionAmt'] = df['TransactionAmt'].astype('float32')
    df['Trans_min_std'] = (df['TransactionAmt'] - df['TransactionAmt'].mean()) / df['TransactionAmt'].std()

In [9]:
# lastest_browser (SON VERSIYON KONTROLU) son versiyon olanlar 1 

X_train["lastest_browser"] = np.zeros(X_train.shape[0])
X_test["lastest_browser"] = np.zeros(X_test.shape[0])

def setBrowser(df):
    df.loc[df["id_31"]=="samsung browser 7.0",'lastest_browser']=1
    df.loc[df["id_31"]=="opera 53.0",'lastest_browser']=1
    df.loc[df["id_31"]=="mobile safari 10.0",'lastest_browser']=1
    df.loc[df["id_31"]=="google search application 49.0",'lastest_browser']=1
    df.loc[df["id_31"]=="firefox 60.0",'lastest_browser']=1
    df.loc[df["id_31"]=="edge 17.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 69.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 67.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 63.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 64.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 65.0 for ios",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for android",'lastest_browser']=1
    df.loc[df["id_31"]=="chrome 66.0 for ios",'lastest_browser']=1
    return df

X_train=setBrowser(X_train)
X_test=setBrowser(X_test)

In [10]:
# MAIL ADRESLERININ SON KISIMLARINDAN ULKE TESPITINE YONELIK URETILEN FEATURE (com, us, mx, es, de, fr, uk, jp)

us_emails = ['gmail', 'net', 'edu']

for df in [X_train,X_test]:
    for c in ['P_emaildomain', 'R_emaildomain']:

        df[c + '_suffix'] = df[c].map(lambda x: str(x).split('.')[-1])
        df[c + '_suffix'] = df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [11]:
# Check if P_emaildomain matches R_emaildomain 
# extracts prefix

p = 'P_emaildomain'
r = 'R_emaildomain'
unknown = 'email_not_provided'

def setDomain(df):
    df[p] = df[p].astype('str')
    df[r] = df[r].astype('str')
    
    df[p] = df[p].fillna(unknown)
    df[r] = df[r].fillna(unknown)
    
    df['email_check'] = np.where((df[p]==df[r])&(df[p]!=unknown),1,0)

    df[p+'_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r+'_prefix'] = df[r].apply(lambda x: x.split('.')[0])
    
    return df
    
X_train=setDomain(X_train)
X_test=setDomain(X_test)

In [12]:
# TransactionDT degerlerinden icin yeni degiskenler uretilmis.

import datetime
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# start='2017-10-01', end='2019-01-01 arasindaki tarihler listelenmis.
dates_range = pd.date_range(start='2017-10-01', end='2019-01-01')

# start='2017-10-01', end='2019-01-01 ABD ulusal tatil gunleri listelenmis. 
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())

START_DATE = '2017-12-01'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")

# islemlerin yapildigi hour of day, day of week ve day of month ve month of year degiskeni olusturulmus.

for df in [X_train,X_test]:
    
    df["Date"] = df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
    df['_Weekdays'] = df['Date'].dt.dayofweek
    df['_Dayhours'] = df['Date'].dt.hour
    df['_Monthdays'] = df['Date'].dt.day
    df['_Yearmonths'] = (df['Date'].dt.month).astype(np.int8) 

    # yapilan islem tatil gunumu mu?

    df['is_holiday'] = (df['Date'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)

    # Timestamp tipinde olduğu için algoritma tanımlayamıyor.
    df.drop("Date", axis=1,inplace=True)

In [13]:
# ProductCD value_count = (W,C,R,H,S) 
# M4 value_count = (M0,M1,M2)

# kategorik degisken olan ProductCD ve M4, 'fraud' ortalamalarina gore gruplandiriliyor

for col in ['ProductCD','M4']:
    temp_dict = X_train.groupby([col])['isFraud'].agg(['mean']).reset_index().rename(columns={'mean': col+'_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col+'_target_mean'].to_dict()
    
    if col=='ProductCD':
        X_train['ProductCD_1'] = X_train[col].map(temp_dict)
        X_test['ProductCD_1']  = X_test[col].map(temp_dict)
    else:
        X_train['M4_1'] = X_train[col].map(temp_dict)
        X_test['M4_1']  = X_test[col].map(temp_dict)
        
        
# orjinal featurelar drop edilecek

X_train.drop(['ProductCD','M4'], axis=1,inplace=True)
X_test.drop(['ProductCD','M4'], axis=1,inplace=True)

In [14]:
# dolar kuruna gore ulke tahmini
# kategorik olarak tutmamiz gerekiyor, one hot encoding yapilmasi gerekiyor

for df in [X_train,X_test]:
    
    df['TransactionAmt_decimal_lenght'] = df['TransactionAmt'].astype(str).str.split('.', expand=True)[1].str.len()
    df['cents'] = (df['TransactionAmt'] - np.floor(X_train['TransactionAmt'])).astype('float32')

In [15]:
# Modelde D ve D' nin normalize edilmiş  kolonlarının çıkarılmış halini de deneyeceğiz.
# The D Columns are "time deltas" from some point in the past. We will transform the D Columns into their point in the past.
# NORMALIZE D COLUMNS
for i in range(1,16):
    if i in [1,2,3,5,9]:
        continue
    X_train['D'+str(i)] =  X_train['D'+str(i)] - X_train.TransactionDT/np.float32(24*60*60)
    X_test['D'+str(i)] = X_test['D'+str(i)] - X_test.TransactionDT/np.float32(24*60*60)

In [16]:
for df in [X_train,X_test]:
   
    df = df.replace(np.inf,999)# sonsuz değerleri 999 ile değiştiriyor

In [17]:
factorize_categoric()

num_positiv(X_train,X_test)

class_freq(['addr1','card1','card2','card3','P_emaildomain'])

aggreg(['TransactionAmt','D4','D9','D10','D11','D15'],'userid','mean')

aggreg(['TransactionAmt','D4','D9','D10','D11','D15','C14'],'userid','std')

aggreg(['C'+str(x) for x in range(1,15) if x!=3],'userid','mean')

aggreg(['M'+str(x) for x in range(1,10) if x!=4],'userid','mean')

aggreg_uniq(['P_emaildomain','dist1','id_02','cents','C13','V314','V127','V136','V309','V307','V320'],'userid')

In [18]:
X_train.drop("userid", axis=1, inplace=True)
X_test.drop("userid", axis=1, inplace=True)

In [19]:
def user_id(col1,col2):
    us_id = col1+'_'+col2
    
    X_train['day'] = X_train.TransactionDT / (24*60*60)
    X_train[us_id] = X_train[col1].astype(str)+'_'+X_train[col2].astype(str)+'_'+np.floor(X_train.day-X_train.D1).astype(str)

    X_test['day'] = X_test.TransactionDT / (24*60*60)
    X_test[us_id] = X_test[col1].astype(str)+'_'+X_test[col2].astype(str)+'_'+np.floor(X_test.day-X_test.D1).astype(str)

In [20]:
categoric_features = ['card1','card2','card3','card4','card5','card6','addr1','addr2',
                      'P_emaildomain','R_emaildomain',
                      'M1','M2','M3','M5','M6','M7','M8','M9',
                      'id_01','id_02','id_03','id_04','id_05','id_06','id_07','id_08','id_09','id_10','id_11','id_12','id_13','id_14','id_15','id_16','id_17','id_18','id_19',
                      'id_20','id_21','id_22','id_23','id_24','id_25','id_26','id_27','id_28','id_29','id_30','id_31','id_32','id_33','id_34','id_35','id_36','id_37','id_38',
                      'DeviceType','DeviceInfo',
                      'OS_id_30','version_id_30','browser_id_31','version_id_31','Trans_min_std','lastest_browser','P_emaildomain_suffix','R_emaildomain_suffix','email_check',
                      'P_emaildomain_prefix','R_emaildomain_prefix','_Weekdays','_Dayhours','_Monthdays','_Yearmonths','is_holiday','ProductCD_1','M4_1','TransactionAmt_decimal_lenght','cents']

categoric_features_index = [X_train.columns.get_loc(c) for c in categoric_features if c in X_train]

In [21]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

Memory usage of dataframe is 497.02 MB
Memory usage after optimization is: 361.85 MB
Decreased by 27.2%
Memory usage of dataframe is 431.70 MB
Memory usage after optimization is: 315.73 MB
Decreased by 26.9%


In [22]:
X_train.drop("isFraud", axis=1, inplace=True)

# Hyperparameters Tuning and Building Model

In [23]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [24]:
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

def auc(m, train, test,y_train1,y_test1): 
    return (metrics.roc_auc_score(y_train1,m.predict_proba(train)[:,1]),
                            metrics.roc_auc_score(y_test1,m.predict_proba(test)[:,1]))

# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [8,12,20],
              "min_child_weight" : [1,3],
              "n_estimators": [200,400],
              "learning_rate": [0.14,0.2, 0.25],
              "nthread":[4],
              "tree_method":["gpu_hist"],
              "random_state": [2]}
                             
grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3, 
                                   verbose=10,n_jobs =-1)

grid_search.fit(X_train1, y_train1)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 25.2min
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed: 32.3min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 38.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,
                                     verbosity=None),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.14, 0.2

In [25]:
max_depth_best = grid_search.best_estimator_.max_depth
n_estimators_best = grid_search.best_estimator_.n_estimators
learning_rate_best = grid_search.best_estimator_.learning_rate
min_child_weight_best = grid_search.best_estimator_.min_child_weight

In [26]:
del model 
x=gc.collect()


# MODEL AND PREDICTION


In [28]:
model = xgb.XGBClassifier(max_depth=max_depth_best,
                          min_child_weight=min_child_weight_best, 
                          n_estimators=n_estimators_best,
                          n_jobs=-1 , 
                          verbose=1,
                          learning_rate=learning_rate_best,
                          subsample=0.8,
                          colsample_bytree=0.4,
                          missing=-1,
                          eval_metric='auc',
                          # USE CPU
                          nthread=4,
                          #tree_method='hist'
                          # USE GPU
                          tree_method='gpu_hist')


model.fit(X_train1,y_train1)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.4, eval_metric='auc',
              gamma=0, gpu_id=0, importance_type='gain',
              interaction_constraints=None, learning_rate=0.25,
              max_delta_step=0, max_depth=12, min_child_weight=1, missing=-1,
              monotone_constraints=None, n_estimators=400, n_jobs=-1, nthread=4,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
              tree_method='gpu_hist', validate_parameters=False, verbose=1,
              verbosity=None)

In [29]:
from sklearn import metrics
pred1 = model.predict(X_test1)
fpr, tpr, thresholds = metrics.roc_curve(y_test1, pred1, pos_label=2)
metrics.auc(fpr, tpr)

print(metrics.confusion_matrix(y_test1, pred1))
print(metrics.classification_report(y_test1, pred1))



[[187745    188]
 [  2075   4871]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    187933
           1       0.96      0.70      0.81      6946

    accuracy                           0.99    194879
   macro avg       0.98      0.85      0.90    194879
weighted avg       0.99      0.99      0.99    194879



In [30]:
preds = model.predict_proba(X_test)[:,1]

NameError: name 'X_test_best' is not defined

In [None]:
sample_submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
sample_submission.isFraud = preds
sample_submission.to_csv('sub_xgb1.csv',index=False)