In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
import gc

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Show every column when a wide frame is displayed.
# The fully qualified key is used because the bare 'max_columns' pattern can match
# more than one option in recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

In [3]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    * Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when min and max lie strictly
      inside that type's finite range, otherwise to float64.
    * Object columns are converted to ``category``.
    * Every other dtype (bool, unsigned int, datetime, category, extension
      dtypes, ...) is left untouched, so the function can safely be called
      again on its own output.

    Note: the float downcast only checks the value range, not the precision,
    so values stored as float16/float32 may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast; anything else
        # (bool, unsigned, datetime, categorical, ...) has no meaningful min/max
        # range check here and is skipped instead of being coerced to float.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN column fails both comparisons and stays float64.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [13]:
%%time

# Load the raw CSV files. The identity and transaction tables are indexed by
# TransactionID so they can be joined on that index in the merge cell.
print('Loading data...')

train_identity = pd.read_csv('train_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_identity!')

train_transaction = pd.read_csv('train_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_transaction!')

test_identity = pd.read_csv('test_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_identity!')

test_transaction = pd.read_csv('test_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_transaction!')

sub = pd.read_csv('sample_submission.csv')
print('\tSuccessfully loaded sample_submission!')

# Relabel the test identity columns with the train identity labels (by position).
# NOTE(review): presumably the two files name the same columns differently —
# confirm the column order matches, since nothing checks it here.
test_identity.columns = train_identity.columns

print('Data was successfully loaded!\n')

Loading data...
	Successfully loaded train_identity!
	Successfully loaded train_transaction!
	Successfully loaded test_identity!
	Successfully loaded test_transaction!
	Successfully loaded sample_submission!
Data was successfully loaded!

Wall time: 1min 39s


In [14]:
# Ordered (substring, brand) rules applied to `device_name`. Order matters: each
# rule is evaluated against the values left by the rules before it.
_DEVICE_BRAND_PATTERNS = (
    ('SM', 'Samsung'),
    ('SAMSUNG', 'Samsung'),
    ('GT-', 'Samsung'),
    ('Moto G', 'Motorola'),
    ('Moto', 'Motorola'),
    ('moto', 'Motorola'),
    ('LG-', 'LG'),
    ('rv:', 'RV'),
    ('HUAWEI', 'Huawei'),
    ('ALE-', 'Huawei'),
    ('-L', 'Huawei'),
    ('Blade', 'ZTE'),
    ('BLADE', 'ZTE'),
    ('Linux', 'Linux'),
    ('XT', 'Sony'),
    ('HTC', 'HTC'),
    ('ASUS', 'Asus'),
)


def _first_two_parts(series, sep):
    """Split a string Series on ``sep`` and return a frame that always has columns 0 and 1."""
    pieces = series.str.split(sep, expand=True)
    # reindex adds an all-NaN column when no value contained the separator,
    # where a plain `[1]` lookup would raise KeyError.
    return pieces.reindex(columns=[0, 1])


def id_split(dataframe, min_count=200):
    """Derive device / OS / browser / screen features from the identity columns.

    The frame is modified in place and also returned. Added columns:
    ``device_name``, ``device_version`` (``DeviceInfo`` split on '/'),
    ``OS_id_30``, ``version_id_30`` (``id_30`` split on ' '),
    ``browser_id_31``, ``version_id_31`` (``id_31`` split on ' '),
    ``screen_width``, ``screen_height`` (``id_33`` split on 'x') and
    ``had_id`` (constant 1). ``id_34`` and ``id_23`` are overwritten with the
    part after their first ':'. Only the first two parts of every split are used.

    ``device_name`` is then normalised to a brand using the ordered substring
    rules in ``_DEVICE_BRAND_PATTERNS``, and every name occurring fewer than
    ``min_count`` times is replaced by ``"Others"``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the string columns ``DeviceInfo``, ``id_30``, ``id_31``,
        ``id_33``, ``id_34`` and ``id_23``.
    min_count : int, default 200
        Device names with fewer occurrences than this are grouped as "Others".

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    device = _first_two_parts(dataframe['DeviceInfo'], '/')
    dataframe['device_name'] = device[0]
    dataframe['device_version'] = device[1]

    os_parts = _first_two_parts(dataframe['id_30'], ' ')
    dataframe['OS_id_30'] = os_parts[0]
    dataframe['version_id_30'] = os_parts[1]

    browser = _first_two_parts(dataframe['id_31'], ' ')
    dataframe['browser_id_31'] = browser[0]
    dataframe['version_id_31'] = browser[1]

    screen = _first_two_parts(dataframe['id_33'], 'x')
    dataframe['screen_width'] = screen[0]
    dataframe['screen_height'] = screen[1]

    dataframe['id_34'] = _first_two_parts(dataframe['id_34'], ':')[1]
    dataframe['id_23'] = _first_two_parts(dataframe['id_23'], ':')[1]

    for pattern, brand in _DEVICE_BRAND_PATTERNS:
        matches = dataframe['device_name'].str.contains(pattern, na=False)
        dataframe.loc[matches, 'device_name'] = brand

    # Missing names are not counted by value_counts, so they stay missing.
    name_counts = dataframe['device_name'].value_counts()
    rare_names = name_counts[name_counts < min_count].index
    dataframe.loc[dataframe['device_name'].isin(rare_names), 'device_name'] = "Others"
    dataframe['had_id'] = 1
    gc.collect()

    return dataframe

In [15]:
# Derive the device / OS / browser / screen features on both identity tables.
# id_split adds the columns to the frame it receives and returns that same frame.
train_identity = id_split(train_identity)
test_identity = id_split(test_identity)

In [16]:
print('Merging data...')
# Left join on the index of both tables: every transaction row is kept, and
# transactions without a matching identity row get NaN in the identity columns.
train = train_transaction.merge(train_identity, how='left', left_index=True, right_index=True)
test = test_transaction.merge(test_identity, how='left', left_index=True, right_index=True)

print('Data was successfully merged!\n')

# Drop the names of the source tables so gc.collect() below can reclaim them.
del train_identity, train_transaction, test_identity, test_transaction

print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')
print(f'Test dataset has {test.shape[0]} rows and {test.shape[1]} columns.')

# Last expression of the cell, so the return value of gc.collect() is displayed.
gc.collect()

Merging data...
Data was successfully merged!

Train dataset has 590540 rows and 442 columns.
Test dataset has 506691 rows and 441 columns.


35

# FEATURE ENGINEERING INCELEMELERI

## card1, card4 ve addr1 degiskenlerinin TransactionAmt, id_02 ve D15 degiskenleri ile iliskisinden uretilen degiskenler

In [10]:
# Relative-magnitude features: each value of a `columns_a` column divided by the
# mean and by the standard deviation of that column within the row's
# `columns_b` group. The statistics are computed separately for train and test.
columns_a = ['TransactionAmt', 'id_02', 'D15']
columns_b = ['card1', 'card4', 'addr1']

for df in (train, test):
    for col_a in columns_a:
        for col_b in columns_b:
            grouped = df.groupby([col_b])[col_a]
            for stat in ('mean', 'std'):
                df[f'{col_a}_to_{stat}_{col_b}'] = df[col_a] / grouped.transform(stat)

### TransactionAmt ile card1 arasinda

In [43]:
# Preview (first five rows): TransactionAmt divided by the mean TransactionAmt of the same card1 group.
(train['TransactionAmt'] / train.groupby('card1')['TransactionAmt'].transform('mean')).head(5)

# Example: transaction 2987000 has an amount of 68.5. Every transaction has a card1 value, and the same
# card1 value also appears on other transactions (say card1 = 100 occurs on 10 different transactions).
# The mean amount over all transactions sharing that card1 is computed and 68.5 is divided by it.
# If card1 is thought of as a person, this is one purchase divided by that person's average purchase:
# the 0.19 below means this transaction is about 19% of the card1 average.

TransactionID
2987000    0.194640
2987001    0.123777
2987002    0.608150
2987003    0.405133
2987004    0.515612
Name: TransactionAmt, dtype: float64

In [51]:
# Preview (first five rows): TransactionAmt divided by the standard deviation of TransactionAmt within the same card1 group.
(train['TransactionAmt'] / train.groupby('card1')['TransactionAmt'].transform('std')).head(5)

TransactionID
2987000    0.184566
2987001    0.062995
2987002    0.589241
2987003    0.259447
2987004    0.882933
Name: TransactionAmt, dtype: float64

### TransactionAmt ile card4 arasinda

In [46]:
# Preview (first five rows): TransactionAmt divided by the mean TransactionAmt of the same card4 group.
(train['TransactionAmt'] / train.groupby('card4')['TransactionAmt'].transform('mean')).head(5)

# card4 is e.g. "visa": if the mean amount of all visa transactions were 1000 dollars and a transaction
# were 100 dollars, the ratio would be 10%.
# The 0.26 below means the amount of transaction 2987000 is about 26% of the mean amount of its card4 group.

TransactionID
2987000    0.257761
2987001    0.219054
2987002    0.443070
2987003    0.377679
2987004    0.377679
Name: TransactionAmt, dtype: float64

In [52]:
# Preview (first five rows): TransactionAmt divided by the standard deviation of TransactionAmt within the same card4 group.
(train['TransactionAmt'] / train.groupby('card4')['TransactionAmt'].transform('std')).head(5)

TransactionID
2987000    0.170233
2987001    0.114212
2987002    0.258544
2987003    0.196917
2987004    0.196917
Name: TransactionAmt, dtype: float64

### TransactionAmt ile addr1 arasinda

In [49]:
# Preview (first five rows): TransactionAmt divided by the mean TransactionAmt of the same addr1 group.
(train['TransactionAmt'] / train.groupby('addr1')['TransactionAmt'].transform('mean')).head(5)

# Assuming addr1 is a zip code: the ratio of a single transaction's amount to the mean amount of all
# transactions made from the same zip code.

TransactionID
2987000    0.509556
2987001    0.186032
2987002    0.445765
2987003    0.368830
2987004    0.306863
Name: TransactionAmt, dtype: float64

In [53]:
# Preview (first five rows): TransactionAmt divided by the standard deviation of TransactionAmt within the same addr1 group.
(train['TransactionAmt'] / train.groupby('addr1')['TransactionAmt'].transform('std')).head(5)

TransactionID
2987000    0.286941
2987001    0.116132
2987002    0.254240
2987003    0.235658
2987004    0.185845
Name: TransactionAmt, dtype: float64

### id_02 ile card1 arasinda

In [55]:
# Preview (first five rows): id_02 divided by the mean id_02 of the same card1 group.
(train['id_02'] / train.groupby('card1')['id_02'].transform('mean')).head(5)

# Again a per-card1 ratio, but it is not yet clear to me what id_02 actually represents.

TransactionID
2987000         NaN
2987001         NaN
2987002         NaN
2987003         NaN
2987004    0.764773
Name: id_02, dtype: float64

In [56]:
# Preview (first five rows): id_02 divided by the standard deviation of id_02 within the same card1 group.
(train['id_02'] / train.groupby('card1')['id_02'].transform('std')).head(5)

TransactionID
2987000         NaN
2987001         NaN
2987002         NaN
2987003         NaN
2987004    1.753301
Name: id_02, dtype: float64

### id_02 ile card4 arasinda

In [58]:
# Preview (first five rows): id_02 divided by the mean id_02 of the same card4 group.
(train['id_02'] / train.groupby('card4')['id_02'].transform('mean')).head(5)

TransactionID
2987000         NaN
2987001         NaN
2987002         NaN
2987003         NaN
2987004    0.373295
Name: id_02, dtype: float64

### id_02 ile addr1 arasinda

In [59]:
# Preview (first five rows): id_02 divided by the mean id_02 of the same addr1 group.
(train['id_02'] / train.groupby('addr1')['id_02'].transform('mean')).head(5)

TransactionID
2987000         NaN
2987001         NaN
2987002         NaN
2987003         NaN
2987004    0.627453
Name: id_02, dtype: float64

### D15 ile card1 arasinda

In [60]:
# Preview (first five rows): D15 divided by the mean D15 of the same card1 group.
(train['D15'] / train.groupby('card1')['D15'].transform('mean')).head(5)

TransactionID
2987000    0.000000
2987001    0.000000
2987002    2.518583
2987003    0.550272
2987004         NaN
Name: D15, dtype: float64

### D15 ile card4 arasinda

In [62]:
# Preview (first five rows): D15 divided by the mean D15 of the same card4 group.
(train['D15'] / train.groupby('card4')['D15'].transform('mean')).head(5)

TransactionID
2987000    0.000000
2987001    0.000000
2987002    1.865915
2987003    0.720057
2987004         NaN
Name: D15, dtype: float64

### D15 ile addr1 arasinda

In [63]:
# Preview (first five rows): D15 divided by the mean D15 of the same addr1 group.
(train['D15'] / train.groupby('addr1')['D15'].transform('mean')).head(5)

TransactionID
2987000    0.000000
2987001    0.000000
2987002    1.611525
2987003    0.686169
2987004         NaN
Name: D15, dtype: float64

### SONUC
- card1-4 degiskenleri ile addr1 degiskeninin bireye ozel olabilecegi degerlendirildiginde, bunlardan yapilan islemlerin AMT miktarlari, ID_02 ve D15 degerlerinin toplamlarinin ortalamasi ve standart sapmasi belirlenip, her bir transaction id lerine orani tespit edilmis

## log of transaction amount.

In [67]:
# Natural logarithm of the transaction amount.
# NOTE(review): np.log yields -inf for 0 and NaN for negative input — confirm the amounts are strictly positive.
train['TransactionAmt_Log'] = np.log(train['TransactionAmt'])
test['TransactionAmt_Log'] = np.log(test['TransactionAmt'])

## decimal part of the transaction amount.

In [69]:
# Fractional part of the amount expressed in thousandths (e.g. 68.5 -> 500), created to
# examine the digits after the decimal point of the amounts.
# The product is rounded before the integer cast: in binary floating point 4.35 - 4 is
# 0.34999..., so truncating with astype(int) alone would give 349 instead of 350.
train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).round().astype(int)
test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).round().astype(int)

## day of week in which a transaction happened

In [70]:
# Day-of-week bucket (0-6): TransactionDT divided by the number of seconds in a day, shifted back by one day, modulo 7.
# NOTE(review): assumes TransactionDT is an offset in seconds; which weekday maps to 0 depends on a reference date not shown here.
train['Transaction_day_of_week'] = np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)
test['Transaction_day_of_week'] = np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)

## hour of the day in which a transaction happened.

In [75]:
# Hour-of-day bucket (0-23): whole hours elapsed since the TransactionDT origin, modulo 24.
# NOTE(review): assumes TransactionDT is in seconds and that its origin falls on an hour boundary.
train['Transaction_hour'] = np.floor(train['TransactionDT'] / 3600) % 24
test['Transaction_hour'] = np.floor(test['TransactionDT'] / 3600) % 24

## Some arbitrary features interaction (rasgele)
- rastgele sectigi degisken ciftleri once string formatinda birlestirilmis, sonra birlesik degerler uzerinden LabelEncoder islemi yapilmis (degerler uzerinde baska herhangi bir islem yapilmamis)

In [76]:
# Pairwise interaction features: the two source columns named in `feature`
# ("<f1>__<f2>") are rendered as strings and joined with '_', then the joined
# keys are label-encoded with one encoder fitted on train and test together,
# so both frames share the same integer codes.
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2',
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:
    f1, f2 = feature.split('__')

    # Missing values become the literal text 'nan' through astype(str).
    train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
    test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)

    le = LabelEncoder().fit(train[feature].tolist() + test[feature].tolist())
    train[feature] = le.transform(train[feature].tolist())
    test[feature] = le.transform(test[feature].tolist())

In [97]:
# Inspect the joined string key built from id_02 and id_20 (missing values show up as 'nan').
train['id_02'].astype(str) + '_' + train['id_20'].astype(str)

# I still have no clear idea what either id column means, so I have no opinion yet on whether this combination is meaningful.

TransactionID
2987000           nan_nan
2987001           nan_nan
2987002           nan_nan
2987003           nan_nan
2987004     70787.0_144.0
2987005           nan_nan
2987006           nan_nan
2987007           nan_nan
2987008     98945.0_500.0
2987009           nan_nan
2987010    191631.0_142.0
2987011    221832.0_507.0
2987012           nan_nan
2987013           nan_nan
2987014           nan_nan
2987015           nan_nan
2987016      7460.0_575.0
2987017     61141.0_600.0
2987018           nan_nan
2987019           nan_nan
2987020           nan_nan
2987021           nan_nan
2987022           nan_nan
2987023           nan_nan
2987024           nan_nan
2987025           nan_nan
2987026           nan_nan
2987027           nan_nan
2987028           nan_nan
2987029           nan_nan
                ...      
3577510           nan_nan
3577511           nan_nan
3577512           nan_nan
3577513           nan_nan
3577514           nan_nan
3577515           nan_nan
3577516           nan_na

In [98]:
# Inspect the joined string key built from id_02 and D8.
train['id_02'].astype(str) + '_' + train['D8'].astype(str)

TransactionID
2987000                        nan_nan
2987001                        nan_nan
2987002                        nan_nan
2987003                        nan_nan
2987004                    70787.0_nan
2987005                        nan_nan
2987006                        nan_nan
2987007                        nan_nan
2987008                    98945.0_nan
2987009                        nan_nan
2987010                  191631.0_83.0
2987011                   221832.0_nan
2987012                        nan_nan
2987013                        nan_nan
2987014                        nan_nan
2987015                        nan_nan
2987016                    7460.0_26.0
2987017                   61141.0_21.0
2987018                        nan_nan
2987019                        nan_nan
2987020                        nan_nan
2987021                        nan_nan
2987022                        nan_nan
2987023                        nan_nan
2987024                        nan_nan
2987025    

In [99]:
# Inspect the joined string key built from D11 and DeviceInfo.
train['D11'].astype(str) + '_' + train['DeviceInfo'].astype(str)

TransactionID
2987000                                  13.0_nan
2987001                                   nan_nan
2987002                                 315.0_nan
2987003                                   nan_nan
2987004         nan_SAMSUNG SM-G892A Build/NRD90M
2987005                                   0.0_nan
2987006                                   0.0_nan
2987007                                   nan_nan
2987008                            nan_iOS Device
2987009                                 302.0_nan
2987010                               nan_Windows
2987011                                   nan_nan
2987012                                   nan_nan
2987013                                   nan_nan
2987014                                   nan_nan
2987015                                 423.0_nan
2987016                                 nan_MacOS
2987017                               nan_Windows
2987018                                 237.0_nan
2987019                             

In [100]:
# Inspect the joined string key built from DeviceInfo and P_emaildomain.
train['DeviceInfo'].astype(str) + '_' + train['P_emaildomain'].astype(str)

TransactionID
2987000                                         nan_nan
2987001                                   nan_gmail.com
2987002                                 nan_outlook.com
2987003                                   nan_yahoo.com
2987004         SAMSUNG SM-G892A Build/NRD90M_gmail.com
2987005                                   nan_gmail.com
2987006                                   nan_yahoo.com
2987007                                    nan_mail.com
2987008                        iOS Device_anonymous.com
2987009                                   nan_yahoo.com
2987010                               Windows_gmail.com
2987011                                 nan_hotmail.com
2987012                                 nan_verizon.net
2987013                                     nan_aol.com
2987014                                   nan_yahoo.com
2987015                                         nan_nan
2987016                                   MacOS_aol.com
2987017                           

In [101]:
# Inspect the joined string key built from P_emaildomain and C2.
train['P_emaildomain'].astype(str) + '_' + train['C2'].astype(str)

TransactionID
2987000              nan_1.0
2987001        gmail.com_1.0
2987002      outlook.com_1.0
2987003        yahoo.com_5.0
2987004        gmail.com_1.0
2987005        gmail.com_1.0
2987006        yahoo.com_1.0
2987007         mail.com_1.0
2987008    anonymous.com_1.0
2987009        yahoo.com_2.0
2987010        gmail.com_4.0
2987011      hotmail.com_1.0
2987012      verizon.net_2.0
2987013          aol.com_5.0
2987014        yahoo.com_1.0
2987015              nan_4.0
2987016          aol.com_1.0
2987017        yahoo.com_1.0
2987018        gmail.com_1.0
2987019        gmail.com_5.0
2987020        gmail.com_1.0
2987021      gmail.com_120.0
2987022        gmail.com_1.0
2987023        gmail.com_1.0
2987024        gmail.com_5.0
2987025        gmail.com_1.0
2987026           me.com_1.0
2987027        yahoo.com_2.0
2987028              nan_1.0
2987029        gmail.com_1.0
                 ...        
3577510        gmail.com_1.0
3577511          aol.com_3.0
3577512      hotmail.com_1.0


In [103]:
# Inspect the joined string key built from card2 and dist1.
train['card2'].astype(str) + '_' + train['dist1'].astype(str)

# Could this combination be meaningful?
# The card columns may be specific to a person — can they sensibly be evaluated together with dist1?

TransactionID
2987000        nan_19.0
2987001       404.0_nan
2987002     490.0_287.0
2987003       567.0_nan
2987004       514.0_nan
2987005      555.0_36.0
2987006       360.0_0.0
2987007       490.0_nan
2987008       100.0_nan
2987009      111.0_19.0
2987010       352.0_nan
2987011       375.0_nan
2987012       418.0_nan
2987013       303.0_nan
2987014       490.0_nan
2987015       555.0_3.0
2987016       555.0_nan
2987017       111.0_nan
2987018       490.0_5.0
2987019       111.0_nan
2987020       314.0_0.0
2987021       543.0_nan
2987022       583.0_nan
2987023       360.0_4.0
2987024       360.0_nan
2987025       111.0_nan
2987026       148.0_nan
2987027      321.0_17.0
2987028       269.0_nan
2987029       361.0_nan
               ...     
3577510       321.0_nan
3577511       555.0_1.0
3577512       343.0_nan
3577513       555.0_3.0
3577514    555.0_1917.0
3577515      321.0_49.0
3577516      321.0_21.0
3577517       343.0_nan
3577518      298.0_29.0
3577519       452.0_nan
35

In [104]:
# Inspect the joined string key built from card1 and card5.
train['card1'].astype(str) + '_' + train['card5'].astype(str)

# This combination could be meaningful.

TransactionID
2987000    13926_142.0
2987001     2755_102.0
2987002     4663_166.0
2987003    18132_117.0
2987004     4497_102.0
2987005     5937_226.0
2987006    12308_166.0
2987007    12695_226.0
2987008     2803_226.0
2987009    17399_224.0
2987010    16496_134.0
2987011     4461_224.0
2987012     3786_226.0
2987013    12866_226.0
2987014    11839_226.0
2987015     7055_226.0
2987016     1790_226.0
2987017    11492_219.0
2987018     4663_166.0
2987019     7005_226.0
2987020     7875_224.0
2987021    11401_117.0
2987022     1724_226.0
2987023     2392_166.0
2987024    10112_166.0
2987025    15385_224.0
2987026    17868_226.0
2987027    11307_226.0
2987028     8431_224.0
2987029    12932_226.0
              ...     
3577510     9500_226.0
3577511    12059_226.0
3577512    16873_226.0
3577513     2198_166.0
3577514     2487_226.0
3577515     4219_226.0
3577516    17188_226.0
3577517    16873_226.0
3577518    11204_226.0
3577519    18018_117.0
3577520    17150_226.0
3577521    12019_224

In [105]:
# Inspect the joined string key built from card2 and id_20.
train['card2'].astype(str) + '_' + train['id_20'].astype(str)

TransactionID
2987000        nan_nan
2987001      404.0_nan
2987002      490.0_nan
2987003      567.0_nan
2987004    514.0_144.0
2987005      555.0_nan
2987006      360.0_nan
2987007      490.0_nan
2987008    100.0_500.0
2987009      111.0_nan
2987010    352.0_142.0
2987011    375.0_507.0
2987012      418.0_nan
2987013      303.0_nan
2987014      490.0_nan
2987015      555.0_nan
2987016    555.0_575.0
2987017    111.0_600.0
2987018      490.0_nan
2987019      111.0_nan
2987020      314.0_nan
2987021      543.0_nan
2987022      583.0_nan
2987023      360.0_nan
2987024      360.0_nan
2987025      111.0_nan
2987026      148.0_nan
2987027      321.0_nan
2987028      269.0_nan
2987029      361.0_nan
              ...     
3577510      321.0_nan
3577511      555.0_nan
3577512      343.0_nan
3577513      555.0_nan
3577514      555.0_nan
3577515      321.0_nan
3577516      321.0_nan
3577517      343.0_nan
3577518      298.0_nan
3577519      452.0_nan
3577520      292.0_nan
3577521    305.0_139

In [106]:
# Inspect the joined string key built from card5 and P_emaildomain.
train['card5'].astype(str) + '_' + train['P_emaildomain'].astype(str)

TransactionID
2987000              142.0_nan
2987001        102.0_gmail.com
2987002      166.0_outlook.com
2987003        117.0_yahoo.com
2987004        102.0_gmail.com
2987005        226.0_gmail.com
2987006        166.0_yahoo.com
2987007         226.0_mail.com
2987008    226.0_anonymous.com
2987009        224.0_yahoo.com
2987010        134.0_gmail.com
2987011      224.0_hotmail.com
2987012      226.0_verizon.net
2987013          226.0_aol.com
2987014        226.0_yahoo.com
2987015              226.0_nan
2987016          226.0_aol.com
2987017        219.0_yahoo.com
2987018        166.0_gmail.com
2987019        226.0_gmail.com
2987020        224.0_gmail.com
2987021        117.0_gmail.com
2987022        226.0_gmail.com
2987023        166.0_gmail.com
2987024        166.0_gmail.com
2987025        224.0_gmail.com
2987026           226.0_me.com
2987027        226.0_yahoo.com
2987028              224.0_nan
2987029        226.0_gmail.com
                  ...         
3577510        226.0_gmai

In [107]:
# Inspect the joined string key built from addr1 and card1.
train['addr1'].astype(str) + '_' + train['card1'].astype(str)

TransactionID
2987000    315.0_13926
2987001     325.0_2755
2987002     330.0_4663
2987003    476.0_18132
2987004     420.0_4497
2987005     272.0_5937
2987006    126.0_12308
2987007    325.0_12695
2987008     337.0_2803
2987009    204.0_17399
2987010      nan_16496
2987011       nan_4461
2987012     204.0_3786
2987013    330.0_12866
2987014    226.0_11839
2987015     315.0_7055
2987016     170.0_1790
2987017    204.0_11492
2987018     184.0_4663
2987019     264.0_7005
2987020     299.0_7875
2987021    204.0_11401
2987022     299.0_1724
2987023     126.0_2392
2987024    264.0_10112
2987025    441.0_15385
2987026    472.0_17868
2987027    337.0_11307
2987028     251.0_8431
2987029    204.0_12932
              ...     
3577510     204.0_9500
3577511    110.0_12059
3577512    110.0_16873
3577513     315.0_2198
3577514     337.0_2487
3577515     472.0_4219
3577516    204.0_17188
3577517    110.0_16873
3577518    272.0_11204
3577519    264.0_18018
3577520    204.0_17150
3577521      nan_120

## Encoding - count encoding for both train and test
- ornegin train setindeki degiskenlerin value_counts degerlerine gore 13926 degerine sahip card1 43 tane, test setinde ise 13 tane, ikisini birlestiriyor ve hem train hem test setine train[card1_count_full] isimli bir degisken ekleyerek her bir transactionID'nin karsisina onun card1 degeri ne ise toplamda kac tane oldugunu yaziyor, mesela 2987000 id nin card1 degiskeni normalde 13926, eklenen degisken 56, bundan 56 tane var diyor

In [122]:
# Count encoding over train and test combined: every row receives the number of
# times its value (NaN counted as a value) occurs across both frames.
for feature in ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'id_36']:
    full_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
    train[f'{feature}_count_full'] = train[feature].map(full_counts)
    test[f'{feature}_count_full'] = test[feature].map(full_counts)

In [108]:
# Each train row's card1 replaced by how often that card1 value occurs in train and test combined.
train['card1'].map(pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False))

TransactionID
2987000       56
2987001     1338
2987002     1794
2987003     7635
2987004       30
2987005       10
2987006      320
2987007    12732
2987008    11043
2987009     3319
2987010        4
2987011     5959
2987012      403
2987013     1076
2987014     3561
2987015       16
2987016       29
2987017       85
2987018     1794
2987019     1460
2987020      104
2987021      139
2987022     2985
2987023     2604
2987024     6058
2987025     1131
2987026     1009
2987027        5
2987028     1201
2987029     4906
           ...  
3577510    26243
3577511       10
3577512      598
3577513       31
3577514       29
3577515      115
3577516    19606
3577517      598
3577518      610
3577519     2786
3577520      147
3577521       77
3577522      693
3577523      199
3577524     5021
3577525      259
3577526      939
3577527       98
3577528    14606
3577529        4
3577530    14606
3577531    13268
3577532     1373
3577533        7
3577534     6697
3577535     2110
3577536       25


In [113]:
# Combined train + test value counts of card1 (NaN included).
# NOTE(review): `df` is also the loop variable of the group-ratio feature cell; this assignment rebinds it to a Series.
df = pd.concat([train['card1'], test['card1']], ignore_index=True).value_counts(dropna=False)

In [115]:
# Look up the entry for card1 == 13926 in `df`.
df[13926]

56

In [117]:
# Number of rows with card1 == 13926 in train only.
train['card1'].value_counts(dropna=False)[13926]

43

In [118]:
# Number of rows with card1 == 13926 in test only.
test['card1'].value_counts(dropna=False)[13926]

13

In [119]:
# Each train row's id_36 replaced by how often that value (NaN included) occurs in train and test combined.
train['id_36'].map(pd.concat([train['id_36'], test['id_36']], ignore_index=True).value_counts(dropna=False))

TransactionID
2987000    819269
2987001    819269
2987002    819269
2987003    819269
2987004    267353
2987005    819269
2987006    819269
2987007    819269
2987008    267353
2987009    819269
2987010    267353
2987011    267353
2987012    819269
2987013    819269
2987014    819269
2987015    819269
2987016    267353
2987017    267353
2987018    819269
2987019    819269
2987020    819269
2987021    819269
2987022    819269
2987023    819269
2987024    819269
2987025    819269
2987026    819269
2987027    819269
2987028    819269
2987029    819269
            ...  
3577510    819269
3577511    819269
3577512    819269
3577513    819269
3577514    819269
3577515    819269
3577516    819269
3577517    819269
3577518    819269
3577519    819269
3577520    819269
3577521    267353
3577522    819269
3577523    819269
3577524    819269
3577525    819269
3577526    267353
3577527    819269
3577528    819269
3577529    267353
3577530    819269
3577531    267353
3577532    819269
3577533    819

In [121]:
# Combined train + test value counts of id_36, including NaN.
pd.concat([train['id_36'], test['id_36']], ignore_index=True).value_counts(dropna=False)

NaN    819269
F      267353
T       10609
Name: id_36, dtype: int64

## Encoding - count encoding separately for train and test
- ayni islemi bu sefer train test birlestirmeden yapiyor

In [None]:
# Count encoding computed separately per frame: a train row receives the number
# of times its value occurs in train, a test row the number of times it occurs
# in test (NaN counted as a value in both cases).
for feature in ['id_01', 'id_31', 'id_33', 'id_36']:
    train_counts = train[feature].value_counts(dropna=False)
    test_counts = test[feature].value_counts(dropna=False)
    train[f'{feature}_count_dist'] = train[feature].map(train_counts)
    test[f'{feature}_count_dist'] = test[feature].map(test_counts)

In [124]:
# Value counts of id_01 in train only, including NaN.
train['id_01'].value_counts(dropna=False)

 NaN      446307
-5.0       82170
 0.0       19555
-10.0      11257
-20.0      11211
-15.0       5674
-25.0       4623
-45.0       2143
-35.0       1622
-40.0       1385
-100.0      1012
-50.0        709
-30.0        682
-95.0        428
-60.0        410
-55.0        320
-80.0        220
-90.0        214
-70.0         97
-65.0         93
-85.0         87
-75.0         83
-18.0         23
-6.0          15
-12.0         15
-11.0         15
-16.0         13
-21.0         12
-7.0          10
-14.0         10
           ...  
-29.0          2
-71.0          2
-99.0          2
-61.0          2
-46.0          2
-88.0          2
-64.0          2
-34.0          1
-28.0          1
-42.0          1
-43.0          1
-52.0          1
-33.0          1
-89.0          1
-47.0          1
-32.0          1
-51.0          1
-92.0          1
-54.0          1
-57.0          1
-58.0          1
-94.0          1
-63.0          1
-48.0          1
-93.0          1
-72.0          1
-76.0          1
-82.0         

In [125]:
# Each train row's id_01 replaced by how often that value (NaN included) occurs in train.
train['id_01'].map(train['id_01'].value_counts(dropna=False))

TransactionID
2987000    446307
2987001    446307
2987002    446307
2987003    446307
2987004     19555
2987005    446307
2987006    446307
2987007    446307
2987008     82170
2987009    446307
2987010     82170
2987011     82170
2987012    446307
2987013    446307
2987014    446307
2987015    446307
2987016     19555
2987017     82170
2987018    446307
2987019    446307
2987020    446307
2987021    446307
2987022      5674
2987023    446307
2987024    446307
2987025    446307
2987026    446307
2987027    446307
2987028    446307
2987029    446307
            ...  
3577510    446307
3577511    446307
3577512    446307
3577513    446307
3577514    446307
3577515    446307
3577516    446307
3577517    446307
3577518    446307
3577519    446307
3577520    446307
3577521      5674
3577522    446307
3577523    446307
3577524    446307
3577525    446307
3577526     82170
3577527    446307
3577528    446307
3577529     11211
3577530    446307
3577531     82170
3577532    446307
3577533    446