In [1]:
import numpy as np
import _pickle as cPickle
import pandas as pd

# 1. A plan and comments.

The first part of the feature engineering checks how similar the sets are, so that we can rely on the diversity of the train
data. When the test data is received from the teacher 'as is' and without any labels, this analysis is vital, because the data may have been divided in a tricky way. This stage has to include at least a surface analysis of each feature, a comparison of the mean and std of the numerical features across the sets, and a similar comparison of the categorical value frequencies.
When a developer splits a large dataset themselves, they may assume that the data is diverse enough and that each set represents the data manifold well.

The second part of the feature engineering is more about insights from data, EDA.

According to insider information, most of the columns are not required at all.
The task is more about building, training and testing an RNN.

### 1.1. Load the raw data. Check the number of rows in each set of the data.

In [None]:
# Read the pre-split raw transactions; the pickle holds one frame per split name.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
with open('../data/transactions_train_valid_test_splits.pickle', 'rb') as raw_file:
    data = cPickle.load(raw_file)
train_data, valid_data, test_data = (data[split] for split in ('train', 'valid', 'test'))

In [3]:
# Row count of every split and its share of the whole dataset.
split_sizes = [len(train_data), len(valid_data), len(test_data)]
total = np.sum(split_sizes)
for split_name, n_rows in zip(('train', 'valid', 'test'), split_sizes):
    # The '%' presentation type multiplies by 100, so 0.71 is shown as 71.00%
    # (formatting the raw fraction with 'f' and appending '%' printed "0.71%").
    print(f'{split_name:<5} {n_rows:>8} = {n_rows / total:.2%}')

train 17397216 = 0.71%
valid  3412698 = 0.14%
test   3576986 = 0.15%


### 1.2. Check columns dtypes, define numerical and categorical.

In [4]:
# For every column of the train split, print the number of distinct values
# and up to ten of them drawn at random (without repetition).
separator = '*' * 30
for col_name in train_data.columns:
    distinct = train_data[col_name].unique()
    n_examples = min(len(distinct), 10)
    print(separator)
    print(f'Column {col_name:>20}: total unique {len(distinct):6}, random examples:')
    print(np.random.choice(distinct, size=n_examples, replace=False))


******************************
Column                 User: total unique   1400, random examples:
[ 432 1866  535 1077 1507 1762 1186 1072  587  305]
******************************
Column                 Card: total unique      9, random examples:
[4 5 8 6 7 2 3 1 0]
******************************
Column                 Year: total unique     30, random examples:
[1997 2008 2007 2000 2010 2020 2009 2011 2013 1993]
******************************
Column                Month: total unique     12, random examples:
[11  5 12  7  4  9  3 10  1  2]
******************************
Column                  Day: total unique     31, random examples:
[14  3 26 13 29  1 28  4 22 11]
******************************
Column                 Time: total unique   1440, random examples:
['08:58' '08:54' '13:49' '02:49' '05:11' '11:52' '18:46' '00:09' '09:51'
 '04:27']
******************************
Column               Amount: total unique  88600, random examples:
['$38.36' '$606.23' '$785.59' '$53.56' '$27

### 1.3. Check the number of unique users in each set and their intersection.

In [5]:
# Unique user ids per split, computed once and reused for the counts and the overlap check.
users_by_split = {
    'train': train_data['User'].unique(),
    'valid': valid_data['User'].unique(),
    'test': test_data['User'].unique(),
}
user_counts = [len(user_ids) for user_ids in users_by_split.values()]
# Sum of the per-split counts; a user present in several splits would be counted more than once.
total = np.sum(user_counts)
for split_name, n_users in zip(users_by_split, user_counts):
    # The '%' presentation type multiplies by 100, so 0.70 is shown as 70.00%
    # (formatting the raw fraction with 'f' and appending '%' printed "0.70%").
    print(f'{split_name:<5} {n_users:>8} = {n_users / total:.2%}')

# An empty array means the two splits share no users.
print('Check intersection')
print('train vs valid', np.intersect1d(users_by_split['train'], users_by_split['valid']))
print('train vs test ', np.intersect1d(users_by_split['train'], users_by_split['test']))
print('valid vs test ', np.intersect1d(users_by_split['valid'], users_by_split['test']))

train     1400 = 0.70%
valid      300 = 0.15%
test       300 = 0.15%
Check intersection
train vs valid []
train vs test  []
valid vs test  []


### 1.4. Delete excessive columns.

In [6]:
# Columns that are not used downstream; removed from every split.
columns_to_drop = ['Card', 'Use Chip', 'Merchant Name', 'Merchant City', 'Merchant State',
                   'Zip', 'MCC', 'Errors?', 'Is Fraud?']
train_data, valid_data, test_data = [
    frame.drop(columns=columns_to_drop)
    for frame in (train_data, valid_data, test_data)
]

### 1.5. Drop index.

In [7]:
# Replace each split's index with a fresh 0..n-1 range, discarding the old index values.
for frame in (train_data, valid_data, test_data):
    frame.reset_index(drop=True, inplace=True)

### 1.6. Remove the $ sign in the 'Amount' column, convert to the float dtype.

In [8]:
def _amount_to_float(amounts):
    """Convert '$'-prefixed amount strings to a float column.

    Parameters
    ----------
    amounts : pd.Series
        The 'Amount' column: string values such as '$38.36', or already-numeric
        values when this cell is run a second time.

    Returns
    -------
    pd.Series
        Numeric values, downcast to the smallest float dtype that can hold them.
    """
    if pd.api.types.is_numeric_dtype(amounts):
        # Already converted (the cell was re-run): there is no sign to strip.
        return pd.to_numeric(amounts, downcast='float')
    # Strip the currency sign explicitly instead of slicing off the first character,
    # so a value without a leading '$' does not silently lose a digit.
    return pd.to_numeric(amounts.str.lstrip('$'), downcast='float')


for frame in (train_data, valid_data, test_data):
    frame['Amount'] = _amount_to_float(frame['Amount'])

### 1.7. Convert dtypes for other columns.

In [9]:
# Cast the user id and the target to compact dtypes in every split.
# NOTE(review): astype('bool') maps every non-empty string to True - confirm that
# 'IsFraud_target' holds numeric/boolean values rather than text labels.
for frame in (train_data, valid_data, test_data):
    frame['User'] = frame['User'].astype('int32')
    frame['IsFraud_target'] = frame['IsFraud_target'].astype('bool')

### 1.8. Join columns related to date and time into a single one.

It is not clear how to feed the data to the model with regard to the datetime. Two options:
 1. Since the payment frequency is not fixed for every user, aggregation may be performed, but it
 may hide an anomaly and reduce the probability of predicting the correct answer.
 2. Use the timestamp only for ordering, and pad from the left up to the max length of a sequence.


In [10]:
# Date/time component columns; they are dropped from each split once 'Datetime' is built.
columns_datetime = ['Year', 'Month', 'Day', 'Time']


def concat(df):
    """Build one 'Year-Month-Day-Time' string per row of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with 'Year', 'Month', 'Day' and 'Time' columns.

    Returns
    -------
    list of str
        The four values of each row converted with ``str`` and joined by '-',
        in row order.
    """
    rows = zip(df['Year'], df['Month'], df['Day'], df['Time'])
    return ['-'.join(map(str, row)) for row in rows]


def _collapse_datetime(frame):
    """Replace the Year/Month/Day/Time columns of ``frame`` with one 'Datetime' column.

    A string 'Datetime' column is first added to ``frame`` itself; the returned
    frame is the result of dropping the component columns, with 'Datetime'
    parsed using the "%Y-%m-%d-%H:%M" format.
    """
    frame['Datetime'] = concat(frame)
    frame = frame.drop(columns_datetime, axis=1)
    frame['Datetime'] = pd.to_datetime(frame['Datetime'], format="%Y-%m-%d-%H:%M")
    return frame


train_data = _collapse_datetime(train_data)
valid_data = _collapse_datetime(valid_data)
test_data = _collapse_datetime(test_data)


### 1.9. Save new data to reopen it instead of the raw data.

In [11]:
# Bundle the processed splits under the 'train'/'valid'/'test' keys and write them to disk.
data = dict(train=train_data, valid=valid_data, test=test_data)
with open('transactions_train_valid_test_splits_postprocessed.pickle', 'wb') as out_file:
    cPickle.dump(data, out_file)
# Unbind the split names; `data` still holds references to the frames.
del train_data, valid_data, test_data

### 1.10. Reopen and check if everything is OK. Close the notebook.

In [12]:
# Load the processed pickle back and rebind the three split names from it.
with open('transactions_train_valid_test_splits_postprocessed.pickle', 'rb') as saved_file:
    data = cPickle.load(saved_file)
train_data, valid_data, test_data = (data[split] for split in ('train', 'valid', 'test'))

In [13]:
# Print the column dtypes of the reloaded train split, then render the frame as the cell output.
print('train', train_data.dtypes, sep='\n')
train_data

train
User                       int32
Amount                   float32
IsFraud_target              bool
Datetime          datetime64[ns]
dtype: object


Unnamed: 0,User,Amount,IsFraud_target,Datetime
0,0,134.089996,True,2002-09-01 06:21:00
1,0,38.480000,True,2002-09-01 06:42:00
2,0,120.339996,True,2002-09-02 06:22:00
3,0,128.949997,True,2002-09-02 17:45:00
4,0,104.709999,True,2002-09-03 06:23:00
...,...,...,...,...
17397211,1999,-54.000000,False,2020-02-27 22:23:00
17397212,1999,54.000000,False,2020-02-27 22:24:00
17397213,1999,59.150002,False,2020-02-28 07:43:00
17397214,1999,43.119999,False,2020-02-28 20:10:00


In [14]:
# Print the column dtypes of the reloaded valid split, then render the frame as the cell output.
print('valid', valid_data.dtypes, sep='\n')
valid_data

valid
User                       int32
Amount                   float32
IsFraud_target              bool
Datetime          datetime64[ns]
dtype: object


Unnamed: 0,User,Amount,IsFraud_target,Datetime
0,4,1300.729980,False,1999-11-26 15:03:00
1,4,106.610001,False,1999-12-01 09:20:00
2,4,16.520000,False,1999-12-01 09:33:00
3,4,9.930000,False,1999-12-01 12:55:00
4,4,81.360001,False,1999-12-01 15:27:00
...,...,...,...,...
3412693,1992,62.000000,False,2020-02-28 06:55:00
3412694,1992,-78.000000,False,2020-02-28 07:08:00
3412695,1992,78.000000,False,2020-02-28 07:15:00
3412696,1992,9.740000,False,2020-02-28 07:27:00


In [15]:
# Print the column dtypes of the reloaded test split, then render the frame as the cell output.
print('test', test_data.dtypes, sep='\n')
test_data

test
User                       int32
Amount                   float32
IsFraud_target              bool
Datetime          datetime64[ns]
dtype: object


Unnamed: 0,User,Amount,IsFraud_target,Datetime
0,5,45.610001,False,2002-01-01 12:24:00
1,5,40.000000,False,2002-01-01 16:47:00
2,5,108.529999,False,2002-01-01 17:40:00
3,5,26.160000,False,2002-01-01 21:59:00
4,5,60.000000,False,2002-01-02 16:05:00
...,...,...,...,...
3576981,1973,17.469999,False,2020-02-27 08:34:00
3576982,1973,17.320000,False,2020-02-27 09:19:00
3576983,1973,18.870001,False,2020-02-28 02:37:00
3576984,1973,3.680000,False,2020-02-28 06:25:00


In [16]:
# Final cell: end the session once the reloaded splits have been inspected.
exit()
