# Feature engineering for the simulated dataset

In [1]:
import numpy as np
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.float_format = "{:.1f}".format

In [2]:
from data.load_sim_data import load_all_sim_data
transactions_df, customers_df, terminals_df = load_all_sim_data()

In [3]:
small_dataset = True
START_DATE = "2018-04-01"
END_DATE = "2018-04-30"
transactions_df = transactions_df[transactions_df['TX_DATETIME'].between(START_DATE, END_DATE, inclusive='both')]

In [4]:
transactions_df.shape

(278302, 10)

## Baseline features
The following features will be used to build the baseline model. With each model iteration more complex features can be added. But these baseline features can be deemed as the bare minimum.

#### Weekend and weekday

In [5]:
def is_weekend(tx_datetime_series):
    dow_is_weekend = tx_datetime_series.weekday() >= 5 # 0 Monday, 6 Sunday
    return int(dow_is_weekend)

In [6]:
transactions_df['TX_ON_WEEKEND'] = transactions_df.TX_DATETIME.apply(is_weekend)

In [7]:
# quick checks
# 1st April is a Sunday
# 2nd April is a Monday
# 14th April is a Saturday
def check_weekend_flag(transactions_df, check_dates={'2018-04-01':1, '2018-04-02':0, '2018-04-14': 1}):
    for check_date, expected_flag in check_dates.items():
        flag_val = transactions_df[transactions_df['TX_DATE'] == check_date]['TX_ON_WEEKEND'].unique()
        assert flag_val == expected_flag, "Weekend flag not correct for: " + str(check_date) + ". Expected " + str(expected_flag) + " but got " + str(flag_val)
    return None

In [8]:
check_weekend_flag(transactions_df, check_dates={'2018-04-01':1, '2018-04-02':0, '2018-04-14': 1})
transactions_df[['TX_DATE', 'TX_ON_WEEKEND']].groupby('TX_DATE').max('').head(10)

Unnamed: 0_level_0,TX_ON_WEEKEND
TX_DATE,Unnamed: 1_level_1
2018-04-01,1
2018-04-02,0
2018-04-03,0
2018-04-04,0
2018-04-05,0
2018-04-06,0
2018-04-07,1
2018-04-08,1
2018-04-09,0
2018-04-10,0


#### At night or day

In [9]:
def is_night(tx_datetime_series):
    # between 12am and 6am we consider night
    txn_at_night = tx_datetime_series.hour < 6
    return int(txn_at_night)

In [10]:
transactions_df['TX_AT_NIGHT'] = transactions_df.TX_DATETIME.apply(is_night)

In [11]:
# Quick checks
transactions_df.sample(3, random_state=12)[['TX_DATETIME','TX_ON_WEEKEND', 'TX_AT_NIGHT']]

Unnamed: 0,TX_DATETIME,TX_ON_WEEKEND,TX_AT_NIGHT
263800,2018-04-28 11:57:34,1,0
221018,2018-04-24 02:31:49,0,1
30264,2018-04-04 06:33:29,0,0


### Customer features
These features will be RFM - Recency, Frequency and Monetary value. These types of features are also commonly used in retail customer analytics.

In [12]:
# Start by building for 1 customer
customer_txns = transactions_df[transactions_df['CUSTOMER_ID'] == 0]
customer_txns.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,TX_ON_WEEKEND,TX_AT_NIGHT
1758,1758,2018-04-01 07:19:05,0,6076,123.6,26345,0,0,0,2018-04-01,1,0
8275,8275,2018-04-01 18:00:16,0,858,77.3,64816,0,0,0,2018-04-01,1,0
8640,8640,2018-04-01 19:02:02,0,6698,46.5,68522,0,0,0,2018-04-01,1,0
12169,12169,2018-04-02 08:51:06,0,6569,54.7,118266,1,0,0,2018-04-02,0,0
15764,15764,2018-04-02 14:05:38,0,7707,63.3,137138,1,0,0,2018-04-02,0,0


In [13]:
def generate_customer_rfm(customer_txns, window_sizes=(1, 7, 30)):
    customer_txns = customer_txns.sort_values("TX_DATETIME")
    customer_txns.index = customer_txns.TX_DATETIME # allows us to use rolling functions

    # Window features such as transaction sum amount, average amount and count
    for window in window_sizes:
        SUB_TX_AMOUNT = customer_txns['TX_AMOUNT'].rolling(str(window) + 'd').sum()
        SUB_TX_COUNT = customer_txns['TX_AMOUNT'].rolling(str(window) + 'd').count()
        SUB_TX_AVG_AMOUNT = SUB_TX_AMOUNT/SUB_TX_COUNT # note that average at start of the data is missing past days so would only have a few days of data.

        customer_txns['CUSTOMER_ID_' + str(window) + '_DAY_WINDOW_SUM_AMOUNT'] = SUB_TX_AMOUNT
        customer_txns['CUSTOMER_ID_' + str(window) + '_DAY_WINDOW_COUNT'] = SUB_TX_COUNT
        customer_txns['CUSTOMER_ID_' + str(window) + '_DAY_WINDOW_AVG_AMOUNT'] = SUB_TX_AVG_AMOUNT

    # Recency feature
    customer_txns['CUSTOMER_ID_DAYS_SINCE_LAST_TXN'] = customer_txns['TX_DATETIME'].diff().dt.days.fillna(0).astype(int)

    # drop index according
    customer_txns.reset_index(drop=True, inplace=True)

    return customer_txns

In [14]:
customer_txns_with_rfm = generate_customer_rfm(customer_txns)
customer_txns_with_rfm.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,...,CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_1_DAY_WINDOW_COUNT,CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_COUNT,CUSTOMER_ID_7_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_COUNT,CUSTOMER_ID_30_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_DAYS_SINCE_LAST_TXN
0,1758,2018-04-01 07:19:05,0,6076,123.6,26345,0,0,0,2018-04-01,...,123.6,1.0,123.6,123.6,1.0,123.6,123.6,1.0,123.6,0
1,8275,2018-04-01 18:00:16,0,858,77.3,64816,0,0,0,2018-04-01,...,200.9,2.0,100.5,200.9,2.0,100.5,200.9,2.0,100.5,0
2,8640,2018-04-01 19:02:02,0,6698,46.5,68522,0,0,0,2018-04-01,...,247.4,3.0,82.5,247.4,3.0,82.5,247.4,3.0,82.5,0
3,12169,2018-04-02 08:51:06,0,6569,54.7,118266,1,0,0,2018-04-02,...,178.6,3.0,59.5,302.2,4.0,75.5,302.2,4.0,75.5,0
4,15764,2018-04-02 14:05:38,0,7707,63.3,137138,1,0,0,2018-04-02,...,241.9,4.0,60.5,365.5,5.0,73.1,365.5,5.0,73.1,0


In [15]:
transactions_df = transactions_df.groupby('CUSTOMER_ID').apply(generate_customer_rfm)

In [16]:
transactions_df = transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)
transactions_df.index = transactions_df['TRANSACTION_ID']

In [17]:
transactions_df.head()

Unnamed: 0_level_0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,...,CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_1_DAY_WINDOW_COUNT,CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_COUNT,CUSTOMER_ID_7_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_COUNT,CUSTOMER_ID_30_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_DAYS_SINCE_LAST_TXN
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2018-04-01 00:00:31,596,3156,57.2,31,0,0,0,2018-04-01,...,57.2,1.0,57.2,57.2,1.0,57.2,57.2,1.0,57.2,0
1,1,2018-04-01 00:02:10,4961,3412,81.5,130,0,0,0,2018-04-01,...,81.5,1.0,81.5,81.5,1.0,81.5,81.5,1.0,81.5,0
2,2,2018-04-01 00:07:56,2,1365,146.0,476,0,0,0,2018-04-01,...,146.0,1.0,146.0,146.0,1.0,146.0,146.0,1.0,146.0,0
3,3,2018-04-01 00:09:29,4128,8737,64.5,569,0,0,0,2018-04-01,...,64.5,1.0,64.5,64.5,1.0,64.5,64.5,1.0,64.5,0
4,4,2018-04-01 00:10:34,927,9906,51.0,634,0,0,0,2018-04-01,...,51.0,1.0,51.0,51.0,1.0,51.0,51.0,1.0,51.0,0


In [18]:
# Quick check - using human eyes, and stopping short of recalculating the cols
random_cust_id = transactions_df['CUSTOMER_ID'].sample(1, random_state=11).values[0]
random_cust_id

4544

In [19]:
transactions_df[transactions_df['CUSTOMER_ID'] == random_cust_id][['TX_DATE', 'CUSTOMER_ID_1_DAY_WINDOW_COUNT', 'CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT', 'CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT']]

Unnamed: 0_level_0,TX_DATE,CUSTOMER_ID_1_DAY_WINDOW_COUNT,CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4266,2018-04-01,1.0,17.1,17.1
6823,2018-04-01,2.0,23.8,11.9
10811,2018-04-02,3.0,55.9,18.6
11920,2018-04-02,4.0,101.4,25.3
15187,2018-04-02,4.0,139.4,34.9
...,...,...,...,...
262052,2018-04-28,6.0,174.5,29.1
263957,2018-04-28,7.0,209.7,30.0
274536,2018-04-29,1.0,43.3,43.3
274689,2018-04-29,2.0,74.2,37.1


In [20]:
transactions_df[transactions_df['CUSTOMER_ID'] == random_cust_id][['TX_DATE', 'CUSTOMER_ID_7_DAY_WINDOW_COUNT', 'CUSTOMER_ID_7_DAY_WINDOW_SUM_AMOUNT', 'CUSTOMER_ID_7_DAY_WINDOW_AVG_AMOUNT']]

Unnamed: 0_level_0,TX_DATE,CUSTOMER_ID_7_DAY_WINDOW_COUNT,CUSTOMER_ID_7_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_AVG_AMOUNT
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4266,2018-04-01,1.0,17.1,17.1
6823,2018-04-01,2.0,23.8,11.9
10811,2018-04-02,3.0,55.9,18.6
11920,2018-04-02,4.0,101.4,25.3
15187,2018-04-02,5.0,156.5,31.3
...,...,...,...,...
262052,2018-04-28,19.0,474.2,25.0
263957,2018-04-28,20.0,509.3,25.5
274536,2018-04-29,18.0,480.3,26.7
274689,2018-04-29,19.0,511.2,26.9


In [21]:
transactions_df[transactions_df['CUSTOMER_ID'] == random_cust_id][['TX_DATE', 'CUSTOMER_ID_30_DAY_WINDOW_COUNT', 'CUSTOMER_ID_30_DAY_WINDOW_SUM_AMOUNT', 'CUSTOMER_ID_30_DAY_WINDOW_AVG_AMOUNT']]

Unnamed: 0_level_0,TX_DATE,CUSTOMER_ID_30_DAY_WINDOW_COUNT,CUSTOMER_ID_30_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_AVG_AMOUNT
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4266,2018-04-01,1.0,17.1,17.1
6823,2018-04-01,2.0,23.8,11.9
10811,2018-04-02,3.0,55.9,18.6
11920,2018-04-02,4.0,101.4,25.3
15187,2018-04-02,5.0,156.5,31.3
...,...,...,...,...
262052,2018-04-28,64.0,1846.6,28.9
263957,2018-04-28,65.0,1881.8,29.0
274536,2018-04-29,66.0,1925.1,29.2
274689,2018-04-29,67.0,1956.0,29.2


In [22]:
transactions_df[transactions_df['CUSTOMER_ID'] == random_cust_id][['TX_DATETIME', 'CUSTOMER_ID_DAYS_SINCE_LAST_TXN']]

Unnamed: 0_level_0,TX_DATETIME,CUSTOMER_ID_DAYS_SINCE_LAST_TXN
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
4266,2018-04-01 11:20:10,0
6823,2018-04-01 15:08:35,0
10811,2018-04-02 06:14:58,0
11920,2018-04-02 08:25:25,0
15187,2018-04-02 13:15:34,0
...,...,...
262052,2018-04-28 09:26:26,0
263957,2018-04-28 12:11:53,0
274536,2018-04-29 13:26:27,1
274689,2018-04-29 13:40:50,0


### Terminal features

Determine a risk score for each terminal.

We will include a delay period in this since the `TX_FRAUD` flag will likely not be known immediately. This will usually change once a customer has reported fraud for example.
Therefore, we will set a delay period of 1w assuming that fraud will be reported within a week.

`Risk = num of fraud txns / num of total txns, for a given period`.

In [23]:
transactions_df.head()

Unnamed: 0_level_0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,...,CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_1_DAY_WINDOW_COUNT,CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_7_DAY_WINDOW_COUNT,CUSTOMER_ID_7_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_SUM_AMOUNT,CUSTOMER_ID_30_DAY_WINDOW_COUNT,CUSTOMER_ID_30_DAY_WINDOW_AVG_AMOUNT,CUSTOMER_ID_DAYS_SINCE_LAST_TXN
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2018-04-01 00:00:31,596,3156,57.2,31,0,0,0,2018-04-01,...,57.2,1.0,57.2,57.2,1.0,57.2,57.2,1.0,57.2,0
1,1,2018-04-01 00:02:10,4961,3412,81.5,130,0,0,0,2018-04-01,...,81.5,1.0,81.5,81.5,1.0,81.5,81.5,1.0,81.5,0
2,2,2018-04-01 00:07:56,2,1365,146.0,476,0,0,0,2018-04-01,...,146.0,1.0,146.0,146.0,1.0,146.0,146.0,1.0,146.0,0
3,3,2018-04-01 00:09:29,4128,8737,64.5,569,0,0,0,2018-04-01,...,64.5,1.0,64.5,64.5,1.0,64.5,64.5,1.0,64.5,0
4,4,2018-04-01 00:10:34,927,9906,51.0,634,0,0,0,2018-04-01,...,51.0,1.0,51.0,51.0,1.0,51.0,51.0,1.0,51.0,0


In [24]:
def generate_terminal_risk(terminal_txns, time_delay=7, window_sizes=(1, 7, 30)):

    terminal_txns.index = terminal_txns['TX_DATETIME']
    DELAY_FRAUD_TXN_COUNT = terminal_txns['TX_FRAUD'].rolling(str(time_delay) + 'd').sum('TX_FRAUD')
    DELAY_ALL_TXN_COUNT = terminal_txns['TX_FRAUD'].rolling(str(time_delay) + 'd').sum('count')

    for window in window_sizes:
        FRAUD_TXN_COUNT_NODELAY = terminal_txns['TX_FRAUD'].rolling(str(time_delay + window) + 'd').sum('TX_FRAUD')
        ALL_TXN_COUNT_NODELAY = terminal_txns['TX_FRAUD'].rolling(str(time_delay + window) + 'd').count()

        FRAUD_TXN_COUNT = FRAUD_TXN_COUNT_NODELAY - DELAY_FRAUD_TXN_COUNT
        ALL_TXN_COUNT = ALL_TXN_COUNT_NODELAY - DELAY_ALL_TXN_COUNT

        RISK_PERC = FRAUD_TXN_COUNT / ALL_TXN_COUNT

        terminal_txns['TERMINAL_ID_' + 'FRAUD_RISK_' + str(window) + '_DAY_WINDOW'] = RISK_PERC
        terminal_txns['TERMINAL_ID_' + 'FRAUD_TXN_COUNT_' + str(window) + '_DAY_WINDOW'] = FRAUD_TXN_COUNT
        terminal_txns['TERMINAL_ID_' + 'TXN_COUNT_' + str(window) + '_DAY_WINDOW'] = ALL_TXN_COUNT

    terminal_txns.fillna(0, inplace=True)
    terminal_txns.reset_index(drop=True, inplace=True)

    return terminal_txns

In [25]:
terminal_txns = transactions_df[transactions_df['TERMINAL_ID'] == 0].copy()
generate_terminal_risk(terminal_txns)

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,...,CUSTOMER_ID_DAYS_SINCE_LAST_TXN,TERMINAL_ID_FRAUD_RISK_1_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_7_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_30_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_30_DAY_WINDOW
0,9579,2018-04-02 01:00:01,3440,0,16.1,90001,1,0,0,2018-04-02,...,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,12806,2018-04-02 09:49:55,3302,0,67.8,121795,1,0,0,2018-04-02,...,0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0
2,24184,2018-04-03 12:14:41,3790,0,26.8,216881,2,0,0,2018-04-03,...,0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,3.0
3,46284,2018-04-05 16:47:41,1125,0,40.5,406061,4,0,0,2018-04-05,...,0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0
4,58807,2018-04-07 06:05:21,1125,0,48.4,540321,6,0,0,2018-04-07,...,0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,5.0
5,61225,2018-04-07 10:18:42,4029,0,34.2,555522,6,0,0,2018-04-07,...,0,0.0,0.0,6.0,0.0,0.0,6.0,0.0,0.0,6.0
6,64116,2018-04-07 14:37:49,3790,0,51.4,571069,6,0,0,2018-04-07,...,0,0.0,0.0,7.0,0.0,0.0,7.0,0.0,0.0,7.0
7,82373,2018-04-09 13:36:59,3554,0,111.1,740219,8,0,0,2018-04-09,...,0,0.0,0.0,8.0,0.0,0.0,8.0,0.0,0.0,8.0
8,88261,2018-04-10 07:53:35,3780,0,32.9,806015,9,0,0,2018-04-10,...,0,0.0,0.0,8.0,0.0,0.0,9.0,0.0,0.0,9.0
9,89762,2018-04-10 10:19:15,562,0,175.6,814755,9,0,0,2018-04-10,...,1,0.0,0.0,8.0,0.0,0.0,10.0,0.0,0.0,10.0


In [26]:
transactions_df = transactions_df.groupby('TERMINAL_ID').apply(generate_terminal_risk)

In [27]:
transactions_df = transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)
transactions_df.index = transactions_df['TRANSACTION_ID']

In [28]:
transactions_df.iloc[transactions_df['TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW'].idxmax()]

TRANSACTION_ID                                             96159
TX_DATETIME                                  2018-04-11 03:02:54
CUSTOMER_ID                                                 3493
TERMINAL_ID                                                  898
TX_AMOUNT                                                   25.0
TX_TIME_SECONDS                                           874974
TX_TIME_DAYS                                                  10
TX_FRAUD                                                       1
TX_FRAUD_SCENARIO                                              2
TX_DATE                                      2018-04-11 00:00:00
TX_ON_WEEKEND                                                  0
TX_AT_NIGHT                                                    1
CUSTOMER_ID_1_DAY_WINDOW_SUM_AMOUNT                         53.0
CUSTOMER_ID_1_DAY_WINDOW_COUNT                               4.0
CUSTOMER_ID_1_DAY_WINDOW_AVG_AMOUNT                         13.3
CUSTOMER_ID_7_DAY_WINDOW_

In [29]:
# Quick check - using human eyes, and stopping short of recalculating the cols
random_terminal_id = transactions_df[transactions_df['TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW'] > 0]['TERMINAL_ID'].sample(1, random_state=11).values[0]
random_terminal_id

6374

In [30]:
transactions_df[(transactions_df['TERMINAL_ID'] == random_terminal_id) & (transactions_df['TX_FRAUD'] == 1)]
# 4th April had a fraud txn of $103

Unnamed: 0_level_0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,TX_DATE,...,CUSTOMER_ID_DAYS_SINCE_LAST_TXN,TERMINAL_ID_FRAUD_RISK_1_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_7_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW,TERMINAL_ID_FRAUD_TXN_COUNT_30_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_30_DAY_WINDOW
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
37467,37467,2018-04-04 19:00:35,398,6374,103.0,327635,3,1,3,2018-04-04,...,0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,3.0


In [31]:
transactions_df[transactions_df['TERMINAL_ID'] == random_terminal_id][['TX_DATETIME', 'TERMINAL_ID_FRAUD_TXN_COUNT_1_DAY_WINDOW', 'TERMINAL_ID_TXN_COUNT_1_DAY_WINDOW', 'TERMINAL_ID_FRAUD_RISK_1_DAY_WINDOW']]

Unnamed: 0_level_0,TX_DATETIME,TERMINAL_ID_FRAUD_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_1_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_1_DAY_WINDOW
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2957,2018-04-01 09:26:24,0.0,1.0,0.0
9465,2018-04-01 23:35:20,0.0,2.0,0.0
23359,2018-04-03 11:06:33,0.0,3.0,0.0
37467,2018-04-04 19:00:35,0.0,3.0,0.0
47145,2018-04-05 19:01:45,0.0,4.0,0.0
57675,2018-04-07 01:36:00,0.0,5.0,0.0
58414,2018-04-07 05:04:22,0.0,6.0,0.0
59839,2018-04-07 08:05:51,0.0,7.0,0.0
60905,2018-04-07 09:49:22,0.0,8.0,0.0
88447,2018-04-10 08:14:17,0.0,7.0,0.0


In [32]:
transactions_df[transactions_df['TERMINAL_ID'] == random_terminal_id][['TX_DATETIME', 'TERMINAL_ID_FRAUD_TXN_COUNT_7_DAY_WINDOW', 'TERMINAL_ID_TXN_COUNT_7_DAY_WINDOW', 'TERMINAL_ID_FRAUD_RISK_7_DAY_WINDOW']]

Unnamed: 0_level_0,TX_DATETIME,TERMINAL_ID_FRAUD_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_7_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_7_DAY_WINDOW
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2957,2018-04-01 09:26:24,0.0,1.0,0.0
9465,2018-04-01 23:35:20,0.0,2.0,0.0
23359,2018-04-03 11:06:33,0.0,3.0,0.0
37467,2018-04-04 19:00:35,0.0,3.0,0.0
47145,2018-04-05 19:01:45,0.0,4.0,0.0
57675,2018-04-07 01:36:00,0.0,5.0,0.0
58414,2018-04-07 05:04:22,0.0,6.0,0.0
59839,2018-04-07 08:05:51,0.0,7.0,0.0
60905,2018-04-07 09:49:22,0.0,8.0,0.0
88447,2018-04-10 08:14:17,0.0,9.0,0.0


In [33]:
transactions_df[transactions_df['TERMINAL_ID'] == random_terminal_id][['TX_DATETIME', 'TERMINAL_ID_FRAUD_TXN_COUNT_30_DAY_WINDOW', 'TERMINAL_ID_TXN_COUNT_30_DAY_WINDOW', 'TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW']]

Unnamed: 0_level_0,TX_DATETIME,TERMINAL_ID_FRAUD_TXN_COUNT_30_DAY_WINDOW,TERMINAL_ID_TXN_COUNT_30_DAY_WINDOW,TERMINAL_ID_FRAUD_RISK_30_DAY_WINDOW
TRANSACTION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2957,2018-04-01 09:26:24,0.0,1.0,0.0
9465,2018-04-01 23:35:20,0.0,2.0,0.0
23359,2018-04-03 11:06:33,0.0,3.0,0.0
37467,2018-04-04 19:00:35,0.0,3.0,0.0
47145,2018-04-05 19:01:45,0.0,4.0,0.0
57675,2018-04-07 01:36:00,0.0,5.0,0.0
58414,2018-04-07 05:04:22,0.0,6.0,0.0
59839,2018-04-07 08:05:51,0.0,7.0,0.0
60905,2018-04-07 09:49:22,0.0,8.0,0.0
88447,2018-04-10 08:14:17,0.0,9.0,0.0
