### Libraries used 

In [1]:
# Necessary imports for this notebook
import os
import collections
import numpy as np
import pandas as pd
import datetime
import time
import random
import faker 
from faker import Faker
from collections import OrderedDict
# For plotting
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Locale weighting handed to Faker: only Spanish (es_ES) data is configured here.
locales=OrderedDict([('es_ES', 3)])
# NOTE: this rebinds the name `faker`. The module brought in by `import faker` above is no
# longer reachable under that name; from here on `faker` is the Faker instance.
faker= Faker(locales)
# Seed Faker (class-level call) so the fake identities are meant to be repeatable across runs.
Faker.seed(0)
# Seaborn theme used by the plots further down.
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

### Sender Profile Generation

In [6]:
def generate_customer_profiles_table(n_customers, random_state=0):
    """Build a table of simulated sender (customer) profiles.

    Parameters
    ----------
    n_customers : int
        Number of profiles to create; SENDER_ID runs from 0 to n_customers-1.
    random_state : int, default 0
        Seed given to NumPy's global generator. It drives the spending parameters only;
        the identity fields come from the notebook-level `faker` object, which this
        function does not re-seed.

    Returns
    -------
    pandas.DataFrame
        One row per sender with columns SENDER_ID, iban, name, last_name, dob, ip,
        mean_amount, std_amount and mean_nb_tx_per_day.
    """
    np.random.seed(random_state)

    column_names = ['SENDER_ID','iban','name','last_name', 'dob','ip',
                    'mean_amount', 'std_amount',
                    'mean_nb_tx_per_day']

    rows = []
    for sender_id in range(n_customers):
        spanish = faker['es_ES']

        # Identity fields (the order of the Faker calls is kept so the random stream is unchanged)
        iban = spanish.iban()
        first_name = spanish.first_name()
        family_name = spanish.last_name()
        birth_date = faker.date_of_birth(minimum_age=18, maximum_age=27)

        # Spending behaviour: arbitrary (but sensible) ranges
        avg_amount = np.random.uniform(0.50,1000)
        amount_spread = avg_amount/2
        avg_tx_per_day = np.random.uniform(0,4)

        ip_address = faker.ipv4()

        rows.append([sender_id, iban, first_name, family_name, birth_date, ip_address,
                     avg_amount, amount_spread,
                     avg_tx_per_day])

    return pd.DataFrame(rows, columns=column_names)

#### Testing sender generating function

In [None]:
n_customers = 4
customer_profiles_table = generate_customer_profiles_table(n_customers, random_state = 0)
customer_profiles_table

### Receiver Profile Generation

In [7]:
def generate_receiver_profiles_table(n_receiver, random_state=0):
    """Build a table of simulated receiver profiles.

    A dedicated Faker instance weighted over the es_ES, it_IT and pt_PT locales
    produces the IBAN and the name fields.

    Parameters
    ----------
    n_receiver : int
        Number of profiles to create; RECEIVER_ID runs from 0 to n_receiver-1.
    random_state : int, default 0
        Seed given to NumPy's global generator. No NumPy draw happens in this function,
        so the seed only affects NumPy state seen by later code.

    Returns
    -------
    pandas.DataFrame
        One row per receiver with columns RECEIVER_ID, iban, name and last_name.
    """
    weighted_locales = OrderedDict([('es_ES', 2), ('it_IT',3), ('pt_PT',2)])
    fake_receiver = Faker(weighted_locales)
    np.random.seed(random_state)

    rows = []
    for receiver_id in range(n_receiver):
        # Keep the iban -> first name -> last name call order of the Faker stream
        iban = fake_receiver.iban()
        first_name = fake_receiver.first_name()
        family_name = fake_receiver.last_name()
        rows.append([receiver_id, iban, first_name, family_name])

    return pd.DataFrame(rows, columns=['RECEIVER_ID','iban','name','last_name'])

In [None]:
n_receivers = 5
receiver_profiles_table = generate_receiver_profiles_table(n_receivers, random_state = 0)
receiver_profiles_table

### Transactions Generation

In [8]:
def generate_transactions_table(customer_profile, receiver_profiles_table, start_date = "2017-09-05", nb_days=10):
    """Simulate the transfers issued by one sender over `nb_days` days.

    Parameters
    ----------
    customer_profile : pandas.Series
        One sender row; the fields SENDER_ID, iban, mean_amount, std_amount and
        mean_nb_tx_per_day are read.
    receiver_profiles_table : pandas.DataFrame
        Receiver table with columns RECEIVER_ID and iban. The receiver is drawn with
        `random.choice` on the RECEIVER_ID column and then looked up with `.loc`, so the
        frame's index labels are assumed to equal RECEIVER_ID (0..n-1) -- TODO confirm
        for tables not built by this notebook.
    start_date : str, default "2017-09-05"
        Origin used to turn the second offsets into TX_DATETIME.
    nb_days : int, default 10
        Number of simulated days.

    Returns
    -------
    pandas.DataFrame
        Columns TX_DATETIME, SENDER_ID, SENDER_IBAN, RECEIVER_ID, RECEIVER_IBAN,
        TX_AMOUNT, TX_TIME_SECONDS, TX_TIME_DAYS. The same columns, in the same order,
        are returned when the sender has no transaction, so per-sender results can be
        concatenated safely.
    """
    raw_columns = ['TX_TIME_SECONDS', 'TX_TIME_DAYS', 'SENDER_ID','SENDER_IBAN','RECEIVER_ID','RECEIVER_IBAN','TX_AMOUNT']
    output_columns = ['TX_DATETIME','SENDER_ID','SENDER_IBAN','RECEIVER_ID','RECEIVER_IBAN','TX_AMOUNT','TX_TIME_SECONDS', 'TX_TIME_DAYS']

    customer_transactions = []

    # Both generators are seeded with the sender id so each sender's history is reproducible
    random.seed(int(customer_profile.SENDER_ID))
    np.random.seed(int(customer_profile.SENDER_ID))

    for day in range(nb_days):

        # Random number of transactions for that day
        nb_tx = np.random.poisson(customer_profile.mean_nb_tx_per_day)

        for tx in range(nb_tx):

            # Time of transaction: around noon, std 20000 seconds, so that most
            # transactions occur during the day.
            time_tx = int(np.random.normal(86400/2, 20000))

            # The receiver is drawn before the time check on purpose: this keeps the
            # consumption of the `random` stream independent of whether the draw is kept.
            receiver_id = random.choice(receiver_profiles_table.RECEIVER_ID)
            iban = receiver_profiles_table.loc[receiver_id,'iban']

            # Keep the transaction only if its time falls strictly inside the day
            if (time_tx>0) and (time_tx<86400):

                # Amount is drawn from a normal distribution
                amount = np.random.normal(customer_profile.mean_amount, customer_profile.std_amount)

                # If amount negative, draw from a uniform distribution instead
                if amount<0:
                    amount = np.random.uniform(0,customer_profile.mean_amount*2)

                amount = np.round(amount,decimals=2)

                customer_transactions.append([time_tx+day*86400, day,customer_profile.SENDER_ID,
                                              customer_profile.iban, receiver_id,iban,amount])

    customer_transactions = pd.DataFrame(customer_transactions, columns=raw_columns)

    if len(customer_transactions)>0:
        customer_transactions['TX_DATETIME'] = pd.to_datetime(customer_transactions["TX_TIME_SECONDS"], unit='s', origin=start_date)
    else:
        # pd.to_datetime(..., origin=...) rejects the non-numeric (object) column of an empty
        # frame, so the empty datetime column is created explicitly.
        customer_transactions['TX_DATETIME'] = pd.Series(dtype='datetime64[ns]')

    return customer_transactions[output_columns]

### Testing Transactions generation

In [None]:
transaction_table_customer_0=generate_transactions_table(customer_profiles_table.iloc[0], receiver_profiles_table,
                                                         start_date = "2015-09-05", 
                                                         nb_days = 5)
transaction_table_customer_0

In [None]:
transactions_df=customer_profiles_table.groupby('SENDER_ID').apply(lambda x : generate_transactions_table(x.iloc[0], receiver_profiles_table, nb_days=5)).reset_index(drop=True)
transactions_df

### Dataset Generation

In [9]:
def generate_dataset(n_customers, n_receivers, nb_days, start_date="2017-09-05"):
    """Generate sender profiles, receiver profiles and the full transaction table.

    Parameters
    ----------
    n_customers : int
        Number of sender profiles.
    n_receivers : int
        Number of receiver profiles.
    nb_days : int
        Number of simulated days per sender.
    start_date : str, default "2017-09-05"
        Date of day 0; forwarded to `generate_transactions_table` so TX_DATETIME
        honours the value requested by the caller.

    Returns
    -------
    tuple
        (customer_profiles_table, receiver_profiles_table, transactions_df), where
        transactions_df is sorted by TX_DATETIME and carries a TRANSACTION_ID column
        numbered from 0 in that order.
    """
    start_time=time.time()
    customer_profiles_table = generate_customer_profiles_table(n_customers, random_state = 0)
    print("Time to generate customer profiles table: {0:.2}s".format(time.time()-start_time))

    start_time=time.time()
    receiver_profiles_table = generate_receiver_profiles_table(n_receivers, random_state = 1)
    print("Time to generate receiver profiles table: {0:.2}s".format(time.time()-start_time))

    start_time=time.time()
    # One call per sender; each group holds exactly one profile row, hence x.iloc[0]
    transactions_df=customer_profiles_table.groupby('SENDER_ID').apply(
        lambda x : generate_transactions_table(x.iloc[0], receiver_profiles_table,
                                               start_date=start_date,
                                               nb_days=nb_days)).reset_index(drop=True)
    print("Time to generate transactions: {0:.2}s".format(time.time()-start_time))

    # Sort transactions chronologically
    transactions_df=transactions_df.sort_values('TX_DATETIME')
    # Reset indices, starting from 0
    transactions_df.reset_index(inplace=True,drop=True)
    transactions_df.reset_index(inplace=True)
    # TRANSACTION_ID are the dataframe indices, starting from 0
    transactions_df.rename(columns = {'index':'TRANSACTION_ID'}, inplace = True)

    return (customer_profiles_table, receiver_profiles_table, transactions_df)

In [None]:
(customer_profiles_table, receiver_profiles_table, transactions_df)=\
    generate_dataset(n_customers = 6000, 
                     n_receivers = 7000, 
                     nb_days=1300, 
                     start_date="2017-09-05")

In [None]:
#print(customer_profiles_table.style.to_latex(label='SENDERS TABLE'))

In [None]:
#print(receiver_profiles_table.style.to_latex(label='RECEIVERS TABLE'))

In [None]:
transactions_df.shape

In [None]:
transactions_df

In [None]:
#print(transactions_df.style.to_latex(buf='transactions_df.tex'))

### Fraud Scenario Generation

In [None]:
def add_frauds(customer_profiles_table, receiver_profiles_table, transactions_df):
    """Label simulated transactions as fraudulent according to two scenarios.

    The function works in place: it adds or overwrites the TX_FRAUD and
    TX_FRAUD_SCENARIO columns of `transactions_df`, may change TX_AMOUNT, and returns
    the same frame.

    Scenario 1 (black-listed or mule receivers): for each day index, up to 2 receivers
    are sampled (seeded with the day) and every transaction sent to them, whatever its
    date, is flagged.

    Scenario 2 (phishing, sender account compromised): for each day index, up to 5
    senders are sampled (seeded with the day). Among their transactions in the 5-day
    window starting that day, 25% (rounded down) get their amount multiplied by 1.15
    and are flagged. Windows of consecutive days overlap and the sampling is
    independent per day, so one transaction may be picked, and inflated, more than once.
    A scenario-2 pick overwrites a scenario-1 label.

    Parameters
    ----------
    customer_profiles_table : pandas.DataFrame
        Must contain SENDER_ID.
    receiver_profiles_table : pandas.DataFrame
        Must contain RECEIVER_ID.
    transactions_df : pandas.DataFrame
        Must contain TX_TIME_DAYS, SENDER_ID, RECEIVER_ID and TX_AMOUNT.

    Returns
    -------
    pandas.DataFrame
        `transactions_df` with the fraud columns filled in.
    """
    # By default, all transactions are genuine
    transactions_df['TX_FRAUD']=0
    transactions_df['TX_FRAUD_SCENARIO']=0

    last_day = transactions_df.TX_TIME_DAYS.max()

    # Never ask for more samples than there are profiles (small test tables would raise otherwise)
    nb_receivers_per_day = min(2, len(receiver_profiles_table))
    nb_clients_per_day = min(5, len(customer_profiles_table))

    # Scenario 1, black-lists, mules. RECEIVERS. LEGITIMATE CLIENT, FRAUDULENT RECEIVER OR IBAN
    for day in range(last_day):

        compromised_receivers = receiver_profiles_table.RECEIVER_ID.sample(n=nb_receivers_per_day, random_state=day).values

        to_compromised_receiver = transactions_df.RECEIVER_ID.isin(compromised_receivers)

        transactions_df.loc[to_compromised_receiver,'TX_FRAUD']=1
        transactions_df.loc[to_compromised_receiver,'TX_FRAUD_SCENARIO']=1

    nb_frauds_scenario_1=transactions_df.TX_FRAUD.sum()
    print("Number of frauds from scenario 1: "+str(nb_frauds_scenario_1))

    # Scenario 2: phishing cases, client account compromised
    for day in range(last_day):

        compromised_clients = customer_profiles_table.SENDER_ID.sample(n=nb_clients_per_day, random_state=day).values

        # Transactions of the compromised clients during the 5 days starting at `day`
        compromised_transactions=transactions_df[(transactions_df.TX_TIME_DAYS>=day) &
                                                    (transactions_df.TX_TIME_DAYS<day+5) &
                                                    (transactions_df.SENDER_ID.isin(compromised_clients))]

        nb_compromised_transactions=len(compromised_transactions)
        random.seed(day)
        # 25% of those transactions become fraudulent, with their amount multiplied by 1.15
        index_ftx= random.sample(list(compromised_transactions.index.values), k=int(nb_compromised_transactions/4))

        transactions_df.loc[index_ftx, 'TX_AMOUNT']= transactions_df.loc[index_ftx, 'TX_AMOUNT']*1.15
        transactions_df.loc[index_ftx, 'TX_FRAUD']=1
        transactions_df.loc[index_ftx, 'TX_FRAUD_SCENARIO']=2

    # Count by scenario label: a difference of TX_FRAUD totals would miss every
    # scenario-2 transaction that had already been flagged by scenario 1.
    nb_frauds_scenario_2=int((transactions_df.TX_FRAUD_SCENARIO==2).sum())
    print("Number of frauds from scenario 2: "+str(nb_frauds_scenario_2))

    return transactions_df

In [None]:
%time transactions_df = add_frauds(customer_profiles_table, receiver_profiles_table, transactions_df)

In [None]:
transactions_df.TX_FRAUD.mean()

In [None]:
transactions_df.TX_FRAUD.sum()

In [None]:
transactions_df.head()

In [None]:
transactions_df[transactions_df.TX_FRAUD_SCENARIO==2].shape

In [None]:
transactions_df[transactions_df.TX_FRAUD_SCENARIO==1].shape

In [None]:
def get_stats(transactions_df):
    """Compute per-day transaction and fraud counts.

    Returns a tuple of three Series indexed by TX_TIME_DAYS:
    the number of transactions, the number of fraudulent transactions, and the number
    of distinct senders having at least one fraudulent transaction that day (only days
    with at least one fraud appear in the last one). The first Series is also printed.
    """
    by_day = transactions_df.groupby(['TX_TIME_DAYS'])

    # Number of transactions per day
    nb_tx_per_day = by_day['SENDER_ID'].count()
    print(nb_tx_per_day)

    # Number of fraudulent transactions per day
    nb_fraud_per_day = by_day['TX_FRAUD'].sum()

    # Number of distinct senders with a fraudulent transaction per day
    fraudulent_only = transactions_df[transactions_df['TX_FRAUD']>0]
    nb_fraudbt_per_day = fraudulent_only.groupby(['TX_TIME_DAYS']).SENDER_ID.nunique()

    return (nb_tx_per_day,nb_fraud_per_day,nb_fraudbt_per_day)

# Per-day series: transactions, fraudulent transactions, distinct senders with a fraud.
(nb_tx_per_day,nb_fraud_per_day,nb_fraudbt_per_day)=get_stats(transactions_df)

n_days=len(nb_tx_per_day)
print(n_days)
# Stack the three series (each divided by 50) into a single long "value" column for plotting.
tx_stats=pd.DataFrame({"value":pd.concat([nb_tx_per_day/50,nb_fraud_per_day/50,nb_fraudbt_per_day/50])})
# Tag every row with the series it came from, in the same order as the concat above.
# NOTE(review): this assumes each of the three series has exactly n_days entries; the third
# one only contains days with at least one fraud, so a fraud-free day would make the
# lengths disagree -- confirm this cannot happen with the generated data.
tx_stats['stat_type']=["nb_tx_per_day"]*n_days+["nb_fraud_per_day"]*n_days+["nb_fraudbt_per_day"]*n_days
# Turn the TX_TIME_DAYS index into a regular column so it can be used as the x axis.
tx_stats=tx_stats.reset_index()

In [12]:
%%capture

# Global seaborn look for this figure.
sns.set(style='darkgrid')
sns.set(font_scale=1.4)

# Keep a handle on the current figure so a later cell can display it by name.
fraud_and_transactions_stats_fig = plt.gcf()

fraud_and_transactions_stats_fig.set_size_inches(15, 8)

# One line per stat_type; seaborn's automatic legend is disabled and replaced further down.
sns_plot = sns.lineplot(x="TX_TIME_DAYS", y="value", data=tx_stats, hue="stat_type", hue_order=["nb_tx_per_day","nb_fraud_per_day","nb_fraudbt_per_day"], legend=False)

sns_plot.set_title('Total transactions, and number of fraudulent transactions \n and number of compromised transfers per day', fontsize=20)
sns_plot.set(xlabel = "Number of days since beginning of data generation", ylabel="Number")

# Fixed y range; the plotted values were divided by 50 when tx_stats was built.
sns_plot.set_ylim([0,500])

# Legend texts, listed in the same order as hue_order above.
labels_legend = ["# transactions per day (/50)", "# fraudulent txs per day (/50)", "# fraudulent transfers per day (/50)"]

# Manual legend anchored outside the axes, to the right of the plot.
sns_plot.legend(loc='upper left', labels=labels_legend,bbox_to_anchor=(1.05, 1), fontsize=15)

In [None]:
fraud_and_transactions_stats_fig

### Saving dataset without Feature Transformation

In [None]:
# Export the labelled transactions as one pickle file per simulated day.
DIR_OUTPUT = "./simulated-data-raw/"

# Create the output folder on first use.
if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)

# Calendar date of day 0, used only to name the files.
# NOTE(review): presumably this must match the start_date used when the dataset was generated -- confirm.
start_date = datetime.datetime.strptime("2017-09-05", "%Y-%m-%d")

# +1 so that the last day index is exported too.
for day in range(transactions_df.TX_TIME_DAYS.max()+1):
    
    # Transactions of that day, ordered by their time offset.
    transactions_day = transactions_df[transactions_df.TX_TIME_DAYS==day].sort_values('TX_TIME_SECONDS')
    
    # File name is the calendar date of the day, e.g. 2017-09-05.pkl for day 0.
    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'
    
    # Protocol=4. It adds support for very large objects, pickling more kinds of objects, and some data format optimizations. It is the default protocol starting with Python 3.8. 
    transactions_day.to_pickle(DIR_OUTPUT+filename_output, protocol=4)

In [None]:
#print(transactions_df.style.to_latex(buf='transactions_df_frauds.tex'))

### Function to read pickle format

In [2]:
# Load a set of pickle files, put them together in a single DataFrame, and order them by time
# It takes as input the folder DIR_INPUT where the files are stored, and the BEGIN_DATE and END_DATE
#import pandas as pd

def read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE):
    """Load the daily pickle files of a date range into one DataFrame.

    Parameters
    ----------
    DIR_INPUT : str
        Folder containing files named YYYY-MM-DD.pkl.
    BEGIN_DATE, END_DATE : str
        Inclusive bounds in YYYY-MM-DD form. File names are compared as strings, which
        matches chronological order for this zero-padded format.

    Returns
    -------
    pandas.DataFrame
        All rows of the selected files, sorted by TRANSACTION_ID, with a fresh 0-based
        index and every -1 value replaced by 0.

    Raises
    ------
    ValueError
        If no .pkl file of the folder falls inside the requested range.
    """
    first_name = BEGIN_DATE+'.pkl'
    last_name = END_DATE+'.pkl'

    # Only genuine .pkl files are considered: other files whose names happen to sort inside
    # the range (notes, checkpoints, ...) must not be handed to read_pickle.
    files = sorted(os.path.join(DIR_INPUT, f)
                   for f in os.listdir(DIR_INPUT)
                   if f.endswith('.pkl') and first_name <= f <= last_name)

    if not files:
        raise ValueError("No .pkl file found in '{0}' between {1} and {2}".format(DIR_INPUT, BEGIN_DATE, END_DATE))

    # NOTE: unpickling can execute arbitrary code; only read files produced by this notebook.
    df_final = pd.concat([pd.read_pickle(f) for f in files])

    df_final=df_final.sort_values('TRANSACTION_ID')
    df_final.reset_index(drop=True,inplace=True)
    #  Note: -1 are missing values for real world data
    df_final=df_final.replace([-1],0)

    return df_final

### Loading data 

In [None]:
DIR_INPUT='./simulated-data-raw' 

BEGIN_DATE = "2017-09-05"
END_DATE = "2021-03-27"

print("Load  files")
%time transactions_df=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),transactions_df.TX_FRAUD.sum()))

In [None]:
transactions_df.head()

In [None]:
transactions_df.dtypes

### Feature Transformations

In [None]:
## TX_DURING_WEEKEND feature
## 1 -> wend, 0 -> otherwise
def is_weekend(tx_datetime):
    """Return 1 when `tx_datetime` falls on a Saturday or Sunday, and 0 otherwise.

    Uses `tx_datetime.weekday()`, where Monday is 0 and Sunday is 6.
    """
    return 1 if tx_datetime.weekday() >= 5 else 0

In [None]:
transactions_df['TX_DURING_WEEKEND']=transactions_df.TX_DATETIME.apply(is_weekend)

In [None]:
transactions_df.dtypes

In [None]:
## 1 -> night, 0 -> otherwise(day)
def is_night(tx_datetime):
    """Return 1 when the transaction hour is 6 or lower (00:00 to 06:59), and 0 otherwise."""
    # The bound is inclusive: hour 6 still counts as night
    return 1 if tx_datetime.hour <= 6 else 0

In [None]:
%time transactions_df['TX_DURING_NIGHT']=transactions_df.TX_DATETIME.apply(is_night)

In [None]:
transactions_df[transactions_df.TX_TIME_DAYS>=30]

In [None]:
#transactions_df.style.to_latex(buf='tx_datefeature.tex')

In [None]:
##sender id transformations

In [None]:
def get_customer_spending_behaviour_features(customer_transactions, windows_size_in_days=[1,7,30]):
    """Add rolling spending features for the transactions of one sender.

    For every window size w (in days) two columns are added:
    SENDER_ID_NB_TX_<w>DAY_WINDOW (number of transactions in the time window ending at
    the current transaction) and SENDER_ID_AVG_AMOUNT_<w>DAY_WINDOW (their mean TX_AMOUNT).

    The input needs the columns TX_DATETIME, TX_AMOUNT and TRANSACTION_ID. The returned
    frame is a chronologically sorted copy indexed by TRANSACTION_ID.
    """
    # Chronological order, with the timestamp as index so offset-based rolling windows work
    history = customer_transactions.sort_values('TX_DATETIME')
    history.index = history.TX_DATETIME

    for n_days in windows_size_in_days:
        rolled_amounts = history['TX_AMOUNT'].rolling('{}d'.format(n_days))

        # The current transaction is always inside its own window, so the count is never 0
        tx_count = rolled_amounts.count()
        mean_amount = rolled_amounts.sum()/tx_count

        # Assign by position: the datetime index may contain duplicated timestamps
        history['SENDER_ID_NB_TX_{}DAY_WINDOW'.format(n_days)] = tx_count.values
        history['SENDER_ID_AVG_AMOUNT_{}DAY_WINDOW'.format(n_days)] = mean_amount.values

    # Back to transaction ids as index
    history.index = history.TRANSACTION_ID

    return history

In [None]:
spending_behaviour_customer_0=get_customer_spending_behaviour_features(transactions_df[transactions_df.SENDER_ID==0])
spending_behaviour_customer_0

In [None]:
%time transactions_df=transactions_df.groupby('SENDER_ID').apply(lambda x: get_customer_spending_behaviour_features(x, windows_size_in_days=[1,7,30]))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

In [None]:
transactions_df

In [None]:
#transactions_df.style.to_latex(buf='tx_senderfeature')

### RECEIVER TRANSFORMATIONS
#### Delay period: the time needed to discover a fraudulent transaction (customer complaint or investigation). Initially set to 1 week.

In [None]:
def get_count_risk_rolling_window(receiver_transactions, delay_period=7, windows_size_in_days=[1,7,30], feature="RECEIVER_ID"):
    """Add delayed rolling transaction-count and risk features for one receiver.

    For each window size w (in days), counts are taken over the last delay_period+w days
    and the counts of the most recent delay_period days are subtracted, which leaves the
    w-day window that precedes the delay period. Two columns are added per window:
    <feature>_NB_TX_<w>DAY_WINDOW (transactions in that window) and
    <feature>_RISK_<w>DAY_WINDOW (share of them flagged in TX_FRAUD).

    The input needs the columns TX_DATETIME, TX_FRAUD and TRANSACTION_ID. The returned
    frame is a chronologically sorted copy indexed by TRANSACTION_ID, with every missing
    value of the frame replaced by 0 (this covers the risk of windows with no transaction).
    """
    history = receiver_transactions.sort_values('TX_DATETIME')
    history.index = history.TX_DATETIME

    def _fraud_and_tx_counts(n_days):
        # Rolling (number of frauds, number of transactions) over the last n_days days
        rolled = history['TX_FRAUD'].rolling('{}d'.format(n_days))
        return rolled.sum(), rolled.count()

    frauds_in_delay, txs_in_delay = _fraud_and_tx_counts(delay_period)

    for n_days in windows_size_in_days:
        frauds_total, txs_total = _fraud_and_tx_counts(delay_period+n_days)

        frauds_in_window = frauds_total-frauds_in_delay
        txs_in_window = txs_total-txs_in_delay

        # Risk score; undefined (NaN) when the window holds no transaction
        risk = frauds_in_window/txs_in_window

        history[feature+'_NB_TX_{}DAY_WINDOW'.format(n_days)] = txs_in_window.values
        history[feature+'_RISK_{}DAY_WINDOW'.format(n_days)] = risk.values

    history.index = history.TRANSACTION_ID

    # Undefined risk scores become 0
    return history.fillna(0)

In [None]:
transactions_df[transactions_df.TX_FRAUD==1]

In [None]:
get_count_risk_rolling_window(transactions_df[transactions_df.RECEIVER_ID==3059], delay_period=7, windows_size_in_days=[1,7,30])

In [None]:
%time transactions_df=transactions_ df.groupby('RECEIVER_ID').apply(lambda x: get_count_risk_rolling_window(x, delay_period=7, windows_size_in_days=[1,7,30], feature="RECEIVER_ID"))
transactions_df=transactions_df.sort_values('TX_DATETIME').reset_index(drop=True)

In [None]:
transactions_df

In [None]:
#print(transactions_df.style.to_latex())

### Saving dataset with Feature Transformations

In [None]:
# Export the feature-enriched transactions as one pickle file per simulated day.
DIR_OUTPUT = "./simulated-data-transformed/"

# Create the output folder on first use.
if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)

# Calendar date of day 0, used only to name the files.
# NOTE(review): presumably this must match the start_date used when the dataset was generated -- confirm.
start_date = datetime.datetime.strptime("2017-09-05", "%Y-%m-%d")

# +1 so that the last day index is exported too.
for day in range(transactions_df.TX_TIME_DAYS.max()+1):
    
    # Transactions of that day, ordered by their time offset.
    transactions_day = transactions_df[transactions_df.TX_TIME_DAYS==day].sort_values('TX_TIME_SECONDS')
    
    # File name is the calendar date of the day, e.g. 2017-09-05.pkl for day 0.
    date = start_date + datetime.timedelta(days=day)
    filename_output = date.strftime("%Y-%m-%d")+'.pkl'
    
    # Protocol=4 required for Google Colab
    transactions_day.to_pickle(DIR_OUTPUT+filename_output, protocol=4)

### Loading data transformed

In [None]:
# Load data from the 2021-03-23 to the 2021-03-27

DIR_INPUT='./simulated-data-transformed' 

BEGIN_DATE = "2021-03-23"
END_DATE = "2021-03-27"

print("Load  files")
%time transactions_df=read_from_files(DIR_INPUT, BEGIN_DATE, END_DATE)
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),transactions_df.TX_FRAUD.sum()))