# Credit Card Fraud Data Generation

Generate data based on https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_3_GettingStarted/SimulatedDataset.html

When generating larger datasets, consider using a 2XL or larger warehouse (WH), or be patient: associating customers with terminals takes some time because it is based on distance calculations.

In [None]:
import snowflake.snowpark as S
from snowflake.snowpark import functions as F
from snowflake.snowpark.session import Session
from snowflake.snowpark import types as T
from snowflake.snowpark import Window
from snowflake.snowpark.context import get_active_session


# Print the version of Snowpark we are using (read from the package's
# __version__), which helps when reproducing or debugging a run of this notebook
print(f"Using Snowpark: {S.__version__}")

In [None]:
import pandas as pd
import numpy as np
import json 
import datetime
import time

# For plotting
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

# Make sure we do not get line breaks when doing show on wide dataframes:
# inject a CSS rule so <pre> output keeps its whitespace instead of wrapping.
# NOTE(review): `display` is not imported in this cell - presumably the
# notebook runtime provides it; confirm.
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

## Connect to Snowflake

This example reads the connection parameters from a JSON file (`../creds.json`) with the following structure:
```
{
    "account":"MY SNOWFLAKE ACCOUNT",
    "user": "MY USER",
    "password":"MY PASSWORD",
    "role":"MY ROLE",
    "warehouse":"MY WH",
    "database":"MY DB",
    "schema":"MY SCHEMA"
}

```

In [None]:
# Load the connection parameters from the JSON credentials file and open a
# Snowpark session with them. The file is read as UTF-8 so non-ASCII values
# are decoded the same way on every platform.
with open('../creds.json', encoding="utf-8") as f:
    connection_parameters = json.load(f)

session = Session.builder.configs(connection_parameters).create()
# An f-string is used instead of "+" concatenation: get_current_role() and
# get_current_warehouse() can return None (e.g. no warehouse configured),
# which would make string concatenation raise a TypeError.
print(f"Current role: {session.get_current_role()}, Current schema: {session.get_fully_qualified_current_schema()}, Current WH: {session.get_current_warehouse()}")

# Create database and schema to store the generated data

In [None]:
# Create the target database and schema when they are missing, then make them
# the session's current database/schema. The SQL clause is IF NOT EXISTS
# (with the trailing S); "IF NOT EXIST" is a syntax error.
session.sql("CREATE DATABASE IF NOT EXISTS CREDIT_CARD_FRAUD").collect()
session.use_database("CREDIT_CARD_FRAUD")
session.sql("CREATE SCHEMA IF NOT EXISTS DATA").collect()
session.use_schema("DATA")

# Generator functions

In [None]:
def snf_generate_customer_profiles_table(snf_session, n_customers, square=100, random_state=0):
    """
    Build the customer profiles as a Snowpark DataFrame.

    The returned DataFrame has one row per generated customer and these columns:
     - CUSTOMER_ID: The customer unique ID (taken from F.seq8)
     - X_CUSTOMER_ID & Y_CUSTOMER_ID: Coordinates of the customer, drawn
       uniformly between 0 and `square`
     - MEAN_AMOUNT: Mean transaction amount, drawn uniformly between 5 and 100
     - STD_AMOUNT: Standard deviation of the transaction amount, set to half
       of MEAN_AMOUNT (amounts are later drawn from a normal distribution)
     - MEAN_NB_TX_PER_DAY: Average number of transactions per day, drawn
       uniformly between 0 and 4 (later used as the mean of a Poisson draw)

    Parameters:
     - snf_session: Snowpark session used to run the row generator
     - n_customers: Number of customer rows to generate
     - square: Upper bound of the x/y coordinates
     - random_state: Accepted for API compatibility; not used by this function
    """
    grid_size = F.lit(square)

    # One generator expression per raw column
    generated_columns = [
        F.seq8(1).as_("customer_id"),
        F.uniform(0, grid_size, F.random()).as_("x_customer_id"),
        F.uniform(0, grid_size, F.random()).as_("y_customer_id"),
        F.uniform(5, F.lit(100), F.random()).as_("mean_amount"),
        F.uniform(0, 4, F.random()).as_("mean_nb_tx_per_day"),
    ]
    output_columns = ['CUSTOMER_ID', 'x_customer_id', 'y_customer_id',
                      'mean_amount', 'std_amount', 'mean_nb_tx_per_day']

    df_generated = snf_session.generator(*generated_columns, rowcount=n_customers)
    # The standard deviation is derived from the mean rather than drawn
    df_with_std = df_generated.with_column("std_amount", (F.col("mean_amount") / F.lit(2)))

    return df_with_std.select(output_columns)


In [None]:
def snf_generate_terminal_profiles_table(snf_session, n_terminals, square=100, random_state=0):
    """
    Build the terminal profiles as a Snowpark DataFrame.

    The returned DataFrame has one row per generated terminal and these columns:
     - TERMINAL_ID: The terminal unique ID (taken from F.seq8)
     - X_TERMINAL_ID & Y_TERMINAL_ID: Coordinates of the terminal, drawn
       uniformly between 0 and `square`

    Parameters:
     - snf_session: Snowpark session used to run the row generator
     - n_terminals: Number of terminal rows to generate
     - square: Upper bound of the x/y coordinates
     - random_state: Accepted for API compatibility; not used by this function
    """
    terminal_id = F.seq8(1).as_("TERMINAL_ID")
    x_coordinate = F.uniform(0, F.lit(square), F.random()).as_("x_terminal_id")
    y_coordinate = F.uniform(0, F.lit(square), F.random()).as_("y_terminal_id")

    return snf_session.generator(terminal_id, x_coordinate, y_coordinate, rowcount=n_terminals)


In [None]:
def add_terminals_to_customer(df_customer_profiles, df_terminal_profiles, r=5):
    """
    Associate customers with their nearby terminals.

    Joins every customer with every terminal whose Euclidean distance to the
    customer is strictly less than `r`, and returns the matching
    (CUSTOMER_ID, TERMINAL_ID) pairs as a Snowpark DataFrame.

    Parameters:
     - df_customer_profiles: DataFrame with X_CUSTOMER_ID / Y_CUSTOMER_ID columns
     - df_terminal_profiles: DataFrame with X_TERMINAL_ID / Y_TERMINAL_ID columns
     - r: Radius around the customer within which terminals are kept
    """
    # SQUARE is called as a SQL function by name
    square_of = F.function("SQUARE")

    delta_x = F.col("X_CUSTOMER_ID") - F.col("X_TERMINAL_ID")
    delta_y = F.col("Y_CUSTOMER_ID") - F.col("Y_TERMINAL_ID")
    distance = F.sqrt(square_of(delta_x) + square_of(delta_y))
    within_radius = distance < F.lit(r)

    df_pairs = df_customer_profiles.join(df_terminal_profiles, within_radius)
    return df_pairs.select("CUSTOMER_ID", "TERMINAL_ID")


In [None]:
def generate_transactions_table(snf_session, df_customer_profiles, df_customer_terminals, start_date, nb_days):
    """
    Generate the transactions of every customer as a Snowpark DataFrame.

    For each customer a UDTF draws, per day, a Poisson-distributed number of
    transactions (mean MEAN_NB_TX_PER_DAY) and, for each of them, a time of day
    and an amount (normal distribution with MEAN_AMOUNT / STD_AMOUNT) at one of
    the customer's available terminals.

    Parameters:
     - snf_session: Snowpark session used to register the temporary UDTF
     - df_customer_profiles: Customer profiles (CUSTOMER_ID, MEAN_AMOUNT,
       STD_AMOUNT, MEAN_NB_TX_PER_DAY)
     - df_customer_terminals: (CUSTOMER_ID, TERMINAL_ID) pairs
     - start_date: Date that TX_TIME_SECONDS is added to in order to get TX_DATETIME
     - nb_days: Number of days to generate transactions for

    Returns a cached DataFrame with the columns TRANSACTION_ID, TX_DATETIME,
    CUSTOMER_ID, TERMINAL_ID, TX_AMOUNT, TX_TIME_SECONDS and TX_TIME_DAYS,
    sorted by TRANSACTION_ID.
    """
    # Columns emitted by the UDTF
    udtf_fields = [
        T.StructField("CUSTOMER_ID", T.IntegerType()),
        T.StructField("TERMINAL_ID", T.IntegerType()),
        T.StructField("TX_AMOUNT", T.DecimalType(38, 6)),
        T.StructField("TX_TIME_SECONDS", T.IntegerType()),
        T.StructField("TX_TIME_DAYS", T.IntegerType()),
    ]
    row_schema = T.StructType(udtf_fields)

    # UDTF handler: called once per customer row, returns that customer's transactions
    class generate_trx_udtf:
        def process(self, customer_id: int, mean_nb_tx_per_day: int, mean_amount: int, std_amount: float, available_terminals: list, nb_days:int):
            import random

            seconds_per_day = 86400
            customer_transactions = []

            # Seed both generators with the customer id
            random.seed(customer_id)
            np.random.seed(customer_id)

            for day in range(nb_days):
                # Number of transactions for this day, based on the customer's mean
                nb_tx = np.random.poisson(mean_nb_tx_per_day)
                for _ in range(nb_tx):
                    # Time of day is centred on noon with a std of 20000 seconds,
                    # so that most transactions occur during the day
                    time_tx = int(np.random.normal(seconds_per_day / 2, 20000))
                    # Drop draws that fall outside the day
                    if time_tx <= 0 or time_tx >= seconds_per_day:
                        continue

                    # Amount comes from a normal distribution; a negative draw
                    # is replaced by a uniform draw
                    amount = np.random.normal(mean_amount, std_amount)
                    if amount < 0:
                        amount = np.random.uniform(0, mean_amount * 2)
                    amount = np.round(amount, decimals=2)

                    # Without any available terminal no transaction is recorded
                    if len(available_terminals) == 0:
                        continue
                    terminal_id = random.choice(available_terminals)
                    customer_transactions.append(
                        (customer_id, terminal_id, amount, time_tx + day * seconds_per_day, day)
                    )
            return customer_transactions

    udtf_input_types = [T.LongType(), T.LongType(), T.LongType(), T.DecimalType(38, 6), T.ArrayType(T.StringType()), T.IntegerType()]
    generate_trx = snf_session.udtf.register(
        generate_trx_udtf,
        name="generate_trx_udtf",
        is_permanent=False,
        packages=["numpy"],
        output_schema=row_schema,
        input_types=udtf_input_types,
        replace=True,
    )

    # One row per customer, with the list of terminals available to that customer
    same_customer = df_customer_profiles.col("CUSTOMER_ID") == df_customer_terminals.col("CUSTOMER_ID")
    df_joined = df_customer_profiles.join(df_customer_terminals, same_customer, lsuffix="_CUST")
    df_grouped = df_joined.group_by(F.col("CUSTOMER_ID_CUST"), F.col("MEAN_NB_TX_PER_DAY"), F.col("MEAN_AMOUNT"), F.col("STD_AMOUNT"))
    df_input = df_grouped.agg(F.array_agg("TERMINAL_ID").as_("AVAILABLE_TERMINALS")).cache_result()

    # Run the UDTF for every customer row
    trx_call = generate_trx(F.col("CUSTOMER_ID_CUST"), F.col("MEAN_NB_TX_PER_DAY"), F.col("MEAN_AMOUNT"), F.col("STD_AMOUNT"), F.col("AVAILABLE_TERMINALS"), F.lit(nb_days))
    df_generated = df_input.join_table_function(trx_call)

    # Turn the second offset into a timestamp and number the transactions by time
    df_dated = df_generated.with_column("TX_DATETIME", F.dateadd("SECONDS", F.col("TX_TIME_SECONDS"), F.lit(start_date)))
    df_numbered = df_dated.with_column("TRANSACTION_ID", F.row_number().over(Window.order_by(F.col("TX_DATETIME"))))

    df_customer_trx = df_numbered.select("TRANSACTION_ID", "TX_DATETIME", "CUSTOMER_ID", "TERMINAL_ID", "TX_AMOUNT", "TX_TIME_SECONDS", "TX_TIME_DAYS")\
        .sort("TRANSACTION_ID")\
        .cache_result()
    return df_customer_trx


In [None]:
def generate_dataset(snf_session, n_customers = 10000, n_terminals = 1000000, nb_days=90, start_date="2018-04-01", square=100, r=5):
    """
    Generate all datasets: customer profiles, terminal profiles and transactions.

    Each step is timed and the elapsed time is printed.

    Parameters:
     - snf_session: Snowpark session used for all generation steps
     - n_customers: Number of customers to generate
     - n_terminals: Number of terminals to generate
     - nb_days: Number of days to generate transactions for
     - start_date: First date of the generated transactions
     - square: Upper bound of the x/y coordinates of customers and terminals
     - r: Radius within which a terminal is associated with a customer

    Returns a tuple (customer profiles, terminal profiles, transactions) of
    Snowpark DataFrames; the transactions are sorted by TX_DATETIME.
    """
    start_time = time.time()
    df_customer_profiles_table = snf_generate_customer_profiles_table(snf_session, n_customers, square, random_state = 0).cache_result()
    # ".2f" prints two decimals; a bare ".2" means two significant digits and
    # switches to exponent notation (e.g. "1.2e+01") for longer timings
    print("Time to generate customer profiles table: {0:.2f}s".format(time.time()-start_time))

    start_time = time.time()
    df_terminal_profiles_table = snf_generate_terminal_profiles_table(snf_session, n_terminals, square, random_state = 1).cache_result()
    print("Time to generate terminal profiles table: {0:.2f}s".format(time.time()-start_time))

    start_time = time.time()
    # Find the terminals within radius r of each customer; r is forwarded so
    # that the caller's value is honoured rather than the helper's default
    df_customer_terminals = add_terminals_to_customer(df_customer_profiles_table, df_terminal_profiles_table, r=r).cache_result()
    print("Time to associate terminals to customers: {0:.2f}s".format(time.time()-start_time))

    start_time = time.time()
    df_transactions = generate_transactions_table(snf_session, df_customer_profiles_table, df_customer_terminals, start_date, nb_days)
    print("Time to generate transactions: {0:.2f}s".format(time.time()-start_time))
    df_transactions = df_transactions.sort("TX_DATETIME")

    return (df_customer_profiles_table, df_terminal_profiles_table, df_transactions)


In [None]:
"""
    Function to generate fraudulent transactions.
    
    It generates 3 different fraud scenarios:
        Scenario 1 - all transactions above 220
        Scenario 2 - all transactions for the fraudulent terminals between the date the fraud terminal is selected and 28 days after
        Scenario 3 - about one third of the transactions (amount multiplied by 5) for the fraudulent customers between the date the fraud customer is selected and 14 days after

"""
def add_frauds(transactions_df):
    """
    Label the transactions with fraud scenarios.

    Takes a transactions DataFrame (uses the columns TX_DATETIME, CUSTOMER_ID,
    TERMINAL_ID, TX_AMOUNT, TX_TIME_SECONDS and TX_TIME_DAYS) and returns a
    DataFrame with the columns TX_DATETIME, CUSTOMER_ID, TERMINAL_ID,
    TX_AMOUNT, TX_TIME_SECONDS, TX_TIME_DAYS, TX_FRAUD_SCENARIO and TX_FRAUD.

    TX_FRAUD_SCENARIO is set by the first matching rule, in this order:
     1 - TX_AMOUNT is above 220
     2 - the transaction matched a selected terminal (28 day window)
     3 - the transaction matched a selected customer transaction (14 day window)
     0 - none of the above
    TX_FRAUD is 1 when TX_FRAUD_SCENARIO is above 0, otherwise 0.
    """
    
    # Pick up to 3 random rows per day (TX_TIME_DAYS) and keep their customers as the fraudulent ones.
    # The pick is made over transaction rows, not over distinct customers.
    df_rand_cust = transactions_df.select(F.col("TX_TIME_DAYS"), F.col("CUSTOMER_ID"), F.row_number().over(Window.partition_by(F.col("tx_time_days")).order_by(F.random())).as_("R_NR"))\
                                    .filter(F.col("R_NR").in_(1,2, 3)).select("TX_TIME_DAYS","CUSTOMER_ID" ).sort("TX_TIME_DAYS").cache_result()
    
    # Get the transactions that will be fraudulent for those customers, i.e. about one third (0.33, rounded)
    # of their transactions in the 14 days starting on the day the customer was selected.
    # NOTE(review): ROWS is partitioned by "CUSTOMER_ID" while R_NR uses "customer_id_t2" - presumably both
    # resolve to the same column through the alias defined in this select; confirm.
    df_fraud_cust_trx = transactions_df.join(df_rand_cust, ((transactions_df["customer_id"] == df_rand_cust["CUSTOMER_ID"]) 
                          & ((transactions_df["TX_TIME_DAYS"] >= df_rand_cust["TX_TIME_DAYS"]) 
                                 & (transactions_df["TX_TIME_DAYS"] < (df_rand_cust["TX_TIME_DAYS"] + F.lit(14)))))
                          ,lsuffix="_T2", rsuffix="_CF")\
                    .select(F.col("TX_DATETIME")
                            , F.col("customer_id_t2").as_("CUSTOMER_ID"),F.row_number().over(Window.partition_by(F.col("customer_id_t2")).order_by(F.random())).as_("R_NR")
                           , F.count("*").over(Window.partition_by(F.col("CUSTOMER_ID"))).as_("ROWS"))\
                    .filter(F.col("R_NR") <= F.round(F.col("ROWS") * F.lit(0.33))).cache_result()

    # Pick up to 2 random rows per day (TX_TIME_DAYS) and keep their terminals as the fraudulent ones
    df_rand_term = transactions_df.select(F.col("TX_TIME_DAYS"), F.col("TERMINAL_ID"), F.row_number().over(Window.partition_by(F.col("tx_time_days")).order_by(F.random())).as_("R_NR"))\
                                    .filter(F.col("R_NR").in_(1,2)).select("TX_TIME_DAYS", "TERMINAL_ID" ).sort("TX_TIME_DAYS").cache_result()
    
    
    # Left-join the transactions with the fraudulent terminals (28 day window from the selection day)
    # and with the fraudulent customer transactions (same customer and TX_DATETIME), then derive the
    # fraud columns. Transactions matched to a fraudulent customer get their amount multiplied by 5.
    # NOTE(review): the scenario 1 test on TX_AMOUNT sits in the same select that redefines the
    # TX_AMOUNT alias - confirm whether it sees the original or the multiplied amount.
    # NOTE(review): a transaction matching several selected terminals/customer rows may presumably be
    # returned more than once by these joins; verify if duplicates matter downstream.
    df_customer_trx_fraud = transactions_df.join(df_rand_term, 
                         ((transactions_df["terminal_id"] == df_rand_term["TERMINAL_ID"]) 
                          & ((transactions_df["TX_TIME_DAYS"] >= df_rand_term["TX_TIME_DAYS"]) 
                                 & (transactions_df["TX_TIME_DAYS"] < (df_rand_term["TX_TIME_DAYS"] + F.lit(28)))))
                         , how="leftouter", lsuffix="_T1", rsuffix="_TF")\
                    .join(df_fraud_cust_trx, 
                         ((transactions_df["customer_id"] == df_fraud_cust_trx["CUSTOMER_ID"]) 
                          & (transactions_df["TX_DATETIME"] == df_fraud_cust_trx["TX_DATETIME"]))
                         , how="leftouter", lsuffix="_T2", rsuffix="_CF")\
                    .select(F.col("TX_DATETIME_T2").as_("TX_DATETIME"), F.col("CUSTOMER_ID_T2").as_("CUSTOMER_ID"), F.col("TERMINAL_ID_T1").as_("TERMINAL_ID")
                           ,F.iff(F.col("CUSTOMER_ID_CF").is_not_null(), F.col("TX_AMOUNT")* F.lit(5), F.col("TX_AMOUNT")).as_("TX_AMOUNT") 
                            , "TX_TIME_SECONDS", F.col("TX_TIME_DAYS_T1").as_("TX_TIME_DAYS") ,
                            F.when(F.col("TX_AMOUNT") > F.lit(220), F.lit(1)).when(F.col("TERMINAL_ID_TF").is_not_null(), F.lit(2))\
                            .when(F.col("CUSTOMER_ID_CF").is_not_null(), F.lit(3)).otherwise(F.lit(0)).as_("TX_FRAUD_SCENARIO"))\
                    .with_column("TX_FRAUD", F.iff(F.col("TX_FRAUD_SCENARIO") > F.lit(0), F.lit(1), F.lit(0)))
    
    return df_customer_trx_fraud


In [None]:
# Build the simulated customer, terminal and transaction DataFrames
(
    df_customer_profiles_table,
    df_terminal_profiles_table,
    df_transactions,
) = generate_dataset(
    session,
    n_customers=10000,
    n_terminals=20000,
    nb_days=180,
    start_date="2023-01-01",
    square=100,
    r=5,
)

In [None]:
# Report the row count of each generated DataFrame
print(f"Number of customers: {df_customer_profiles_table.count()}")
print(f"Number of terminals: {df_terminal_profiles_table.count()}")
print(f"Number of transactions: {df_transactions.count()}")

Look at the distribution of the transactions generated 

In [None]:
distribution_amount_times_fig, ax = plt.subplots(1, 2, figsize=(18,4))

# Sample 10000 amounts and 10000 transaction times from the first 10 days and
# pull them to the client as numpy arrays. Both samples use TX_TIME_DAYS < 10,
# so the two plots describe the same period (the time axis below covers days 0-9).
amount_val = df_transactions.filter(F.col("TX_TIME_DAYS") < F.lit(10)).select("TX_AMOUNT").sample(n=10000).to_pandas()['TX_AMOUNT'].values
time_val = df_transactions.filter(F.col("TX_TIME_DAYS") < F.lit(10)).select("TX_TIME_SECONDS").sample(n=10000).to_pandas()['TX_TIME_SECONDS'].values

# Left plot: histogram of the transaction amounts
sns.histplot(amount_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of transaction amounts', fontsize=14)
ax[0].set_xlim([min(amount_val), max(amount_val)])
ax[0].set(xlabel = "Amount", ylabel="Number of transactions")

# Right plot: histogram of the transaction times.
# We divide the time variables by 86400 to transform seconds to days in the plot
sns.histplot(time_val/86400, ax=ax[1], color='b', bins = 100,)
ax[1].set_title('Distribution of transaction times', fontsize=14)
ax[1].set_xlim([min(time_val/86400), max(time_val/86400)])
ax[1].set_xticks(range(10))
ax[1].set(xlabel = "Time (days)", ylabel="Number of transactions")

Save the generated data into tables

In [None]:
# Persist the three generated DataFrames as tables; mode="overwrite" replaces
# any table of the same name left over from a previous run
df_transactions.write.save_as_table("CUSTOMER_TRANSACTIONS_RAW", mode="overwrite")
df_customer_profiles_table.write.save_as_table("CUSTOMER_PROFILES", mode="overwrite")
df_terminal_profiles_table.write.save_as_table("TERMINAL_PROFILES", mode="overwrite")

Create a DataFrame using the transaction table

In [None]:
# Reference the saved raw transactions table instead of the in-memory DataFrame
df_customer_trx_raw = session.table("CUSTOMER_TRANSACTIONS_RAW")

In [None]:
# Preview rows of the raw transactions table
df_customer_trx_raw.show()

Add the fraud scenarios

In [None]:
# Label the raw transactions with the fraud scenarios and preview the result
df_transactions_fraud = add_frauds(df_customer_trx_raw)
df_transactions_fraud.show()

In [None]:
# Number of transactions per fraud scenario (0 means not fraudulent)
df_transactions_fraud.group_by("TX_FRAUD_SCENARIO").count().show()

In [None]:
# Persist the labelled transactions; mode="overwrite" replaces an existing table
df_transactions_fraud.write.save_as_table("CUSTOMER_TRANSACTIONS_FRAUD", mode="overwrite")

In [None]:
# Close the Snowpark session now that all tables have been written
session.close()