In [0]:
%restart_python

In [0]:
%sql
-- Create database
CREATE DATABASE IF NOT EXISTS upi_fraud_schema;
USE upi_fraud_schema;

-- USERS TABLE
CREATE TABLE IF NOT EXISTS users (
    user_id STRING NOT NULL,
    default_city STRING,
    default_state STRING,
    CONSTRAINT pk_users PRIMARY KEY (user_id)
) USING DELTA;

-- DEVICES TABLE
CREATE TABLE IF NOT EXISTS devices (
    device_id STRING NOT NULL,
    device_os STRING,
    CONSTRAINT pk_devices PRIMARY KEY (device_id)
) USING DELTA;

-- RECIPIENTS TABLE (Merchants)
CREATE TABLE IF NOT EXISTS recipients (
    merchant_id STRING NOT NULL,
    merchant_category STRING,
    transaction_channel STRING,
    CONSTRAINT pk_recipients PRIMARY KEY (merchant_id)
) USING DELTA;

-- TRANSACTIONS TABLE
CREATE TABLE IF NOT EXISTS transactions (
    transaction_id STRING NOT NULL,
    transaction_date DATE,
    transaction_time STRING,
    user_id STRING NOT NULL,
    merchant_id STRING NOT NULL,
    device_id STRING NOT NULL,
    transaction_type STRING,
    payment_gateway STRING,
    transaction_city STRING,
    transaction_state STRING,
    ip_address STRING,
    transaction_status STRING,
    transaction_frequency INT,
    amount FLOAT,
    transaction_amount_deviation FLOAT,
    days_since_last_transaction INT,
    fraud INT,
    CONSTRAINT pk_transactions PRIMARY KEY (transaction_id),
    CONSTRAINT fk_txn_user FOREIGN KEY (user_id) REFERENCES users(user_id),
    CONSTRAINT fk_txn_device FOREIGN KEY (device_id) REFERENCES devices(device_id),
    CONSTRAINT fk_txn_merchant FOREIGN KEY (merchant_id) REFERENCES recipients(merchant_id)
) USING DELTA;



In [0]:
import uuid
import random
from datetime import datetime, timedelta
from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Initialize Spark and Faker
spark = SparkSession.builder.getOrCreate()
fake = Faker()
random.seed(42)
Faker.seed(42)

# Config
NUM_USERS = 1000
NUM_DEVICES = 800
NUM_RECIPIENTS = 500
NUM_TRANSACTIONS = 20000

states = ["Andhra Pradesh", "Assam", "Bihar", "Chhattisgarh", "Goa", "Gujarat", "Haryana", 
          "Himachal Pradesh", "Jharkhand", "Karnataka", "Kerala", "Madhya Pradesh", 
          "Maharashtra", "Manipur", "Meghalaya", "Mizoram", "Nagaland", "Odisha", 
          "Punjab", "Rajasthan", "Sikkim", "Tamil Nadu", "Telangana", "Tripura", 
          "Uttar Pradesh", "Uttarakhand", "West Bengal"]
cities = [fake.city() for _ in range(200)]
merchant_categories = ["Brand Vouchers and OTT", "Home delivery", "Utilities", "Purchases", 
                       "More Services", "Investment", "Travel bookings", "Donations and Devotion", 
                       "Financial services and Taxes", "Other"]
transaction_types = ["Refund", "Bank Transfer", "Subscription", "Purchase", 
                    "Investment", "Other", "Bill Payment"]
payment_gateways = ["SamplePay", "UPI Pay", "Dummy Bank", "CReditPAY", "Gamma Bank", 
                    "Sigma Bank", "Alpha Bank", "Bank of Data", "Other"]
device_oses = ["Android", "iOS", "Windows", "MacOS"]
transaction_channels = ["Online", "Mobile", "In-store"]
transaction_statuses = ["Completed", "Pending", "Failed"]

def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

# Generate data
rows = []
for _ in range(NUM_TRANSACTIONS):
    transaction_id = f"T{random.randint(10000000,99999999)}"
    date_obj = random_date(datetime(2023,1,1), datetime(2024,6,30))
    date = date_obj.strftime("%d/%m/%y")
    time = fake.time(pattern="%I:%M:%S %p")
    merchant_id = str(uuid.uuid4())
    customer_id = str(uuid.uuid4())
    device_id = str(uuid.uuid4())
    transaction_type = random.choice(transaction_types)
    payment_gateway = random.choice(payment_gateways)
    transaction_city = random.choice(cities)
    transaction_state = random.choice(states)
    ip_address = fake.ipv4()
    transaction_status = random.choice(transaction_statuses)
    device_os = random.choice(device_oses)
    transaction_frequency = random.randint(0, 50)
    merchant_category = random.choice(merchant_categories)
    transaction_channel = random.choice(transaction_channels)
    transaction_amount_deviation = round(random.uniform(-100, 100), 2)
    days_since_last_transaction = random.randint(0, 30)
    amount = round(random.uniform(1, 5000), 2)
    fraud = random.choices([0, 1], weights=[0.7, 0.3])[0]
    rows.append((
        transaction_id, date, time, merchant_id, customer_id, device_id, transaction_type, payment_gateway,
        transaction_city, transaction_state, ip_address, transaction_status, device_os, transaction_frequency,
        merchant_category, transaction_channel, transaction_amount_deviation, days_since_last_transaction, amount, fraud
    ))

schema = StructType([
    StructField("Transaction_ID", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Time", StringType(), True),
    StructField("Merchant_ID", StringType(), True),
    StructField("Customer_ID", StringType(), True),
    StructField("Device_ID", StringType(), True),
    StructField("Transaction_Type", StringType(), True),
    StructField("Payment_Gateway", StringType(), True),
    StructField("Transaction_City", StringType(), True),
    StructField("Transaction_State", StringType(), True),
    StructField("IP_Address", StringType(), True),
    StructField("Transaction_Status", StringType(), True),
    StructField("Device_OS", StringType(), True),
    StructField("Transaction_Frequency", IntegerType(), True),
    StructField("Merchant_Category", StringType(), True),
    StructField("Transaction_Channel", StringType(), True),
    StructField("Transaction_Amount_Deviation", FloatType(), True),
    StructField("Days_Since_Last_Transaction", IntegerType(), True),
    StructField("amount", FloatType(), True),
    StructField("fraud", IntegerType(), True)
])

features_df = spark.createDataFrame(rows, schema=schema)
features_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("features")


In [0]:
%sql
SELECT * FROM features;


Transaction_ID,Date,Time,Merchant_ID,Customer_ID,Device_ID,Transaction_Type,Payment_Gateway,Transaction_City,Transaction_State,IP_Address,Transaction_Status,Device_OS,Transaction_Frequency,Merchant_Category,Transaction_Channel,Transaction_Amount_Deviation,Days_Since_Last_Transaction,amount,fraud
T32794721,09/02/24,05:18:14 AM,ada24d11-a7f5-4ab9-ae76-f27a48182f56,515c6088-6735-4efd-84fe-b248782ba16d,b8bc47cc-b2de-4f1d-a9a2-4eaad61ec922,Investment,Gamma Bank,Jeffreyborough,Andhra Pradesh,147.99.110.163,Pending,Windows,42,Purchases,Mobile,45.99,2,1648.71,1
T25247038,25/07/23,07:32:00 AM,94ac4f38-b64e-4f85-9f3a-19f611eeb2bc,f987da08-f0e6-4218-a13f-734088327212,e5d8fa11-4992-43ab-a3af-a57ab77d5d1a,Bank Transfer,Bank of Data,South Jeffrey,Rajasthan,12.109.89.16,Completed,iOS,22,Home delivery,Online,-29.36,15,877.92,0
T42135383,28/12/23,01:26:40 AM,daeb2f53-7b3e-43d9-830c-10ad79530f84,db183674-bc33-48bb-8f4a-64985f527595,9b51304b-34b5-49e7-92b6-85b5d9e0050b,Purchase,Other,South Jason,Jharkhand,78.43.152.230,Completed,Windows,34,Donations and Devotion,Online,-29.76,20,497.34,1
T25969695,20/05/24,12:22:20 PM,4583b5e3-c96e-4f11-984d-c1c47717bf3a,a1580188-35a1-4d80-9533-32589c963014,6497b176-af8f-484e-9acf-285c6bab98e5,Refund,Gamma Bank,Port Michelleville,Himachal Pradesh,81.150.235.134,Failed,MacOS,2,Investment,Online,-57.31,24,61.63,0
T62055641,06/06/23,06:13:03 PM,9b330940-ec4f-4c36-b438-e98a9f67ad81,328a78f7-5c6f-4c0f-a390-e86ffa542375,6cd32cfe-0e48-4a4d-8566-b4da949c5b22,Purchase,Sigma Bank,Lake Debbie,West Bengal,88.185.135.153,Pending,iOS,14,Other,In-store,79.16,3,1209.02,0
T16341247,26/10/23,07:33:54 AM,5d50d0e2-071b-4712-bb88-c3930945b350,b1a4fd69-3576-4d1d-8155-ea749925ceb4,5e11492f-6e6d-42c4-b184-ec8f967f87f2,Other,Other,Shawhaven,Kerala,164.30.255.113,Failed,MacOS,34,Other,In-store,-28.32,4,4462.02,1
T61341313,02/04/23,12:12:17 PM,255000d9-80e6-4f64-b811-2b1431ce93c1,8fb2acc3-880d-4905-9756-9db9b6519c86,71b847f5-32e3-4c96-bca6-26f01a7954be,Bill Payment,Other,Cassandraton,Sikkim,34.239.218.198,Pending,MacOS,8,More Services,Mobile,33.99,18,2617.84,1
T50942554,17/05/24,10:18:23 AM,ba2d0d16-ad14-4f9d-ac57-559c0d12c535,902d5e28-c4f0-4631-9d1f-975b5afbf38a,d5f431f5-57c6-445b-8622-f3e6310a81ae,Subscription,Bank of Data,Lake Larry,Maharashtra,221.11.38.221,Failed,Android,34,Utilities,In-store,87.79,1,529.11,0
T31728214,05/05/24,12:47:44 PM,08e5efb8-2046-43f3-a5e5-a593524508e5,d3da461d-1490-47f0-9444-01334d933110,cfc337e4-e3d1-46f6-8bee-5fe69a6d814f,Purchase,Other,Karenchester,Kerala,119.59.53.212,Pending,Android,3,Utilities,Mobile,-33.32,4,164.06,0
T27273137,25/05/24,11:16:02 PM,b51f7be2-6aeb-46c2-a75d-e657d5d8ac82,085b7f05-eeb7-44b4-b43d-b1f1458d8049,5d5b9aba-425a-4d1c-99cd-2898a6caf6d9,Investment,SamplePay,Sandersborough,Telangana,22.119.135.128,Completed,Android,30,Donations and Devotion,In-store,-78.19,28,2995.34,0


IMPORTING LIBRARIES


In [0]:
# importing libraries
# %pip install pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as px
import seaborn as sns

In [0]:
# %pip install pyforest
# import pyforest


In [0]:
#Data collection
# -Import the data 
# -Use pandas for data manipulation
df = spark.read.table("features")
# df = features_df.toPandas()



In [0]:
# df.head()

df = spark.read.table("features")
df.show(5, truncate=False)


+--------------+--------+-----------+------------------------------------+------------------------------------+------------------------------------+----------------+---------------+-----------------+-----------------+-------------+------------------+---------+---------------------+----------------------------+-------------------+----------------------------+---------------------------+-------+-----+
|Transaction_ID|Date    |Time       |Merchant_ID                         |Customer_ID                         |Device_ID                           |Transaction_Type|Payment_Gateway|Transaction_City |Transaction_State|IP_Address   |Transaction_Status|Device_OS|Transaction_Frequency|Merchant_Category           |Transaction_Channel|Transaction_Amount_Deviation|Days_Since_Last_Transaction|amount |fraud|
+--------------+--------+-----------+------------------------------------+------------------------------------+------------------------------------+----------------+---------------+-------------

In [0]:
df.columns
# df.printSchema

['Transaction_ID',
 'Date',
 'Time',
 'Merchant_ID',
 'Customer_ID',
 'Device_ID',
 'Transaction_Type',
 'Payment_Gateway',
 'Transaction_City',
 'Transaction_State',
 'IP_Address',
 'Transaction_Status',
 'Device_OS',
 'Transaction_Frequency',
 'Merchant_Category',
 'Transaction_Channel',
 'Transaction_Amount_Deviation',
 'Days_Since_Last_Transaction',
 'amount',
 'fraud']

Data Preparation
-duplicate, missing, unique, removing or dropping unique values

DUPLICATE VALUES


In [0]:
from pyspark.sql.functions import count

# Find all rows that are duplicated across all columns
df.groupBy(df.columns) \
    .count() \
    .filter("count > 1") \
    .show(truncate=False)
    #  .count()


# df.duplictaed()->pandas


+--------------+----+----+-----------+-----------+---------+----------------+---------------+----------------+-----------------+----------+------------------+---------+---------------------+-----------------+-------------------+----------------------------+---------------------------+------+-----+-----+
|Transaction_ID|Date|Time|Merchant_ID|Customer_ID|Device_ID|Transaction_Type|Payment_Gateway|Transaction_City|Transaction_State|IP_Address|Transaction_Status|Device_OS|Transaction_Frequency|Merchant_Category|Transaction_Channel|Transaction_Amount_Deviation|Days_Since_Last_Transaction|amount|fraud|count|
+--------------+----+----+-----------+-----------+---------+----------------+---------------+----------------+-----------------+----------+------------------+---------+---------------------+-----------------+-------------------+----------------------------+---------------------------+------+-----+-----+
+--------------+----+----+-----------+-----------+---------+----------------+--------

MISSING VALUES

In [0]:

from pyspark.sql.functions import col, sum as spark_sum
from functools import reduce

# This will show you how many nulls are present in each column:
missing_counts = features_df.select([
    spark_sum(col(c).isNull().cast("int")).alias(c) for c in features_df.columns
])
missing_counts.show()


# This will display all rows that have at least one null value
features_df.filter(
    reduce(lambda a, b: a | b, (col(c).isNull() for c in features_df.columns))
).show()


# This gives you the number of rows with at least one null:
num_missing_rows = features_df.filter(
    reduce(lambda a, b: a | b, (col(c).isNull() for c in features_df.columns))
).count()
print("Rows with any missing value:", num_missing_rows)

+--------------+----+----+-----------+-----------+---------+----------------+---------------+----------------+-----------------+----------+------------------+---------+---------------------+-----------------+-------------------+----------------------------+---------------------------+------+-----+
|Transaction_ID|Date|Time|Merchant_ID|Customer_ID|Device_ID|Transaction_Type|Payment_Gateway|Transaction_City|Transaction_State|IP_Address|Transaction_Status|Device_OS|Transaction_Frequency|Merchant_Category|Transaction_Channel|Transaction_Amount_Deviation|Days_Since_Last_Transaction|amount|fraud|
+--------------+----+----+-----------+-----------+---------+----------------+---------------+----------------+-----------------+----------+------------------+---------+---------------------+-----------------+-------------------+----------------------------+---------------------------+------+-----+
|             0|   0|   0|          0|          0|        0|               0|              0|          

How to deal with missing values if any
-drop the missing values ( only if proportion is very less)
-fill th emissing values
-froward fill
-back fill
-linear regression
-mean values (but it is sensitive to outliers)
-median values ( not sensitive to outliers)

In [0]:
num_rows = features_df.count()
num_cols = len(features_df.columns)
print(f"Shape: ({num_rows}, {num_cols})")


Shape: (20000, 20)


DEALING WITH UNIQUES VALUES


In [0]:
from pyspark.sql.functions import countDistinct
import pandas as pd

# Compute unique counts for each column
unique_counts = [(c, df.select(countDistinct(c)).first()[0]) for c in df.columns]

# Convert to Spark DataFrame for display
unique_counts_df = spark.createDataFrame(unique_counts, ["column_name", "unique_count"])
unique_counts_df.show(truncate=False)


+----------------------------+------------+
|column_name                 |unique_count|
+----------------------------+------------+
|Transaction_ID              |19997       |
|Date                        |547         |
|Time                        |17967       |
|Merchant_ID                 |20000       |
|Customer_ID                 |20000       |
|Device_ID                   |20000       |
|Transaction_Type            |7           |
|Payment_Gateway             |9           |
|Transaction_City            |197         |
|Transaction_State           |27          |
|IP_Address                  |20000       |
|Transaction_Status          |3           |
|Device_OS                   |4           |
|Transaction_Frequency       |51          |
|Merchant_Category           |10          |
|Transaction_Channel         |3           |
|Transaction_Amount_Deviation|12664       |
|Days_Since_Last_Transaction |31          |
|amount                      |19617       |
|fraud                       |2 

Drop the unique values from the data



In [0]:
df = df.drop("Transaction_ID", "Merchant_ID", "Customer_ID", "Device_ID", "IP_Address")


In [0]:

df.show(5, truncate=False)

+--------+-----------+----------------+---------------+-----------------+-----------------+------------------+---------+---------------------+----------------------------+-------------------+----------------------------+---------------------------+-------+-----+
|Date    |Time       |Transaction_Type|Payment_Gateway|Transaction_City |Transaction_State|Transaction_Status|Device_OS|Transaction_Frequency|Merchant_Category           |Transaction_Channel|Transaction_Amount_Deviation|Days_Since_Last_Transaction|amount |fraud|
+--------+-----------+----------------+---------------+-----------------+-----------------+------------------+---------+---------------------+----------------------------+-------------------+----------------------------+---------------------------+-------+-----+
|03/06/24|08:41:58 AM|Bill Payment    |SamplePay      |South Todd       |Madhya Pradesh   |Pending           |Android  |11                   |Financial services and Taxes|In-store           |13.08               

In [0]:
from pyspark.sql.functions import col

# For fraud transactions (fraud = 1)
fraud_df = df.filter(col('fraud') == 1)

# For legitimate transactions (fraud = 0)
normal_df = df.filter(col('fraud') == 0)

print(f"Fraud cases: {fraud_df.count()}")
print(f"Legitimate cases: {normal_df.count()}")


Fraud cases: 5934
Legitimate cases: 14066


Analysing the fraud df
