<a href="https://colab.research.google.com/github/majid-shahabi/synthetic_fraud_transactions/blob/Link_to_Colab/synthetic_fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries and Functions

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from typing import Tuple, Union


# Helper functions
def freedman_diaconis_bins(data: np.ndarray) -> int:
    """Calculate the number of histogram bins using the Freedman-Diaconis rule.

    The rule derives the bin width from the interquartile range (IQR), which
    makes it robust to outliers and suitable for skewed data:

        bin_width = 2 * IQR / n ** (1 / 3)

    where ``n`` is the number of finite observations. The bin count is the data
    range divided by that width, rounded up.

    Multi-dimensional input is flattened, and non-finite entries (NaN, +/-inf)
    are ignored so that a few missing values do not make the computation fail.

    Args:
        data (array-like): Numerical values, typically a 1D array.

    Returns:
        int: Number of bins for the histogram, always at least 1. A single bin
        is returned when there are no finite values or when the IQR is zero
        (for example, constant data).

    Example:
        >>> freedman_diaconis_bins(np.arange(1, 101))
        5
    """
    # Work on a flat float array so the sample size and the percentiles refer
    # to the same set of values.
    values = np.asarray(data, dtype=float).ravel()

    # Drop NaN/inf: they would otherwise turn the bin count into NaN/inf,
    # which cannot be converted to an int.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return 1

    # Interquartile range (75th percentile minus 25th percentile)
    upper_quartile, lower_quartile = np.percentile(values, [75, 25])
    iqr = upper_quartile - lower_quartile

    # Bin width according to the Freedman-Diaconis rule
    bin_width = 2 * iqr / np.cbrt(values.size)

    # A zero IQR gives a zero width; fall back to a single bin
    if bin_width <= 0:
        return 1

    # Number of bins needed to cover the full data range
    return max(1, int(np.ceil((values.max() - values.min()) / bin_width)))


def optimal_figsize(data: np.ndarray, bins: Union[int, None] = None, base_width: float = 8.0, base_height: float = 5.0, width_per_bin: float = 0.1) -> Tuple[float, float]:
    """Determine a figure size whose width grows with the number of histogram bins.

    The width is ``base_width + bins * width_per_bin``; the height is returned
    unchanged. When ``bins`` is not given, it is computed from ``data`` with
    ``freedman_diaconis_bins``. Recommended for small datasets.

    Args:
        data (np.ndarray): The input data array to be visualized. Only used
            when ``bins`` is None.
        bins (int, optional): The number of bins for the histogram. If None,
            it is calculated using the Freedman-Diaconis rule.
        base_width (float, optional): The base width of the figure. Defaults to 8.0.
        base_height (float, optional): The height of the figure. Defaults to 5.0.
        width_per_bin (float, optional): Extra width added for every bin.
            Defaults to 0.1.

    Returns:
        Tuple[float, float]: The width and height of the figure.
    """
    # Calculate bins if not provided
    if bins is None:
        bins = freedman_diaconis_bins(data)

    # Widen the figure proportionally to the number of bins
    width = base_width + bins * width_per_bin
    return (width, base_height)


def optimal_figsize_by_range(data: np.ndarray, base_width: float = 8.0, base_height: float = 5.0, scale_factor: float = 0.5) -> Tuple[float, float]:
    """Compute a figure size whose width scales with the spread of the data.

    The distance between the largest and smallest value is multiplied by
    ``scale_factor`` and added to ``base_width``. The height is passed through
    untouched. Recommended for small datasets.

    Args:
        data (np.ndarray): The values that will be plotted.
        base_width (float, optional): Width before any adjustment. Defaults to 8.0.
        base_height (float, optional): Height of the figure. Defaults to 5.0.
        scale_factor (float, optional): Width added per unit of data range.
            Defaults to 0.5.

    Returns:
        Tuple[float, float]: The (width, height) pair for the figure.
    """
    # Spread of the data: largest value minus smallest value
    lowest, highest = np.min(data), np.max(data)
    spread = highest - lowest

    # Base width plus the scaled spread; height stays as given
    return (base_width + spread * scale_factor, base_height)


def figsize_by_samples(n_samples: int, base_width: float = 8.0, base_height: float = 5.0) -> Tuple[float, float]:
    """Compute a figure size whose width grows with the sample count.

    One unit of width is added to ``base_width`` for every full block of 1000
    samples; the height is passed through untouched. Recommended for large
    datasets.

    Args:
        n_samples (int): How many samples will be plotted.
        base_width (float, optional): Width before any adjustment. Defaults to 8.0.
        base_height (float, optional): Height of the figure. Defaults to 5.0.

    Returns:
        Tuple[float, float]: The (width, height) pair for the figure.
    """
    # Whole thousands of samples; any remainder below 1000 adds nothing
    full_thousands = n_samples // 1000
    return (base_width + full_thousands, base_height)

# Import Data

In [3]:
# Upload the dataset through the Colab file picker. The google.colab package
# is only importable on Colab, so when it is missing (e.g. a local Jupyter
# run) the upload step is skipped instead of aborting the notebook.
try:
    from google.colab import files
except ImportError:
    # Nothing was uploaded in this environment
    uploaded = {}
else:
    uploaded = files.upload()

Saving synthetic_fraud_dataset.csv to synthetic_fraud_dataset.csv


In [10]:
# Read the transactions CSV from the working directory into a DataFrame
main_data = pd.read_csv(filepath_or_buffer="synthetic_fraud_dataset.csv")

# Data Exploration

In [12]:
# Data shape as (number of rows, number of columns)
main_data.shape

(50000, 21)

In [11]:
# Preview the first 10 rows of the data
main_data.head(10)

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1
5,TXN_42724,USER_6852,168.55,Online,2023-06-05 20:55:00,33236.94,Laptop,Tokyo,Restaurants,0,...,3,226.85,2,Discover,51,1725.64,OTP,0.0504,0,0
6,TXN_10822,USER_5052,3.79,POS,2023-11-07 01:18:00,86834.18,Tablet,London,Restaurants,0,...,2,298.35,2,Mastercard,168,3757.19,Password,0.0875,0,0
7,TXN_49498,USER_4660,7.08,ATM Withdrawal,2023-02-25 03:43:00,45826.27,Tablet,London,Restaurants,0,...,3,164.38,4,Discover,182,1764.66,Biometric,0.5326,0,1
8,TXN_4144,USER_1584,34.25,ATM Withdrawal,2023-03-09 22:51:00,94392.35,Tablet,Tokyo,Clothing,0,...,7,90.02,3,Visa,24,550.38,Biometric,0.1347,1,0
9,TXN_36958,USER_9498,16.24,POS,2023-09-20 17:27:00,91859.97,Mobile,Mumbai,Travel,0,...,6,474.42,1,Mastercard,124,720.91,PIN,0.3394,0,0


In [6]:
# Data type of each column
main_data.dtypes

Unnamed: 0,0
Transaction_ID,object
User_ID,object
Transaction_Amount,float64
Transaction_Type,object
Timestamp,object
Account_Balance,float64
Device_Type,object
Location,object
Merchant_Category,object
IP_Address_Flag,int64


## Q: How many unique `User_ID`, `Location`, `Merchant_Category`, and `Card_Type` values are present in the dataset?

In [16]:
# Count the distinct values of every column of interest with a single
# nunique() call instead of four copy-pasted statements.
unique_counts = main_data[['User_ID', 'Location', 'Merchant_Category', 'Card_Type']].nunique()

# Keep the individual names (as plain ints) in case other cells reference them
unique_user_ID = int(unique_counts['User_ID'])
unique_location = int(unique_counts['Location'])
unique_merchant_category = int(unique_counts['Merchant_Category'])
unique_card_type = int(unique_counts['Card_Type'])

print("Number of unique User IDs:", unique_user_ID)
print("Number of unique Locations:", unique_location)
print("Number of unique Merchant Categories:", unique_merchant_category)
print("Number of unique Card Types:", unique_card_type)

Number of unique User IDs: 8963
Number of unique Locations: 5
Number of unique Merchant Categorys: 5
Number of unique Card Types: 4


## Q: How many fraudulent transactions are present in the dataset?

In [24]:
# Tally the rows per Fraud_Label value, then report the tally for label 1
fraud_transaction = main_data['Fraud_Label'].value_counts()
print("Number of fraud transactions:", fraud_transaction.loc[1])

Number of fraud transactions: 16067


## Q: What are the top five merchant categories and locations among the fraudulent transactions?

In [32]:
# Restrict to rows flagged as fraud (Fraud_Label equal to 1), then keep the
# five most frequent values of each column of interest.
common_merchant_category = (
    main_data.loc[main_data["Fraud_Label"].eq(1), 'Merchant_Category']
    .value_counts()
    .head(5)
)
common_location = (
    main_data.loc[main_data["Fraud_Label"].eq(1), 'Location']
    .value_counts()
    .head(5)
)

print(f"Among all the fraudulent transactions the five most common merchant categories are: {common_merchant_category}")
print(f"Among all the fraudulent transactions the five most common locations are: {common_location}")

Among all the fraudulent transactions the most common merchant categories are: Merchant_Category
Restaurants    3255
Travel         3235
Groceries      3217
Clothing       3181
Electronics    3179
Name: count, dtype: int64
Among all the fraudulent transactions the most common locations are: Location
Tokyo       3315
New York    3221
Sydney      3194
London      3181
Mumbai      3156
Name: count, dtype: int64
