In [121]:
import os
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data

In [None]:
# Select the split to process: True -> training CSV, False -> test CSV.
train_data = True
# Fraction of cards to discard before feature engineering; a falsy value keeps every card.
drop_card_frac = None

# Input CSV and output directory both follow from the selected split.
load_path, save_dir = (
    ("../../data/fraudTrain.csv", "graphs/train")
    if train_data
    else ("../../data/fraudTest.csv", "graphs/test")
)

# Load data

In [123]:

# Load the raw transaction table of the selected split.
df = pd.read_csv(load_path)

# Drop the unnecessary index column 'Unnamed: 0'; tolerate files that do not have it.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Parse date/time columns; values that cannot be parsed become NaT.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Keep identifiers as strings so they are never handled as numbers.
df['cc_num'] = df['cc_num'].astype(str)
df['trans_num'] = df['trans_num'].astype(str)

# Store categorical/text columns with the pandas category dtype.
categorical_cols = ['merchant', 'category', 'first', 'last', 'gender',
                    'street', 'city', 'state', 'zip', 'job']
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Transaction time as a Unix timestamp in whole seconds. The column is pinned to
# nanosecond resolution first, so the division by 10**9 stays valid whatever
# resolution pd.to_datetime selected.
df['unix_trans_time'] = (
    df['trans_date_trans_time'].astype('datetime64[ns]').astype('int64') // 10**9
)
# Age in years at transaction time (365.25 days per year accounts for leap years).
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25

# Number of distinct categories per merchant, broadcast to every row.
# "merchant" is categorical: observed=True is passed explicitly because relying on
# the default raises a pandas FutureWarning; the per-row result is the same.
merchant_category_counts = (
    df.groupby("merchant", observed=True)["category"].transform("nunique")
)
df["nb_categories"] = merchant_category_counts

print(df.dtypes)

trans_date_trans_time    datetime64[ns]
cc_num                           object
merchant                       category
category                       category
amt                             float64
first                          category
last                           category
gender                         category
street                         category
city                           category
state                          category
zip                            category
lat                             float64
long                            float64
city_pop                          int64
job                            category
dob                      datetime64[ns]
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
unix_trans_time                   int64
age                             float64
nb_categories                     int64


  merchant_category_counts = df.groupby("merchant")["category"].transform("nunique")


# Create node ID mappings

In [124]:
# Integer identifiers for the three kinds of graph entities (cards, merchants, transactions).

# Cards: category codes of the distinct card numbers.
card_ids = df["cc_num"].astype("category").cat.codes
# Merchants: category codes of the distinct merchants.
merchant_ids = df["merchant"].astype("category").cat.codes

df["card_id"] = card_ids
df["merchant_id"] = merchant_ids

# Transactions: one per row, numbered in the current row order.
df["transaction_id"] = range(len(df))


In [125]:
# Report the number of transaction rows, distinct cards and distinct merchants.
for label, count in (
    ("Number of transaction : ", len(df)),
    ("Number of cards : ", card_ids.nunique()),
    ("Number of merchants : ", merchant_ids.nunique()),
):
    print(label, count)

Number of transaction :  555719
Number of cards :  924
Number of merchants :  693


## Subsample the dataset by card

In [126]:
def drop_cards(df, card_col="card_id", frac=0.8, seed=42):
    """
    Randomly drop a fraction of the cards and keep every transaction of the others.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions; must contain the column ``card_col``.
    card_col : str
        Column identifying the card of each transaction.
    frac : float
        Fraction of the distinct cards to DROP, in [0, 1]. ``1 - frac`` of the
        cards are kept, rounded down to a whole number of cards.
    seed : int
        Seed of the NumPy random generator, so the selection is reproducible.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``df`` whose card was kept.

    Raises
    ------
    ValueError
        If ``frac`` is not within [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError(f"frac must be between 0 and 1, got {frac!r}")

    rng = np.random.default_rng(seed)

    # Distinct cards present in the data
    cards = df[card_col].unique()

    # 1 - frac is usually not exact in binary floating point
    # (1 - 0.8 == 0.19999999999999996), so truncating the raw product can lose a
    # card: 10 cards with frac=0.8 gave 1 instead of 2. Rounding the float noise
    # away before flooring keeps the intended count.
    n_keep = int(np.floor(np.round(len(cards) * (1 - frac), 9)))

    # Random sample of the cards to keep, without replacement
    keep_cards = rng.choice(cards, size=n_keep, replace=False)

    # Keep all transactions of the sampled cards
    return df[df[card_col].isin(keep_cards)].copy()

if drop_card_frac:
    # Keep only the transactions of the cards that survive the subsampling.
    df = drop_cards(df, frac=drop_card_frac)

    # Recompute the node identifiers on the reduced table.
    card_ids = df["cc_num"].astype("category").cat.codes
    merchant_ids = df["merchant"].astype("category").cat.codes
    df["card_id"] = card_ids
    df["merchant_id"] = merchant_ids
    # One transaction per row, numbered in the current row order.
    df["transaction_id"] = range(len(df))

    # Report the size of the reduced dataset.
    for label, count in (
        ("Number of transaction : ", len(df)),
        ("Number of cards : ", card_ids.nunique()),
        ("Number of merchants : ", merchant_ids.nunique()),
    ):
        print(label, count)

# Build node features

In [127]:
# Default time gap (in seconds) used when a transaction has no predecessor
FEATURE_WINDOW = 3600

# ----------------------------------------------------
# Integer encoding of the categorical variables
# ----------------------------------------------------
for source_col in ("category", "gender", "job"):
    df[f"{source_col}_idx"] = df[source_col].astype("category").cat.codes

# ----------------------------------------------------
# Calendar features
# ----------------------------------------------------
tx_time = df["trans_date_trans_time"].dt
df["hour"] = tx_time.hour
df["dayofweek"] = tx_time.dayofweek
# Saturday (5) and Sunday (6)
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
# ANTI-FRAUD: night-time transactions (hours 0 to 5 inclusive)
df["is_night_tx"] = df["hour"].between(0, 5).astype(int)

# ----------------------------------------------------
# Global ordering used by all history-based features
# ----------------------------------------------------
df = df.sort_values(["card_id", "unix_trans_time"]).reset_index(drop=True)

# ----------------------------------------------------
# Seconds elapsed since the card's previous transaction
# (FEATURE_WINDOW for the first transaction of a card)
# ----------------------------------------------------
card_times = df.groupby("card_id")["unix_trans_time"]
df["card_time_since_prev_tx"] = card_times.diff().fillna(FEATURE_WINDOW)

# ----------------------------------------------------
# Historical amount statistics of the card, computed over the
# transactions strictly before the current one
# ----------------------------------------------------
for stat in ("mean", "std", "max", "min"):
    # Running statistic including the current row, re-indexed like df
    running = getattr(df.groupby("card_id")["amt"].expanding(), stat)()
    running = running.reset_index(level=0, drop=True)
    # Shift INSIDE each card. A plain Series.shift() on the concatenated groupby
    # result would give the first transaction of every card the final statistic
    # of the previous card instead of "no history" (0).
    df[f"card_amt_{stat}"] = running.groupby(df["card_id"]).shift().fillna(0)

# Gaps to the historical max/min
df["amt_minus_prev_max"] = df["amt"] - df["card_amt_max"]
df["amt_minus_prev_min"] = df["amt"] - df["card_amt_min"]

# ----------------------------------------------------
# Z-score of the amount
# ----------------------------------------------------
# Minimum number of earlier transactions required before a z-score is computed
MIN_TX = 2
# Number of earlier transactions of the same card (0 for the first one)
df["card_tx_count"] = df.groupby("card_id").cumcount()
df["amt_zscore"] = np.where(
    df["card_tx_count"] < MIN_TX,
    0,
    (df["amt"] - df["card_amt_mean"]) / (df["card_amt_std"] + 1e-6)
)

# ANTI-FRAUD: ratio to the card's historical mean (1.0 when there is no positive mean)
df["amt_vs_card_mean_ratio"] = np.where(
    df["card_amt_mean"] > 0,
    df["amt"] / (df["card_amt_mean"] + 1e-6),
    1.0
)

# ----------------------------------------------------
# Frequency features
# ----------------------------------------------------
df["unix_trans_time_dt"] = pd.to_datetime(
    df["unix_trans_time"],
    unit="s"
)

# For each transaction: count / amount sum of the SAME card's transactions in the
# time window that precedes it.
#  - closed="left" makes the window [t - window, t), so the current transaction is
#    excluded by the window itself. The previous rolling(...).shift() shifted
#    across card boundaries and described the window ending at the previous
#    transaction rather than the one before the current transaction.
#  - "amt" only serves as the counted column for the two counts
#    (assumes amt has no missing values - TODO confirm).
#  - groupby-rolling returns rows grouped by card in their original order, which is
#    the row order of df (sorted by card_id, then time), so values are assigned
#    back positionally.
for feature, window, stat in (
    ("tx_count_1h", "3600s", "count"),
    ("tx_count_24h", "86400s", "count"),
    # ----------------------------------------------------
    # Sum of the recent amounts
    # ----------------------------------------------------
    ("amt_sum_1h", "3600s", "sum"),
):
    windowed = (
        df
        .groupby("card_id", group_keys=False)
        .rolling(window, on="unix_trans_time_dt", closed="left")["amt"]
    )
    # An empty window (no earlier transaction in range) counts as 0
    df[feature] = getattr(windowed, stat)().fillna(0).to_numpy()

# ----------------------------------------------------
# Category change with respect to the card's previous transaction
# ----------------------------------------------------
df["prev_category"] = df.groupby("card_id")["category_idx"].shift()

# 1 when a previous transaction exists and its category differs from the current one
has_previous = df["prev_category"].notna()
category_changed = df["category_idx"] != df["prev_category"]
df["is_category_shift"] = (category_changed & has_previous).astype(int)

# ----------------------------------------------------
# Distances géographiques
# ----------------------------------------------------
def haversine_np(lat1, lon1, lat2, lon2):
    """
    Great-circle distance in kilometres between two points given in degrees.

    Implements the haversine formula with an Earth radius of 6371 km. All
    arguments go through NumPy ufuncs, so scalars, arrays and Series are
    processed element-wise.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (
        np.radians(angle) for angle in (lat1, lon1, lat2, lon2)
    )
    half_dphi = (phi2 - phi1) / 2.0
    half_dlam = (lam2 - lam1) / 2.0
    a = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
    return 2 * earth_radius_km * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Merchant coordinates of the card's previous transaction
for axis in ("lat", "long"):
    df[f"prev_merch_{axis}"] = df.groupby("card_id")[f"merch_{axis}"].shift()

# Distance to the previous transaction's merchant (0 when there is no previous one)
jump_km = haversine_np(
    df["merch_lat"], df["merch_long"],
    df["prev_merch_lat"], df["prev_merch_long"]
)
df["dist_from_prev_tx"] = jump_km.fillna(0)

# Distance between the merchant and the cardholder's address (lat/long columns)
df["dist_from_home"] = haversine_np(
    df["merch_lat"], df["merch_long"],
    df["lat"], df["long"]
)

# ----------------------------------------------------
# ANTI-FRAUD: implied travel speed
# ----------------------------------------------------
# km/h needed to go from the previous merchant to this one; 0 when no time elapsed
elapsed_hours = df["card_time_since_prev_tx"] / 3600
df["geo_speed_kmh"] = np.where(
    df["card_time_since_prev_tx"] > 0,
    df["dist_from_prev_tx"] / elapsed_hours,
    0
)

# Flag speeds above 900 km/h
df["is_impossible_travel"] = (df["geo_speed_kmh"] > 900).astype(int)


# ----------------------------------------------------
# Merchant features
# ----------------------------------------------------
# 1 the first time a card appears with a merchant (in row order), 0 on later occurrences
first_visit = ~df.duplicated(subset=["card_id", "merchant"])
df["is_new_merchant"] = first_visit.astype(int)

# ----------------------------------------------------
# Dedicated DataFrame for the merchant features
# ----------------------------------------------------
df_merchant = (
    df[["transaction_id", "merchant_id", "unix_trans_time", "amt"]]
    # Sorting is required so that "previous" means earlier in time per merchant
    .sort_values(["merchant_id", "unix_trans_time"])
    .reset_index(drop=True)
)

# ----------------------------------------------------
# Seconds elapsed since the previous transaction at the same merchant
# (FEATURE_WINDOW for the first transaction of a merchant)
# ----------------------------------------------------
df_merchant["merchant_time_since_prev_tx"] = (
    df_merchant
    .groupby("merchant_id")["unix_trans_time"]
    .diff()
    .fillna(FEATURE_WINDOW)
)

# ----------------------------------------------------
# Historical mean amount of the merchant (up to t-1)
# ----------------------------------------------------
merchant_running_avg = (
    df_merchant
    .groupby("merchant_id")["amt"]
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)
)
# Shift INSIDE each merchant. A plain Series.shift() on the concatenated groupby
# result would give the first transaction of every merchant the final mean of the
# previous merchant instead of "no history" (0).
df_merchant["merchant_avg_amt"] = (
    merchant_running_avg
    .groupby(df_merchant["merchant_id"])
    .shift()
    .fillna(0)
)

# ----------------------------------------------------
# ANTI-FRAUD: amount unusual for the merchant
# (ratio to the historical mean, 1.0 when there is no positive mean)
# ----------------------------------------------------
df_merchant["amt_vs_merchant_avg_ratio"] = np.where(
    df_merchant["merchant_avg_amt"] > 0,
    df_merchant["amt"] / (df_merchant["merchant_avg_amt"] + 1e-6),
    1.0
)

# ----------------------------------------------------
# Join back onto the main DataFrame (row order of df is preserved)
# ----------------------------------------------------
df = df.merge(
    df_merchant[
        ["transaction_id", "merchant_time_since_prev_tx", "merchant_avg_amt", "amt_vs_merchant_avg_ratio"]
    ],
    on="transaction_id",
    how="left"
)

In [128]:
# Persist the full feature table (fraud rows included) for the selected split.
df.to_csv("train_df.csv" if train_data else "test_df.csv", index=False)

# For the training split, keep only the non-fraudulent transactions.
if train_data:
    df = df.loc[df['is_fraud'] == 0].reset_index(drop=True)

# Create PyG graphs

In [129]:
def create_graph(df, EDGE_WINDOW = 3600 * 24 * 7): # 7 days
    """
    Build a PyTorch Geometric graph from a transaction dataframe.

    Every row of ``df`` becomes one node (node index = row position). For each
    card, chronologically consecutive transactions are linked by ONE directed
    edge, earlier -> later, when they are at most ``EDGE_WINDOW`` seconds apart.
    Same-merchant edges are not built (that variant is currently disabled).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the node feature columns listed below plus ``is_fraud``,
        ``transaction_id``, ``card_id``, ``unix_trans_time``, ``amt``,
        ``merch_lat`` and ``merch_long``.
    EDGE_WINDOW : int
        Maximum time gap in seconds between two linked transactions.

    Returns
    -------
    torch_geometric.data.Data
        ``x`` [N, F] float, ``edge_index`` [2, E] long, ``edge_attr`` [E, 4] float
        (log1p of the time gap, absolute amount difference, amount ratio,
        distance in km between the two merchants), ``y`` [N] long and
        ``tx_id`` [N] long. Shapes stay 2-D even when there is no edge.

    Notes
    -----
    Time gap and distance are computed from the two transactions an edge actually
    connects. The precomputed ``card_time_since_prev_tx`` / ``dist_from_prev_tx``
    columns refer to the previous transaction of the unfiltered table, which is no
    longer the neighbour once rows have been removed from ``df``. The distance
    uses the notebook-level helper ``haversine_np``.
    """
    feature_cols = [
        # Transaction features
        "amt",
        "hour",
        "is_night_tx",
        "dayofweek",
        "is_weekend",
        "age",
        "is_new_merchant",
        # "card_time_since_prev_tx",
        # "dist_from_prev_tx",
        "dist_from_home",
        "is_impossible_travel",
        "category_idx",
        "is_category_shift",

        # Card features
        "amt_zscore",
        "amt_vs_card_mean_ratio",
        "amt_minus_prev_max",
        "amt_minus_prev_min",
        "card_amt_mean",
        "card_amt_std",
        "tx_count_1h",
        "tx_count_24h",
        "amt_sum_1h",

        # Identity
        "gender_idx", "job_idx",
        "city_pop",

        # Merchant features
        "merchant_avg_amt",
        "merchant_time_since_prev_tx",
        "amt_vs_merchant_avg_ratio",
    ]

    # Node tensors, in the row order of df
    node_features = torch.tensor(df[feature_cols].values, dtype=torch.float)
    node_labels = torch.tensor(df["is_fraud"].values, dtype=torch.long)
    tx_ids = torch.tensor(df["transaction_id"].values, dtype=torch.long)

    # Same-card edges: order the transactions by card then time, remembering the
    # node index (row position in df) of each one. After this ordering, the
    # candidate edges are exactly the pairs of adjacent rows.
    ordered = (
        df[["card_id", "unix_trans_time", "amt", "merch_lat", "merch_long"]]
        .assign(_node_idx=np.arange(len(df)))
        .sort_values(["card_id", "unix_trans_time"], kind="stable")
    )
    node = ordered["_node_idx"].to_numpy()
    card = ordered["card_id"].to_numpy()
    t = ordered["unix_trans_time"].to_numpy()
    amt = ordered["amt"].to_numpy()
    lat = ordered["merch_lat"].to_numpy()
    lon = ordered["merch_long"].to_numpy()

    # Gap between each row and the one just before it
    dt = t[1:] - t[:-1]
    # Keep adjacent pairs that belong to the same card and fall within the window
    linked = (card[1:] == card[:-1]) & (dt <= EDGE_WINDOW)

    src, dst = node[:-1][linked], node[1:][linked]
    amt_src, amt_dst = amt[:-1][linked], amt[1:][linked]

    # Directed edges src -> dst; vstack keeps the [2, E] shape even when E == 0
    edge_index = torch.tensor(np.vstack([src, dst]), dtype=torch.long)

    # Edge attributes; column_stack keeps the [E, 4] shape even when E == 0
    edge_attrs = torch.tensor(
        np.column_stack([
            np.log1p(dt[linked]),            # delta time
            np.abs(amt_dst - amt_src),       # amount diff
            amt_dst / (amt_src + 1e-6),      # amount ratio
            haversine_np(                    # geo jump between the two merchants
                lat[:-1][linked], lon[:-1][linked],
                lat[1:][linked], lon[1:][linked],
            ),
        ]),
        dtype=torch.float
    )

    # Disabled variant (kept as a note): consecutive transactions at the same
    # merchant (grouped by merchant_id, gap <= EDGE_WINDOW) were also linked, and
    # every edge carried two extra leading flags [same_card, same_merchant], with
    # the geo jump set to 0.0 for merchant edges.

    # Assemble the PyTorch Geometric graph
    data = Data(
        x=node_features,
        edge_index=edge_index,
        edge_attr=edge_attrs,
        y=node_labels,
        tx_id=tx_ids
    )

    data.num_nodes = node_features.size(0)

    return data


In [130]:
# Length of each sub-graph (batch), in months
nb_months = 1

# Split the transactions into periods of nb_months months and build one graph per period
graphs = []
tx_time = pd.to_datetime(df["unix_trans_time"], unit="s")
# Months elapsed since January of the earliest year present in the data. Using the
# month of the year alone would put the same calendar month of two different years
# into one batch. With a single year of data the index equals the previous
# (month - 1) // nb_months + 1.
months_elapsed = (tx_time.dt.year - tx_time.dt.year.min()) * 12 + (tx_time.dt.month - 1)
df["batch_index"] = months_elapsed // nb_months + 1
for period, period_df in df.groupby("batch_index"):
    graph = create_graph(period_df.reset_index(drop=True))
    graphs.append(graph)
    print(f"--- Batch {period} ---")
    print(graph)
    print("Nombre de noeuds:", graph.num_nodes)
    print("Nombre d'arêtes:", graph.num_edges)
    print("Dimension features noeuds:", graph.x.shape[1])
    # NOTE(review): this counts the distinct scalar values found anywhere in
    # edge_attr, not a number of edge types - confirm what should be reported.
    print("Nombre de type d'arêtes:", len(torch.unique(graph.edge_attr)))
    print()

print(f"{len(graphs)} graphes créés, un pour chaque période de {nb_months} mois.")

--- Batch 6 ---
Data(x=[30058, 26], edge_index=[2, 29149], edge_attr=[29149, 4], y=[30058], tx_id=[30058], num_nodes=30058)
Nombre de noeuds: 30058
Nombre d'arêtes: 29149
Dimension features noeuds: 26
Nombre de type d'arêtes: 92232

--- Batch 7 ---
Data(x=[85848, 26], edge_index=[2, 84932], edge_attr=[84932, 4], y=[85848], tx_id=[85848], num_nodes=85848)
Nombre de noeuds: 85848
Nombre d'arêtes: 84932
Dimension features noeuds: 26
Nombre de type d'arêtes: 234487

--- Batch 8 ---
Data(x=[88759, 26], edge_index=[2, 87849], edge_attr=[87849, 4], y=[88759], tx_id=[88759], num_nodes=88759)
Nombre de noeuds: 88759
Nombre d'arêtes: 87849
Dimension features noeuds: 26
Nombre de type d'arêtes: 241043

--- Batch 9 ---
Data(x=[69533, 26], edge_index=[2, 68614], edge_attr=[68614, 4], y=[69533], tx_id=[69533], num_nodes=69533)
Nombre de noeuds: 69533
Nombre d'arêtes: 68614
Dimension features noeuds: 26
Nombre de type d'arêtes: 198112

--- Batch 10 ---
Data(x=[69348, 26], edge_index=[2, 68425], edge_

In [131]:
# Write every graph to save_dir as graph_batch_<k>.pt, with k counting from 1 in list order
os.makedirs(save_dir, exist_ok=True)
for batch_no, batch_graph in enumerate(graphs, start=1):
    file_path = os.path.join(save_dir, f"graph_batch_{batch_no}.pt")
    torch.save(batch_graph, file_path)
    print(f"Graphe batch {batch_no} enregistré : {file_path}")


Graphe batch 1 enregistré : graphs/test/graph_batch_1.pt
Graphe batch 2 enregistré : graphs/test/graph_batch_2.pt
Graphe batch 3 enregistré : graphs/test/graph_batch_3.pt
Graphe batch 4 enregistré : graphs/test/graph_batch_4.pt
Graphe batch 5 enregistré : graphs/test/graph_batch_5.pt
Graphe batch 6 enregistré : graphs/test/graph_batch_6.pt
Graphe batch 7 enregistré : graphs/test/graph_batch_7.pt
