In [4]:
import pandas as pd
import numpy as np


In [5]:
load_path = "../../data/fraudTrain.csv"

# Loading the data
df = pd.read_csv(load_path)

In [6]:

# Loading the data
df = pd.read_csv(load_path)

# Drop the column named 'Unnamed: 0' (unnecessary index column)
df = df.drop(columns=['Unnamed: 0'])

# Convert date/time columns
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Keep IDs as string/object
df['cc_num'] = df['cc_num'].astype(str)
df['trans_num'] = df['trans_num'].astype(str)

# Convert categorical/text columns
categorical_cols = ['merchant', 'category', 'first', 'last', 'gender', 
                    'street', 'city', 'state', 'zip', 'job']
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Convert to Unix timestamp (in seconds)
df['unix_trans_time'] = df['trans_date_trans_time'].astype('int64') // 10**9
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25 # account for leap years

# Compute number of distinct categories per merchant
merchant_category_counts = df.groupby("merchant")["category"].transform("nunique")
# Add it as a new column
df["nb_categories"] = merchant_category_counts

print(df.dtypes)

trans_date_trans_time    datetime64[ns]
cc_num                           object
merchant                       category
category                       category
amt                             float64
first                          category
last                           category
gender                         category
street                         category
city                           category
state                          category
zip                            category
lat                             float64
long                            float64
city_pop                          int64
job                            category
dob                      datetime64[ns]
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
unix_trans_time                   int64
age                             float64
nb_categories                     int64


  merchant_category_counts = df.groupby("merchant")["category"].transform("nunique")


In [7]:
# Create numeric IDs for graph nodes (cards, merchants, transactions)

# treat each unique card number as a category
card_ids = df["cc_num"].astype("category").cat.codes
# Add a new column card_id
df["card_id"] = card_ids

# treat each unique merchant as a category
merchant_ids = df["merchant"].astype("category").cat.codes
# Add a new column merchant_id
df["merchant_id"] = merchant_ids

# Each row is one transaction
df["transaction_id"] = range(len(df))


In [None]:
# Fenêtre temporelle par défaut (en secondes) pour les premières transactions
FEATURE_WINDOW = 3600

# ----------------------------------------------------
# Encodage des variables catégorielles
# ----------------------------------------------------
df["category_idx"] = df["category"].astype("category").cat.codes
df["gender_idx"] = df["gender"].astype("category").cat.codes
df["job_idx"] = df["job"].astype("category").cat.codes

# ----------------------------------------------------
# Features temporelles
# ----------------------------------------------------
df["hour"] = df["trans_date_trans_time"].dt.hour
df["dayofweek"] = df["trans_date_trans_time"].dt.dayofweek
df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)

# ----------------------------------------------------
# Tri global pour toutes les features historiques
# ----------------------------------------------------
df = df.sort_values(["card_id", "unix_trans_time"]).reset_index(drop=True)

# ----------------------------------------------------
# Jump de temps depuis la transaction précédente
# ----------------------------------------------------
df["card_time_since_prev_tx"] = (
    df.groupby("card_id")["unix_trans_time"].diff().fillna(FEATURE_WINDOW)
)
# Optionnel : log-transform pour stabiliser les grandes valeurs
df["card_time_since_prev_tx_log"] = np.log1p(df["card_time_since_prev_tx"])

# ----------------------------------------------------
# Montant historique de la carte
# ----------------------------------------------------
df["card_amt_mean"] = (
    df.groupby("card_id")["amt"].expanding().mean().shift().reset_index(level=0, drop=True)
).fillna(0)

df["card_amt_std"] = (
    df.groupby("card_id")["amt"].expanding().std().shift().reset_index(level=0, drop=True)
).fillna(0)

df["card_amt_max"] = (
    df.groupby("card_id")["amt"].expanding().max().shift().reset_index(level=0, drop=True)
).fillna(0)

df["card_amt_min"] = (
    df.groupby("card_id")["amt"].expanding().min().shift().reset_index(level=0, drop=True)
).fillna(0)

# Écarts au max/min historique
df["amt_minus_prev_max"] = df["amt"] - df["card_amt_max"]
df["amt_minus_prev_min"] = df["amt"] - df["card_amt_min"]

# ----------------------------------------------------
# Z-score du montant
# ----------------------------------------------------
MIN_TX = 2
df["card_tx_count"] = df.groupby("card_id").cumcount()
df["amt_zscore"] = np.where(
    df["card_tx_count"] < MIN_TX,
    0,
    (df["amt"] - df["card_amt_mean"]) / (df["card_amt_std"] + 1e-6)
)

# ----------------------------------------------------
# Distances géographiques
# ----------------------------------------------------
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371.0  # rayon Terre en km
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Coordonnées du marchand précédent
df["prev_merch_lat"] = df.groupby("card_id")["merch_lat"].shift()
df["prev_merch_long"] = df.groupby("card_id")["merch_long"].shift()

# Distance à la transaction précédente
df["dist_from_prev_tx"] = haversine_np(
    df["merch_lat"], df["merch_long"],
    df["prev_merch_lat"], df["prev_merch_long"]
).fillna(0)

# Distance par rapport à l'adresse du propriétaire
df["dist_from_home"] = haversine_np(
    df["merch_lat"], df["merch_long"],
    df["lat"], df["long"]
)

# ----------------------------------------------------
# Merchant features
# ----------------------------------------------------
# Nouveau merchant pour la carte
df["is_new_merchant"] = df.groupby("card_id")["merchant"].transform(lambda x: ~x.duplicated()).astype(int)

# ----------------------------------------------------
# Création d'un DataFrame dédié aux features merchant
# ----------------------------------------------------
df_merchant = (
    df[["transaction_id", "merchant_id", "unix_trans_time", "amt"]]
    # Tri indispensable pour garantir la cohérence temporelle
    .sort_values(["merchant_id", "unix_trans_time"])
    .reset_index(drop=True)
)

# ----------------------------------------------------
# Temps écoulé depuis la transaction précédente
# chez le même merchant
# ----------------------------------------------------
df_merchant["merchant_time_since_prev_tx"] = (
    df_merchant
    .groupby("merchant_id")["unix_trans_time"]
    .diff()
    .fillna(FEATURE_WINDOW)
)

# ----------------------------------------------------
# Montant moyen historique du merchant (jusqu'à t-1)
# ----------------------------------------------------
df_merchant["merchant_avg_amt"] = (
    df_merchant
    .groupby("merchant_id")["amt"]
    .expanding()
    .mean()
    .shift()
    .reset_index(level=0, drop=True)
    .fillna(0)
)

# ----------------------------------------------------
# Jointure avec le DataFrame principal
# ----------------------------------------------------
df = df.merge(
    df_merchant[
        ["transaction_id", "merchant_time_since_prev_tx", "merchant_avg_amt"]
    ],
    on="transaction_id",
    how="left"
)

# Fraudes par ville avec population

In [9]:
city_stats = (
    df.groupby("city")
    .agg(
        nb_fraud=("is_fraud", "sum"),
        nb_tx=("is_fraud", "count"),
        city_pop=("city_pop", "mean")
    )
)

city_stats["fraud_rate"] = city_stats["nb_fraud"] / city_stats["nb_tx"] * 100

city_stats.sort_values("fraud_rate", ascending=False).head(60)


  df.groupby("city")


Unnamed: 0_level_0,nb_fraud,nb_tx,city_pop,fraud_rate
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Irvington,8,8,7322.0,100.0
Isanti,10,10,12568.0,100.0
La Grande,12,12,16955.0,100.0
Wartburg,11,11,7511.0,100.0
Vacaville,7,7,99475.0,100.0
Waukau,10,10,149.0,100.0
Angwin,10,10,3661.0,100.0
Hubbell,19,19,121.0,100.0
Lockhart,9,9,17081.0,100.0
Las Vegas,8,8,1417793.0,100.0


In [10]:
city_stats[city_stats["nb_tx"] > 50].sort_values("fraud_rate", ascending=False).head(20)

Unnamed: 0_level_0,nb_fraud,nb_tx,city_pop,fraud_rate
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aurora,23,512,389246.0,4.492188
Clearwater,24,553,172247.0,4.339964
Benton,17,530,1386.69434,3.207547
Moscow,16,516,533.0,3.100775
Boulder,15,493,1939.0,3.042596
Howes Cave,15,504,1304.0,2.97619
Riverview,15,504,15623.0,2.97619
Girard,15,514,1100.0,2.918288
White Sulphur Springs,15,529,5216.0,2.835539
Pearlington,15,529,1414.0,2.835539


## Pearson (corrélation linéaire)

In [11]:
from scipy.stats import pearsonr

city_stats_filtered = city_stats[city_stats["nb_tx"] > 50]  # éviter le bruit
corr_pearson, p_value = pearsonr(city_stats_filtered["city_pop"], city_stats_filtered["fraud_rate"])
print(f"Pearson r = {corr_pearson:.3f}, p-value = {p_value:.3f}")


Pearson r = 0.014, p-value = 0.690


## Spearman (corrélation monotone, pas seulement linéaire)

In [12]:
from scipy.stats import spearmanr

corr_spearman, p_value_s = spearmanr(city_stats_filtered["city_pop"], city_stats_filtered["fraud_rate"])
print(f"Spearman rho = {corr_spearman:.3f}, p-value = {p_value_s:.3f}")


Spearman rho = -0.058, p-value = 0.096
