In [7]:
import pandas as pd
import numpy as np


In [8]:
# Loading the data
df = pd.read_csv("train_df.csv")

# Fraudes par ville avec population

In [9]:
city_stats = (
    df.groupby("city")
    .agg(
        nb_fraud=("is_fraud", "sum"),
        nb_tx=("is_fraud", "count"),
        city_pop=("city_pop", "mean")
    )
)

city_stats["fraud_rate"] = city_stats["nb_fraud"] / city_stats["nb_tx"] * 100

city_stats.sort_values("fraud_rate", ascending=False).head(60)


Unnamed: 0_level_0,nb_fraud,nb_tx,city_pop,fraud_rate
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Irvington,8,8,7322.0,100.0
Isanti,10,10,12568.0,100.0
La Grande,12,12,16955.0,100.0
Wartburg,11,11,7511.0,100.0
Vacaville,7,7,99475.0,100.0
Waukau,10,10,149.0,100.0
Angwin,10,10,3661.0,100.0
Hubbell,19,19,121.0,100.0
Lockhart,9,9,17081.0,100.0
Las Vegas,8,8,1417793.0,100.0


In [10]:
city_stats[city_stats["nb_tx"] > 50].sort_values("fraud_rate", ascending=False).head(20)

Unnamed: 0_level_0,nb_fraud,nb_tx,city_pop,fraud_rate
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aurora,23,512,389246.0,4.492188
Clearwater,24,553,172247.0,4.339964
Benton,17,530,1386.69434,3.207547
Moscow,16,516,533.0,3.100775
Boulder,15,493,1939.0,3.042596
Howes Cave,15,504,1304.0,2.97619
Riverview,15,504,15623.0,2.97619
Girard,15,514,1100.0,2.918288
White Sulphur Springs,15,529,5216.0,2.835539
Pearlington,15,529,1414.0,2.835539


## Pearson (corrélation linéaire)

In [11]:
from scipy.stats import pearsonr

city_stats_filtered = city_stats[city_stats["nb_tx"] > 50]  # éviter le bruit
corr_pearson, p_value = pearsonr(city_stats_filtered["city_pop"], city_stats_filtered["fraud_rate"])
print(f"Pearson r = {corr_pearson:.3f}, p-value = {p_value:.3f}")


Pearson r = 0.014, p-value = 0.690


## Spearman (corrélation monotone, pas seulement linéaire)

In [12]:
from scipy.stats import spearmanr

corr_spearman, p_value_s = spearmanr(city_stats_filtered["city_pop"], city_stats_filtered["fraud_rate"])
print(f"Spearman rho = {corr_spearman:.3f}, p-value = {p_value_s:.3f}")


Spearman rho = -0.058, p-value = 0.096
