In [3]:
# Grundlegende Bibliotheken
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub
import numpy as np
import missingno as msno

# Einstellungen für bessere Plots
plt.rcParams['figure.figsize'] = (10, 6)

# Pandas Optionen
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', '{:.4f}'.format)

# Download latest version


path = kagglehub.dataset_download("priyamchoksi/credit-card-transactions-dataset")

ds = pd.read_csv(f"{path}/credit_card_transactions.csv")

print(f"Anzahl Zeilen: {ds.shape[0]}")
print(f"Anzahl Spalten: {ds.shape[1]}")

Anzahl Zeilen: 1296675
Anzahl Spalten: 24


In [None]:
from scipy.stats import chi2_contingency

ds_filtered = ds[ds['category'].str.endswith(('_net', '_pos'))].copy()
ds_filtered['channel'] = np.where(ds_filtered['category'].str.endswith('_net'), 'online', 'offline')


contingency_table = pd.crosstab(ds_filtered['channel'], ds_filtered['is_fraud'])
print(contingency_table)

chi2, p, dof, expected = chi2_contingency(contingency_table)
print("Chi2:", chi2)
print(f"p-value: {p:.4f}")

a = contingency_table.loc['online', 1]      # Fraud online
b = contingency_table.loc['offline', 1]     # Fraud offline
c = contingency_table.loc['online', 0]      # Non-Fraud online
d = contingency_table.loc['offline', 0]     # Non-Fraud offline

odds_ratio = (a / b) / (c / d)
print("Odds Ratio:", odds_ratio)

is_fraud       0     1
channel               
offline   317129  2836
online    203520  2762
Chi2: 243.6874284262562
p-value: 0.0000
Odds Ratio: 1.5175615409969307


In [12]:
# Gruppen
group_M = ds[ds['gender'] == 'M']['is_fraud'].values
group_F = ds[ds['gender'] == 'F']['is_fraud'].values

# 1️⃣ Beobachtete Differenz berechnen
observed_diff = group_F.mean() - group_M.mean()
print("Beobachtete Differenz (F-M):", observed_diff)

# 2️⃣ Labels mischen und 3️⃣ neue Differenzen berechnen
all_data = np.concatenate([group_M, group_F])
n_B = len(group_F)
n_permutations = 5000
perm_diffs = np.zeros(n_permutations)

for i in range(n_permutations):
    np.random.shuffle(all_data)
    perm_F = all_data[:n_B]
    perm_M = all_data[n_B:]
    perm_diffs[i] = perm_F.mean() - perm_M.mean()

# 4️⃣ p-Wert berechnen (zweiseitig)
p_value = np.mean(np.abs(perm_diffs) >= np.abs(observed_diff))
print("Permutationstest p-Wert (zweiseitig):", p_value)

if p_value < 0.05:
    print("Signifikanter Unterschied zwischen F und M")
else:
    print("Kein signifikanter Unterschied")

Beobachtete Differenz (F-M): -0.0011646703093246774
Permutationstest p-Wert (zweiseitig): 0.0
Signifikanter Unterschied zwischen F und M
