# Anti-Money Laundering Dataset EDA Exercises

In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from scipy.stats import chi2_contingency

# Read the raw transactions CSV into a DataFrame.
# Point the path below at your own copy of the dataset.
data = pd.read_csv(filepath_or_buffer="./data.csv")

## Beginner Level

### 1. Basic Data Inspection
- Load the dataset into a DataFrame.
- Display the first five rows of the dataset.
- Check the data types of each column.

In [None]:
# Load the dataset
df = pd.read_csv("data.csv")

# Show the first five rows
display(df.head())

# Check the data types of each column.
# DataFrame.info() prints its report directly and returns None, so it must
# not be wrapped in display() (that would render a stray "None").
df.info()

### 2. Missing Values Check
- Identify and count the number of missing values in each column.
- Create a visualization (e.g., bar chart) showing the number of missing values for each column.

In [None]:
# Per-column count of missing entries
missing_values = df.isna().sum()

# Bar chart: one bar per column, bar height = number of missing entries
plt.figure(figsize=(10, 5))
missing_ax = sns.barplot(x=missing_values.index, y=missing_values.values)
missing_ax.set_title("Missing Values per Column")
plt.xticks(rotation=90)  # column names are long; rotate so they do not overlap
plt.show()

# Also print the raw counts
print(missing_values)

### 3. Summary Statistics
- Generate summary statistics for numerical columns (`Amount`).
- Count the number of unique values in each categorical column (`Payment_currency`, `Received_currency`, `Payment_type`).

In [None]:
# Descriptive statistics for the numeric columns
display(df.describe())

# Number of distinct values in each categorical column
categorical_columns = ["Payment_currency", "Received_currency", "Payment_type"]
unique_values = {}
for col in categorical_columns:
    unique_values[col] = df[col].nunique()
print("Numero di valori unici per colonna categoriale:", unique_values)

## Intermediate Level

### 4. Time Series Analysis
- Convert the `Date` column to a datetime format.
- Create a time series plot showing the total transaction amount over time.

In [None]:
# Parse the Date column into pandas datetimes (in place)
df["Date"] = pd.to_datetime(df["Date"])

# Total transaction amount for each date
df_grouped = df.groupby("Date")["Amount"].agg("sum")

# Line plot of the totals over time, drawn on a new 12x6 figure
plt.figure(figsize=(12, 6))
ts_ax = df_grouped.plot(title="Total Transaction Amount Over Time")
ts_ax.set_xlabel("Date")
ts_ax.set_ylabel("Total Amount")
plt.show()

### 5. Transaction Patterns
- Analyze the transaction amounts by `Payment_type`. Which type has the highest average transaction amount?
- Create a visualization (e.g., box plot) to show the distribution of transaction amounts for each payment type.

In [None]:
# Mean transaction amount for each payment type
payment_avg = df.groupby("Payment_type")["Amount"].mean()
display(payment_avg)

# Answer the exercise question explicitly: which type has the highest mean?
# Guarded so an empty or all-NaN result does not make idxmax() fail.
if payment_avg.notna().any():
    top_payment_type = payment_avg.idxmax()
    print(
        "Tipo di pagamento con l'importo medio più alto: "
        f"{top_payment_type} ({payment_avg.max():.2f})"
    )

# Box plot of the amount distribution for each payment type
plt.figure(figsize=(12, 6))
sns.boxplot(x="Payment_type", y="Amount", data=df)
plt.xticks(rotation=90)
plt.title("Transaction Amount Distribution by Payment Type")
plt.show()

### 6. Geographical Analysis
- Analyze the distribution of transactions based on `Sender_bank_location` and `Receiver_bank_location`.
- Create a heatmap to visualize the number of transactions between different locations.

In [None]:
# Sender x receiver matrix of transaction counts.
# fill_value=0 makes location pairs with no transactions an explicit zero
# instead of NaN, so the heatmap colours them as "0" rather than leaving
# the cell blank.
geo_counts = (
    df.groupby(["Sender_bank_location", "Receiver_bank_location"])
    .size()
    .unstack(fill_value=0)
)

# Heatmap of the number of transactions between locations
plt.figure(figsize=(12, 6))
sns.heatmap(geo_counts, cmap="Blues", linewidths=0.5)
plt.title("Transaction Count Between Locations")
plt.show()

## Advanced Level

### 7. Laundering Analysis
- Calculate the proportion of transactions marked as `Is_laundering`.
- Create a bar chart comparing the count of transactions for each `Laundering_type`.

In [None]:
# Proportion of transactions flagged as laundering (mean of the flag column)
laundering_ratio = df["Is_laundering"].mean()
print(f"Percentuale di transazioni sospette: {laundering_ratio:.2%}")

# Bar chart comparing normal vs. flagged transactions
plt.figure(figsize=(8, 5))
sns.countplot(x="Is_laundering", data=df)
plt.title("Laundering vs Normal Transactions")
plt.xticks([0, 1], ["Normal", "Laundering"])
plt.show()

# Bar chart of the transaction count for each Laundering_type, as the
# exercise asks. Horizontal bars, most frequent type first, so the
# category labels stay readable.
laundering_type_order = df["Laundering_type"].value_counts().index
plt.figure(figsize=(12, 8))
sns.countplot(y="Laundering_type", data=df, order=laundering_type_order)
plt.title("Transaction Count by Laundering Type")
plt.xlabel("Number of Transactions")
plt.ylabel("Laundering Type")
plt.show()

### 8. Correlation Analysis
- Calculate the correlation matrix for numerical columns.
- Create a heatmap to visualize the correlation between different numerical features.

In [None]:
# Pairwise correlations between the numeric columns only
numeric_only = df.select_dtypes(include="number")
corr_matrix = numeric_only.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 6))
corr_ax = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
corr_ax.set_title("Correlation Heatmap")
plt.show()

### 9. Hypothesis Testing
- Formulate a hypothesis regarding the relationship between `Payment_type` and `Is_laundering`.
- Conduct a chi-square test to determine if there is a significant association between the two variables.

In [None]:
from scipy.stats import chi2_contingency

# Contingency table: rows are payment types, columns are Is_laundering values
contingency_table = pd.crosstab(
    index=df["Payment_type"], columns=df["Is_laundering"]
)

# Chi-square test of independence; only the statistic and p-value are kept
chi2, p, *_ = chi2_contingency(contingency_table)

# Report the p-value and the conclusion at the 0.05 significance level
print(f"P-value del test chi-quadrato: {p}")
conclusion = (
    "Esiste una relazione significativa tra Payment_type e Is_laundering."
    if p < 0.05
    else "Non esiste una relazione significativa tra Payment_type e Is_laundering."
)
print(conclusion)