# 1. Load libraries

In [None]:
import os
import pandas as pd
import numpy as np
from src.feature_extraction import build_feature_vector
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Build / Load Multimodal Dataset

In [None]:
# from src.multimodal_dataset_builder import create_dataset  # if you had a function    

# Load the multimodal dataset from disk. Going by the hint in the error
# message below, the CSV is expected to be produced beforehand by
# multimodal_dataset_builder.py.
dataset_csv = "multimodal_dataset.csv"
if not os.path.exists(dataset_csv):
    # Stop here with an actionable error. Previously only a hint was printed
    # and the next statement crashed with NameError because `df` was never
    # bound.
    raise FileNotFoundError(
        f"Dataset CSV not found: {dataset_csv}. "
        "Run multimodal_dataset_builder.py first."
    )
df = pd.read_csv(dataset_csv)

# Quick sanity check of what was loaded.
print(df.head())
print(f"Dataset size: {len(df)} rows")

# 3. Visualize Feature Distributions

In [None]:
# Plot one histogram (with KDE overlay) per "feature_*" column, coloured by
# the "label" column.
features = [c for c in df.columns if c.startswith("feature_")]

if features:
    # Derive the grid from the number of features: a fixed 2x3 grid makes
    # plt.subplot raise ValueError as soon as there are more than six columns.
    n_cols = 3
    n_rows = -(-len(features) // n_cols)  # ceiling division without math.ceil
    # 12x6 inches for the 2x3 case, scaled by row count otherwise.
    plt.figure(figsize=(12, 3 * n_rows))
    for i, f in enumerate(features, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df, x=f, hue="label", kde=True)
        plt.title(f)
    plt.tight_layout()
    plt.show()
else:
    # Nothing to draw; say so instead of showing an empty figure.
    print("No feature_* columns found; nothing to plot.")

# 4. Prepare Data for Training

In [None]:
# Feature matrix: every selected feature column, as a NumPy array.
X = df[features].values

# Encode labels as integers (normal -> 0, fraud -> 1). Series.map turns any
# value missing from the mapping into NaN, which would silently make `y` a
# float array containing NaN and only surface later as a confusing training
# error, so unknown labels are rejected here with an explicit message.
label_codes = df["label"].map({"normal": 0, "fraud": 1})
unmapped = label_codes.isna()
if unmapped.any():
    unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
    raise ValueError(
        f"Unexpected label values (expected 'normal' or 'fraud'): {unknown}"
    )
y = label_codes.astype("int64").values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 5. Train RandomForest Classifier

In [None]:
# Fit a 200-tree random forest on the training split (seeded for
# reproducibility). `fit` returns the estimator itself, so it can be chained.
clf = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train, y_train
)

# Persist the fitted model, creating the target directory if it is missing.
model_path = "models/multimodal_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(clf, model_path)
print(f"Saved trained multimodal model to {model_path}")


# 6. Evaluate Model

In [None]:

# Score the held-out split: hard class predictions plus the probability of
# class index 1, scaled to a percentage.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1] * 100  # % probability of fraud

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Display up to the first 5 sample probabilities. Iterating over slices
# instead of range(5) avoids an IndexError when the test split holds fewer
# than five rows.
for i, (prob, label) in enumerate(zip(y_prob[:5], y_test[:5]), start=1):
    print(f"Sample {i}: Fraud probability = {prob:.2f}%, Label = {label}")

# 7. Inference on New Audio

In [None]:

# Score a single audio file with the trained classifier and report the
# fraud / normal split as percentages.
sample_audio = "data/audio/fraud/sample1.wav"  # replace with actual path
if not os.path.exists(sample_audio):
    print(f"Sample audio not found: {sample_audio}")
else:
    # Reshape the extracted features into a single-row 2-D array before
    # passing them to predict_proba.
    feature_vec = build_feature_vector(sample_audio)
    feature_vec = feature_vec.reshape(1, -1)
    class_probs = clf.predict_proba(feature_vec)[0]
    prob_fraud = class_probs[1] * 100
    prob_normal = 100 - prob_fraud
    print(f"Sample audio: {sample_audio}")
    print(f"Fraud: {prob_fraud:.2f}%, Normal: {prob_normal:.2f}%")