# H. Multimodal Classifier for the LAV-DF Dataset

Set Up Drive

In [None]:
# Mount Google Drive at /content/drive. The data paths used in the cells below
# all start with this mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#LOAD OUTPUTS OF AUDIO AND VIDEO MODEL AND CONSTRUCT A MERGED DATASET
import os
import re
import numpy as np
import pandas as pd

# config
# Every input and output of this cell lives in one Drive folder; the path
# assumes Drive is mounted at /content/drive.
DRIVE_FOLDER = "/content/drive/My Drive/Deep Fake Dataset"
# Input: audio-model predictions (read into df_audio below).
AUDIO_CSV = os.path.join(DRIVE_FOLDER, "predictions_audio_FINAL2.csv")
# Input: video ensemble predictions (read into df_ens below).
ENSEMBLE_CSV = os.path.join(DRIVE_FOLDER, "ensemble_predictions_lav.csv")
# Output: merged audio + video table written at the end of this cell.
OUT_CSV = os.path.join(DRIVE_FOLDER, "merged_audio_video_predictions.csv")

# function to extract the vid ID
def extract_video_id_from_path(path):
    """
    Extracts numeric digits from filename.
    /path/to/train/034843.mp4 --> 34843
    """
    if pd.isna(path):
        return np.nan
    base = os.path.basename(str(path))
    m = re.search(r"(\d+)", base)
    if not m:
        return np.nan
    return int(m.group(1))

# load audio predictions
df_audio = pd.read_csv(AUDIO_CSV)
print("Loaded audio predictions:", df_audio.shape)
df_audio["filename"] = df_audio["file_rel"].apply(os.path.basename)

# Extract numeric video_id (nullable Int64 so that unparseable names become <NA>)
df_audio["video_id"] = df_audio["filename"].apply(extract_video_id_from_path)
df_audio["video_id"] = pd.to_numeric(df_audio["video_id"], errors="coerce").astype("Int64")

print("Unique video_ids in audio:", df_audio["video_id"].nunique())

# load video ensemble predictions
df_ens = pd.read_csv(ENSEMBLE_CSV)
print("Loaded ensemble predictions:", df_ens.shape)

df_ens["video_id"] = pd.to_numeric(df_ens["video_id"], errors="coerce").astype("Int64")

# merge audio & video into one dataset
# pandas matches null join keys against each other, so rows whose video_id
# could not be parsed are removed from both sides first; otherwise every
# <NA> audio row would be paired with every <NA> video row.
merged = pd.merge(
    df_audio.dropna(subset=["video_id"]),
    df_ens.dropna(subset=["video_id"]),
    on="video_id",
    how="inner",
    suffixes=("_audio", "_video")
)

print("Merged dataset:", merged.shape)

# feature engineering
# audio_score = prob_fake
merged["audio_score"] = pd.to_numeric(merged["prob_fake"], errors="coerce")

# audio_confidence = max(p, 1-p), computed column-wise; NaN scores stay NaN
# because np.maximum propagates NaN.
merged["audio_confidence"] = np.maximum(
    merged["audio_score"], 1 - merged["audio_score"]
)

# true_label from audio CSV ('real'/'fake')
def map_label(x):
    """
    Convert a label to an integer class.

    Strings "real" / "fake" (case-insensitive, surrounding whitespace
    ignored) map to 0 / 1. Any other value is passed through int(); the
    result is not checked to be 0 or 1.

    Returns np.nan when the value cannot be converted (e.g. None, NaN,
    infinity, or a non-numeric string).
    """
    if isinstance(x, str):
        x = x.strip().lower()
        if x == "real": return 0
        if x == "fake": return 1
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no label"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return np.nan

# Numeric ground truth derived from the audio CSV's label column.
merged["true_label"] = merged["label"].apply(map_label)

# Keep only the rows whose label could be mapped.
merged = merged.loc[merged["true_label"].notna()].copy()

# Output columns, grouped by where they come from.
id_fields = ["video_id", "filename", "true_label"]
audio_fields = [
    "prob_fake",
    "pred_default",
    "modify_audio",
    "modify_video",
    "duration",
    "n_fakes",
    "audio_score",
    "audio_confidence",
]
video_ensemble_fields = ["ensemble_pred", "svm", "dt", "knn", "nb"]
keep_cols = id_fields + audio_fields + video_ensemble_fields

final_df = merged.loc[:, keep_cols]

# Persist the merged table for the cells that follow.
final_df.to_csv(OUT_CSV, index=False)
print("\nSaved merged dataset to:\n", OUT_CSV)

print("\nPREVIEW:")
print(final_df.head())

Loaded audio predictions: (3968, 9)
Unique video_ids in audio: 3968
Loaded ensemble predictions: (1909, 7)
Merged dataset: (1909, 17)

Saved merged dataset to:
 /content/drive/My Drive/Deep Fake Dataset/merged_audio_video_predictions.csv

PREVIEW:
   video_id    filename  true_label  prob_fake  pred_default  modify_audio  \
0     14471  014471.mp4           0   0.000066             0         False   
1     13733  013733.mp4           0   0.000052             0         False   
2     17940  017940.mp4           0   0.000002             0         False   
3     25884  025884.mp4           0   0.000106             0         False   
4     31736  031736.mp4           1   0.523087             1          True   

   modify_video  duration  n_fakes  audio_score  audio_confidence  \
0         False     5.952        0     0.000066          0.999934   
1         False     8.896        0     0.000052          0.999948   
2         False     5.440        0     0.000002          0.999998   
3      

In [None]:
#Checking accuracies of independent classifiers on final filtered dataset

# Path of the merged audio + video table. DRIVE_FOLDER and pd are not defined
# in this cell, so an earlier cell must already have run in this kernel.
MERGED_CSV = f"{DRIVE_FOLDER}/merged_audio_video_predictions.csv"

# Load merged dataset
df = pd.read_csv(MERGED_CSV)
print("Loaded merged dataset:", df.shape)

# Ensure true_label numeric 0/1
def map_label(x):
    """
    Convert a label to an integer class.

    Strings "real" / "fake" (case-insensitive, surrounding whitespace
    ignored) map to 0 / 1. Any other value is passed through int(); the
    result is not checked to be 0 or 1.

    Returns np.nan when the value cannot be converted (e.g. None, NaN,
    infinity, or a non-numeric string).
    """
    if isinstance(x, str):
        x = x.lower().strip()
        if x == "real": return 0
        if x == "fake": return 1
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no label"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return np.nan

# Normalise the ground-truth column to integer classes, discarding rows
# whose label could not be mapped.
df["true_label"] = df["true_label"].apply(map_label)
df = df.loc[df["true_label"].notna()].copy()
df["true_label"] = df["true_label"].astype(int)

print("Label distribution:")
print(df["true_label"].value_counts())

# Per-row correctness flags; the mean of a boolean column is the hit rate.
# Video model: ensemble_pred vs true_label
df["ensemble_correct"] = df["ensemble_pred"].eq(df["true_label"])
ensemble_acc = 100 * df["ensemble_correct"].mean()

# Audio model: pred_default vs true_label
df["audio_correct"] = df["pred_default"].eq(df["true_label"])
audio_acc = 100 * df["audio_correct"].mean()

# Report both accuracies as percentages.
print("\n--- MODEL ACCURACY METRICS ---")
print(f"Video Ensemble Accuracy (ensemble_pred vs true_label): {ensemble_acc:.2f}%")
print(f"Audio Model Accuracy (pred_default vs true_label):     {audio_acc:.2f}%")

Loaded merged dataset: (1909, 16)
Label distribution:
true_label
0    1057
1     852
Name: count, dtype: int64

--- MODEL ACCURACY METRICS ---
Video Ensemble Accuracy (ensemble_pred vs true_label): 96.33%
Audio Model Accuracy (pred_default vs true_label):     98.06%


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, confusion_matrix, classification_report
)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib

# Config: input table, output artefacts and split settings
DRIVE_FOLDER = "/content/drive/My Drive/Deep Fake Dataset"
MERGED_CSV = os.path.join(DRIVE_FOLDER, "merged_audio_video_predictions.csv")  # merged audio + ensemble
OUT_TRAIN = os.path.join(DRIVE_FOLDER, "val_train_split.csv")
OUT_TEST = os.path.join(DRIVE_FOLDER, "val_test_split.csv")
MODEL_LR = os.path.join(DRIVE_FOLDER, "val_logreg.joblib")
MODEL_MLP = os.path.join(DRIVE_FOLDER, "val_mlp.joblib")
IMPUTER_PATH = os.path.join(DRIVE_FOLDER, "val_imputer.joblib")
SCALER_PATH = os.path.join(DRIVE_FOLDER, "val_scaler.joblib")
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Stop early with a clear message when the merged table is absent.
if not os.path.exists(MERGED_CSV):
    raise FileNotFoundError(f"Could not find merged CSV at: {MERGED_CSV}")

df = pd.read_csv(MERGED_CSV)
print("Loaded merged dataset:", df.shape)

# Feature construction
# audio_score is the audio model's prob_fake, forced to numeric.
df["audio_score"] = pd.to_numeric(df["prob_fake"], errors="coerce")

# audio_confidence is max(p, 1 - p); missing scores stay missing.
df["audio_confidence"] = df["audio_score"].apply(
    lambda p: max(p, 1.0 - p) if not pd.isna(p) else np.nan
)

# video_score is the ensemble's prediction, forced to numeric.
df["video_score"] = pd.to_numeric(df["ensemble_pred"], errors="coerce")

# Ensemble outputs are optional: use whichever of these columns exist.
ensemble_cols_candidates = ["ensemble_pred", "svm", "dt", "knn", "nb"]
ensemble_cols_present = [
    name for name in ensemble_cols_candidates if name in df.columns
]

for col in ensemble_cols_present:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Core features first, then the individual ensemble members.
# 'ensemble_pred' is left out because video_score already carries it.
feature_cols = ["audio_score", "audio_confidence", "video_score"]
feature_cols += [
    col for col in ensemble_cols_present
    if col != "ensemble_pred" and col not in feature_cols
]

print("Feature columns used:", feature_cols)

# Label prep: the merged CSV's true_label column is the target.
label_col = "true_label"

def map_label_val(x):
    """
    Convert a label to an integer class.

    Missing values (pd.isna) give np.nan. Strings "real" / "fake"
    (case-insensitive, surrounding whitespace ignored) map to 0 / 1. Any
    other value goes through int(float(x)), so "1.0" and 1.0 both give 1;
    the result is not checked to be 0 or 1.

    Returns np.nan when the value cannot be converted.
    """
    if pd.isna(x): return np.nan
    if isinstance(x, str):
        s = x.strip().lower()
        if s == "real": return 0
        if s == "fake": return 1
    try:
        return int(float(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no label"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return np.nan

# Normalise labels to integer classes and drop rows whose label is unusable.
df["label_bin"] = df[label_col].apply(map_label_val)
df = df[df["label_bin"].notna()].copy()
df["label_bin"] = df["label_bin"].astype(int)

print("Label distribution:")
print(df["label_bin"].value_counts())

X = df[feature_cols].copy()
y = df["label_bin"]

# STRATIFIED TRAIN/TEST SPLIT
# The raw features are split BEFORE any preprocessing is fitted, so the
# imputer medians and scaler statistics are learned from training rows only
# and nothing about the held-out rows leaks into the model inputs.
X_train_raw, X_test_raw, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, df.index,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

# impute + scale (fit on the training rows, then applied to the test rows)
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_train = scaler.fit_transform(imputer.fit_transform(X_train_raw))
X_test = scaler.transform(imputer.transform(X_test_raw))

# Full-dataset transforms kept under their original names in case a later
# cell reads them; they now use the train-fitted preprocessors.
X_imp = imputer.transform(X)
X_scaled = scaler.transform(X_imp)

joblib.dump(imputer, IMPUTER_PATH)
joblib.dump(scaler, SCALER_PATH)
print("Saved imputer and scaler to Drive.")

print("Train size:", X_train.shape[0], " Test size:", X_test.shape[0])
print("Train distribution:\n", y_train.value_counts(normalize=True))
print("Test distribution:\n", y_test.value_counts(normalize=True))

# Save splits with original metadata
df.loc[idx_train].to_csv(OUT_TRAIN, index=False)
df.loc[idx_test].to_csv(OUT_TEST, index=False)
print("Saved train/test CSVs to Drive.")


def _print_metrics(title, y_true, y_pred, y_proba=None):
    """Print accuracy, AUC (when scores are given), precision, recall and
    the confusion matrix for one classifier under a '=== title ===' banner."""
    print(f"\n=== {title} Performance ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    if y_proba is not None:
        # Best-effort: a failure to compute AUC is reported, not raised.
        try:
            print("AUC:", roc_auc_score(y_true, y_proba))
        except Exception as e:
            print("AUC: could not compute:", e)
    print("Precision:", precision_score(y_true, y_pred, zero_division=0))
    print("Recall:", recall_score(y_true, y_pred, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))


# LOGISTIC REGRESSION
lr = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:, 1]

_print_metrics("Logistic Regression", y_test, y_pred_lr, y_proba_lr)

joblib.dump(lr, MODEL_LR)
print("Saved logistic regression model:", MODEL_LR)

# MLP
mlp = MLPClassifier(
    hidden_layer_sizes=(16,),
    activation="relu",
    solver="adam",
    max_iter=1000,
    random_state=RANDOM_STATE
)
mlp.fit(X_train, y_train)

y_pred_mlp = mlp.predict(X_test)
# Keep the original "no probabilities" fallback, but only for the case of a
# missing predict_proba; other errors are no longer silently hidden.
try:
    y_proba_mlp = mlp.predict_proba(X_test)[:, 1]
except AttributeError:
    y_proba_mlp = None

_print_metrics("MLP (1-layer)", y_test, y_pred_mlp, y_proba_mlp)

joblib.dump(mlp, MODEL_MLP)
print("Saved 1-layer MLP model:", MODEL_MLP)

Loaded merged dataset: (1909, 16)
Feature columns used: ['audio_score', 'audio_confidence', 'video_score', 'svm', 'dt', 'knn', 'nb']
Using label column: true_label
Label distribution:
label_bin
0    1057
1     852
Name: count, dtype: int64
Saved imputer and scaler to Drive.
Train size: 1527  Test size: 382
Train distribution:
 label_bin
0    0.553373
1    0.446627
Name: proportion, dtype: float64
Test distribution:
 label_bin
0    0.554974
1    0.445026
Name: proportion, dtype: float64
Saved train/test CSVs to Drive.

=== Logistic Regression Performance ===
Accuracy: 0.9921465968586387
AUC: 0.9995283018867924
Precision: 1.0
Recall: 0.9823529411764705
Confusion matrix:
 [[212   0]
 [  3 167]]
Saved logistic regression model: /content/drive/My Drive/Deep Fake Dataset/val_logreg.joblib

=== MLP (1-layer) Performance ===
Accuracy: 0.9921465968586387
AUC: 0.9995837957824639
Precision: 1.0
Recall: 0.9823529411764705
Confusion matrix:
 [[212   0]
 [  3 167]]
Saved 1-layer MLP model: /content/