Imports & Global Config

In [1]:
import hashlib
import os

import joblib
import librosa
import numpy as np
import soundfile as sf
import torch
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)


Using device: cuda


Load Wav2Vec2 Model (Middle Layers)

In [3]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Hugging Face checkpoint used for both the feature extractor and the encoder.
MODEL_NAME = "facebook/wav2vec2-xls-r-300m"

# Feature extractor (audio -> tensors)
processor = Wav2Vec2FeatureExtractor.from_pretrained(
    MODEL_NAME
)

# Model (force safetensors, enable hidden states)
wav2vec = Wav2Vec2Model.from_pretrained(
    MODEL_NAME,
    output_hidden_states=True,
    use_safetensors=True   # IMPORTANT (avoids torch.load issues)
).to(DEVICE)

# The encoder is only used for inference here, so put it in evaluation mode.
wav2vec.eval()

print("Wav2Vec2 loaded successfully on", DEVICE)




Loading weights:   0%|          | 0/422 [00:00<?, ?it/s]

Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-xls-r-300m
Key                          | Status     |  | 
-----------------------------+------------+--+-
quantizer.codevectors        | UNEXPECTED |  | 
project_q.weight             | UNEXPECTED |  | 
project_hid.weight           | UNEXPECTED |  | 
quantizer.weight_proj.weight | UNEXPECTED |  | 
quantizer.weight_proj.bias   | UNEXPECTED |  | 
project_q.bias               | UNEXPECTED |  | 
project_hid.bias             | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Wav2Vec2 loaded successfully on cuda


Audio Loading + Preprocessing

In [4]:
def load_audio(path, target_sr=16000):
    """Decode an audio file to a mono waveform and trim silence from its edges.

    Args:
        path: Location of the audio file to read.
        target_sr: Sample rate passed to ``librosa.load`` as ``sr``.

    Returns:
        The waveform returned by ``librosa.effects.trim``.
    """
    waveform, _ = librosa.load(path, sr=target_sr, mono=True)
    # Edge regions more than 25 dB below the reference level count as silence.
    trimmed, _ = librosa.effects.trim(waveform, top_db=25)
    return trimmed


In [5]:
def chunk_audio(y, sr=16000, chunk_sec=2):
    """Split a waveform into consecutive, non-overlapping fixed-length chunks.

    Args:
        y: 1-D waveform (any sequence supporting ``len`` and slicing).
        sr: Sample rate of ``y`` in Hz.
        chunk_sec: Chunk duration in seconds; may be fractional (e.g. ``1.5``).

    Returns:
        A list of chunks, each exactly ``round(sr * chunk_sec)`` samples long.
        A trailing remainder shorter than one chunk is dropped. When ``y`` is
        shorter than a single chunk the result is ``[y]``.

    Raises:
        ValueError: If ``sr * chunk_sec`` rounds to fewer than one sample.
    """
    # range() needs an integer step, so fractional durations are rounded to
    # whole samples.
    chunk_len = int(round(sr * chunk_sec))
    if chunk_len <= 0:
        raise ValueError(
            f"chunk length must be at least one sample, got sr={sr}, chunk_sec={chunk_sec}"
        )

    chunks = [
        y[start:start + chunk_len]
        for start in range(0, len(y) - chunk_len + 1, chunk_len)
    ]
    return chunks if chunks else [y]


Extract Embeddings (Middle Layer Pooling)

In [6]:
@torch.no_grad()
def extract_embedding(y, layer=8):
    """Turn a waveform into one fixed-size embedding vector.

    The waveform is split with ``chunk_audio``; each chunk is run through the
    feature extractor and the wav2vec model, the selected hidden state is
    averaged over dimension 1, and the per-chunk vectors are averaged.

    Args:
        y: 1-D waveform, expected at 16 kHz (the rate passed to the processor).
        layer: Index into ``outputs.hidden_states``. Per the Transformers
            documentation index 0 is the embedding output, so ``8`` selects
            the output of the 8th transformer layer.

    Returns:
        A 1-D ``numpy`` array: the mean of the pooled chunk vectors.

    Raises:
        ValueError: If ``y`` contains no samples (e.g. the clip was trimmed
            down to nothing).
    """
    # Fail early with a clear message rather than deep inside the model.
    if len(y) == 0:
        raise ValueError("Cannot extract an embedding from empty audio.")

    pooled_chunks = []
    for chunk in chunk_audio(y):
        inputs = processor(
            chunk,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        ).input_values.to(DEVICE)

        outputs = wav2vec(inputs)
        hidden = outputs.hidden_states[layer]
        # Mean over dim 1 collapses each chunk to a single row.
        pooled = hidden.mean(dim=1)
        pooled_chunks.append(pooled.cpu().numpy())

    # One row per chunk; averaging the rows gives the clip-level embedding.
    return np.mean(np.vstack(pooled_chunks), axis=0)


Cache

In [7]:
# Cached embeddings live under one sub-folder per class.
EMB_DIR = "embeddings"
HUMAN_EMB, AI_EMB = (os.path.join(EMB_DIR, sub) for sub in ("human", "ai"))

# Create both cache folders up front; existing folders are left alone.
for cache_dir in (HUMAN_EMB, AI_EMB):
    os.makedirs(cache_dir, exist_ok=True)


Extract & Cache Embeddings (RUN ONCE)

In [11]:
def process_folder(root, label_dir):
    """Extract and cache one embedding per audio file found under ``root``.

    Walks ``root`` recursively, and for every ``.wav``/``.mp3``/``.flac`` file
    (case-insensitive) saves its embedding as ``<digest>.npy`` in
    ``label_dir``. Files whose cache entry already exists are skipped, and
    per-file failures are printed rather than aborting the run.

    The cache name is a SHA-256 digest of the file path. The built-in
    ``hash()`` is salted per interpreter process for strings, so names
    derived from it change after a kernel restart: the skip check never
    matches and every rerun writes duplicate embeddings.

    Args:
        root: Directory to scan for audio files.
        label_dir: Existing directory that receives the ``.npy`` files.
    """
    audio_exts = (".wav", ".mp3", ".flac")

    for dirpath, _, files in os.walk(root):
        for f in files:
            if not f.lower().endswith(audio_exts):
                continue

            path = os.path.join(dirpath, f)
            # os.fsencode copes with file names that are not valid UTF-8.
            digest = hashlib.sha256(os.fsencode(path)).hexdigest()
            out = os.path.join(label_dir, f"{digest}.npy")

            if os.path.exists(out):
                continue

            try:
                y = load_audio(path)
                emb = extract_embedding(y)
                np.save(out, emb)
            except Exception as e:
                # Best effort: report the bad file and continue with the rest.
                print("Error:", path, e)

# Fill the embedding cache for each class in turn: (banner, source folder, cache folder).
for banner, source_root, target_dir in (
    ("Extracting HUMAN embeddings...", "dataset/train/human", HUMAN_EMB),
    ("Extracting AI embeddings...", "dataset/train/ai", AI_EMB),
):
    print(banner)
    process_folder(source_root, target_dir)


Extracting HUMAN embeddings...
Extracting AI embeddings...


Load Cached Embeddings

In [8]:
def load_embeddings(folder, label):
    """Load every cached ``.npy`` embedding in ``folder`` with a constant label.

    Files are read in sorted name order, because ``os.listdir`` returns
    entries in arbitrary order and a stable row order is needed for the seeded
    train/test split to be reproducible. Entries that do not end in ``.npy``
    are ignored instead of crashing ``np.load``.

    Args:
        folder: Directory containing the cached embeddings.
        label: Class label assigned to every embedding from this folder.

    Returns:
        A tuple ``(X, y)``: ``X`` stacks the embeddings, one per row, and ``y``
        holds ``label`` repeated once per embedding.
    """
    names = sorted(f for f in os.listdir(folder) if f.endswith(".npy"))
    X = [np.load(os.path.join(folder, f)) for f in names]
    y = [label] * len(names)
    return np.array(X), np.array(y)

# Label encoding: 0 for the human cache folder, 1 for the AI cache folder.
(X_h, y_h), (X_a, y_a) = (
    load_embeddings(folder, label)
    for folder, label in ((HUMAN_EMB, 0), (AI_EMB, 1))
)

# Stack both classes into one feature matrix with a matching label vector.
X = np.vstack((X_h, X_a))
y = np.concatenate((y_h, y_a))

print("Final shape:", X.shape, y.shape)


KeyboardInterrupt: 

Train / Test Split

In [13]:
# Hold out 20% of the samples, keep the class ratio equal in both parts,
# and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Train XGBoost

In [14]:
# Hyperparameters for the gradient-boosted classifier trained on the embeddings.
xgb_params = {
    "n_estimators": 400,
    "max_depth": 8,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
}

model = xgb.XGBClassifier(**xgb_params)

# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


Evaluation

In [15]:
# Score the held-out samples with the fitted classifier.
y_pred = model.predict(X_test)

# Per-class precision/recall/F1; names follow the label order (0 -> Human, 1 -> AI).
report = classification_report(y_test, y_pred, target_names=["Human", "AI"])
print(report)

print("Confusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)


              precision    recall  f1-score   support

       Human       0.98      0.99      0.98      1590
          AI       0.99      0.98      0.98      1565

    accuracy                           0.98      3155
   macro avg       0.98      0.98      0.98      3155
weighted avg       0.98      0.98      0.98      3155

Confusion Matrix:
[[1569   21]
 [  35 1530]]


Save Model

In [16]:
# Persist the fitted classifier so it can be reloaded later.
model_path = "ai_voice_detector_xgb_v2.pkl"
joblib.dump(model, model_path)
print("Model saved.")


Model saved.


Inference on Single File

In [7]:
import joblib
# Load the classifier from the file written by the "Save Model" cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a source you trust.
model = joblib.load("ai_voice_detector_xgb_v2.pkl")


In [8]:
def predict_file(path):
    """Classify one audio file as human or AI-generated speech.

    Args:
        path: Location of the audio file to classify.

    Returns:
        A dict with three keys: ``"prediction"`` (``"AI"`` or ``"Human"``),
        ``"confidence"`` (the larger class probability, as a float) and
        ``"explanation"`` (a fixed sentence chosen by the predicted label).
    """
    waveform = load_audio(path)
    features = extract_embedding(waveform).reshape(1, -1)
    probs = model.predict_proba(features)[0]

    # Index 1 is the AI class; a tie goes to "Human".
    if probs[1] > probs[0]:
        label = "AI"
        explanation = (
            "Detected spectral and prosodic patterns "
            "consistent with AI-generated speech."
        )
    else:
        label = "Human"
        explanation = "Speech characteristics match natural human patterns."

    return {
        "prediction": label,
        "confidence": float(max(probs)),
        "explanation": explanation,
    }

# Smoke test: classify a single local clip and print the resulting dict.
print(predict_file("test/humanhg.wav"))


{'prediction': 'Human', 'confidence': 0.9996724724769592, 'explanation': 'Speech characteristics match natural human patterns.'}
