In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [22]:
!pip install --upgrade scikit-learn



In [2]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.7.0
Summary: A set of python modules for machine learning and data mining
Home-page: 
Author: 
Author-email: 
License: BSD 3-Clause License

 Copyright (c) 2007-2024 The scikit-learn developers.
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIB

In [1]:
# Sanity check: print the scikit-learn version this kernel actually imports.
import sklearn
print(sklearn.__version__)

1.7.0


In [11]:
pip install pydub

Collecting pydub
  Obtaining dependency information for pydub from https://files.pythonhosted.org/packages/a6/53/d78dc063216e62fc55f6b2eebb447f6a4b0a59f55c8406376f76bf959b08/pydub-0.25.1-py2.py3-none-any.whl.metadata
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os, random, time, io, joblib, warnings
import numpy as np
from tqdm.auto import tqdm

import librosa
from pydub import AudioSegment  

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# CONFIG — every tunable for the training run lives in this section.
RANDOM_SEED = 42

# Native-speaker corpora: one directory of WAV files per speaker.
NATIVE_DIRS = [
    "cmu_us_bdl_arctic/wav",
    "cmu_us_slt_arctic/wav",
]
# Root of the non-native corpus, laid out as <root>/<speaker>/wav/*.wav.
NONNATIVE_ROOT = "nonnative"

# Per-speaker caps on how many files are read from each directory.
NATIVE_FILES_PER_SPK    = 80
NONNATIVE_FILES_PER_SPK = 10

SR_TARGET = 16000   # sample rate (Hz) all audio is converted to
N_MFCC    = 20      # number of MFCC coefficients per frame

# Hyper-parameter grid explored by GridSearchCV (keys follow Pipeline naming).
PARAM_GRID = {
    "clf__C": [0.25, 0.5, 1, 2, 4],
}

# Where the fitted pipeline is written. The default is the original
# machine-specific location; set ACCENTCOACH_MODEL_OUT to redirect it on any
# other machine without editing the notebook.
MODEL_OUT = os.environ.get(
    "ACCENTCOACH_MODEL_OUT",
    r"C:\Users\natal\AccentCoach\server\model.joblib",
)

# When True, light_aug() is applied to every waveform before featurization.
USE_AUG = False

# Utils
# Seed both RNGs in use: `random` drives file shuffling, `np.random` drives augmentation.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def list_wavs(dir_path, limit=None):
    """Return the sorted paths of the ``.wav`` files directly inside ``dir_path``.

    Args:
        dir_path: Directory to scan (not recursive). The extension match is
            case-insensitive.
        limit: Maximum number of paths to return. ``None`` means no cap;
            ``0`` yields an empty list.

    Returns:
        list[str]: Paths joined onto ``dir_path``, sorted lexicographically.
        An empty list when ``dir_path`` is not an existing directory.
    """
    if not os.path.isdir(dir_path):
        return []
    wavs = sorted(
        os.path.join(dir_path, name)
        for name in os.listdir(dir_path)
        if name.lower().endswith(".wav")
    )
    # Compare against None explicitly: a truthiness test would treat limit=0
    # as "no limit" and return every file.
    return wavs if limit is None else wavs[:limit]

def opus_roundtrip_load(wav_path, sr_target=SR_TARGET):
    """Load audio and pass it through a WebM/Opus encode-decode cycle.

    The file is converted to mono at ``sr_target``, exported in memory as
    WebM/Opus at 32 kbit/s, decoded again, and returned as a float waveform.

    Args:
        wav_path: Path of the audio file to read.
        sr_target: Sample rate (Hz) applied before encoding and after decoding.

    Returns:
        tuple: ``(samples, sr_target)`` where ``samples`` is a 1-D
        ``np.float32`` array clipped to ``[-1, 1]``.
    """
    source = AudioSegment.from_file(wav_path)
    source = source.set_channels(1).set_frame_rate(sr_target)

    buf = io.BytesIO()
    source.export(buf, format="webm", codec="libopus", bitrate="32k")  # ~browser-like
    buf.seek(0)

    decoded = AudioSegment.from_file(buf, format="webm")
    decoded = decoded.set_channels(1).set_frame_rate(sr_target)

    arr = np.array(decoded.get_array_of_samples()).astype(np.float32)

    # Scale by the full-scale value of signed integer PCM at this sample width
    # (2 bytes -> 32768, 4 bytes -> 2147483648). Deriving it from sample_width
    # means an unexpected width is still normalised instead of being left at
    # integer magnitude and flattened by the clip below.
    # NOTE(review): assumes pydub always yields signed samples — confirm for 8-bit input.
    arr /= float(1 << (8 * decoded.sample_width - 1))

    # Ensure [-1, 1]
    arr = np.clip(arr, -1.0, 1.0)
    return arr, sr_target

def light_aug(y, sr):
    """Apply mild random augmentation: gain jitter, time-stretch, optional noise.

    Randomness comes from the global ``np.random`` state.

    Args:
        y: 1-D waveform; the result is clipped to ``[-1, 1]`` after the gain
            and noise steps.
        sr: Sample rate. Not used by the current augmentations; kept so the
            call signature matches the other audio helpers.

    Returns:
        The augmented waveform. Its length differs from the input when the
        time-stretch is applied.
    """
    # Gain jitter of +/-0.6 dB, converted to a linear factor.
    gain = 10 ** (np.random.uniform(-0.6, 0.6) / 20)
    y = np.clip(y * gain, -1.0, 1.0)

    rate = np.random.uniform(0.95, 1.05)
    try:
        # `rate` must be passed by keyword: recent librosa releases reject the
        # positional form with a TypeError, which the handler below used to
        # swallow — so the stretch silently never ran.
        y = librosa.effects.time_stretch(y, rate=rate)
    except Exception as exc:
        # Stay best-effort, but make the skipped step visible.
        warnings.warn(f"time_stretch skipped: {exc}")

    # Add low-level Gaussian noise to roughly half of the samples.
    if np.random.rand() < 0.5:
        noise = np.random.randn(len(y)).astype(np.float32) * 0.004
        y = np.clip(y + noise, -1.0, 1.0)
    return y

def extract_features_320_from_array(y, sr, n_mfcc=N_MFCC):
    """Summarise a waveform as a fixed-length vector of frame statistics.

    Steps: trim leading/trailing silence (30 dB threshold), pad to at least
    0.5 s, peak-normalise, then compute MFCCs with first and second deltas plus
    zero-crossing rate, spectral centroid, bandwidth and 95 % roll-off. Every
    per-frame track is reduced to five statistics: mean, std, median, 5th and
    95th percentile.

    Args:
        y: 1-D waveform.
        sr: Sample rate of ``y`` in Hz.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        np.ndarray: ``float32`` vector of length ``15 * n_mfcc + 20``
        (320 for the default ``n_mfcc=20``). Non-finite entries are set to 0.
    """
    y, _ = librosa.effects.trim(y, top_db=30)

    # Pad to at least half a second measured at the signal's own rate; using
    # the global target rate here would mis-size the padding whenever a caller
    # passes audio at a different `sr`.
    min_len = int(0.5 * sr)
    if len(y) < min_len:
        y = np.pad(y, (0, min_len - len(y)))

    # Peak-normalise; an all-zero (or empty) signal is left as-is.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak

    # MFCCs and their first/second-order deltas
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    d1   = librosa.feature.delta(mfcc)
    d2   = librosa.feature.delta(mfcc, order=2)

    # Spectral stats ([0] takes the single row each of these returns)
    zcr  = librosa.feature.zero_crossing_rate(y)[0]
    cent = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    bw   = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    roll = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.95)[0]

    def stats(v):
        """Five summary statistics of one per-frame track."""
        return np.array([np.mean(v), np.std(v), np.median(v),
                         np.percentile(v, 5), np.percentile(v, 95)],
                        dtype=np.float32)

    def agg(mat):
        """Apply stats() to every row of a (coefficients x frames) matrix."""
        return np.concatenate([stats(row) for row in mat], axis=0)

    feat = np.concatenate([
        agg(mfcc),   # n_mfcc * 5
        agg(d1),     # n_mfcc * 5
        agg(d2),     # n_mfcc * 5
        stats(zcr),  # 5
        stats(cent), # 5
        stats(bw),   # 5
        stats(roll)  # 5  -> 15 * n_mfcc + 20 total
    ], axis=0).astype(np.float32)
    feat[~np.isfinite(feat)] = 0.0
    return feat

def extract_features(path):
    """Turn an audio file into the feature vector used for training.

    The file is loaded through the Opus round-trip loader at ``SR_TARGET``,
    optionally augmented when ``USE_AUG`` is set, and then featurized.
    """
    waveform, rate = opus_roundtrip_load(path, SR_TARGET)
    prepared = light_aug(waveform, rate) if USE_AUG else waveform
    return extract_features_320_from_array(prepared, rate)

# Build balanced file lists
# Natives: (path, speaker) pairs, capped per speaker.
native_sources = ((NATIVE_DIRS[0], "bdl"), (NATIVE_DIRS[1], "slt"))
native_items = [
    (wav_path, speaker)
    for folder, speaker in native_sources
    for wav_path in list_wavs(folder, limit=NATIVE_FILES_PER_SPK)
]

native_total = len(native_items)
if not native_total:
    raise RuntimeError("No native files found. Check NATIVE_DIRS paths.")

# Non-natives: speaker -> shuffled list of that speaker's (capped) files.
nonnative_by_spk = {}
if os.path.isdir(NONNATIVE_ROOT):
    for speaker in sorted(os.listdir(NONNATIVE_ROOT)):
        wav_dir = os.path.join(NONNATIVE_ROOT, speaker, "wav")
        if not os.path.isdir(wav_dir):
            continue
        speaker_wavs = list_wavs(wav_dir, limit=NONNATIVE_FILES_PER_SPK)
        if not speaker_wavs:
            continue
        # Shuffle so the round-robin pick below does not always take the
        # alphabetically-first files of each speaker.
        random.shuffle(speaker_wavs)
        nonnative_by_spk[speaker] = speaker_wavs

if len(nonnative_by_spk) == 0:
    raise RuntimeError("No non-native files found. Check NONNATIVE_ROOT structure.")

def round_robin_select(spk2files, target):
    """Pick up to ``target`` files by cycling through the speakers in turn.

    Each pass takes the next unused file from every speaker that still has
    one, in the mapping's iteration order. Selection stops as soon as
    ``target`` items are collected or every speaker is exhausted.

    Args:
        spk2files: Mapping of speaker id -> ordered list of file paths.
        target: Number of items wanted.

    Returns:
        list[tuple]: ``(path, speaker)`` pairs in pick order; shorter than
        ``target`` when there are not enough files.
    """
    cursors = dict.fromkeys(spk2files, 0)
    chosen = []
    while len(chosen) < target:
        progressed = False
        for speaker, files in spk2files.items():
            pos = cursors[speaker]
            if pos >= len(files):
                continue  # this speaker has nothing left to give
            chosen.append((files[pos], speaker))
            cursors[speaker] = pos + 1
            progressed = True
            if len(chosen) >= target:
                break
        if not progressed:
            # A full pass added nothing: every speaker is exhausted.
            break
    return chosen

# Match the non-native count to the native count, spreading picks across speakers.
selected_nonnat = round_robin_select(nonnative_by_spk, native_total)

# Count the native speakers that actually contributed files rather than
# hard-coding the number, so the summary stays correct if a directory is
# missing or NATIVE_DIRS changes.
native_spk_count = len({speaker for _path, speaker in native_items})
print(f"Planned natives: {native_total} (from {native_spk_count} speakers)")
print(f"Planned non-natives: {len(selected_nonnat)} (from {len(nonnative_by_spk)} speakers)")
if len(selected_nonnat) < native_total:
    print(f" Not enough non-native files to match natives (short by {native_total - len(selected_nonnat)})")

# Extract features (with tqdm)
# X: feature rows, y: labels (0 = native, 1 = non-native),
# groups: speaker id per row, paths: source file per row.
X, y, groups, paths = [], [], [], []
t0 = time.perf_counter()

# One entry per class: (banner, items, label, progress-bar title, warning prefix).
extraction_plan = [
    ("\nExtracting native features...", native_items, 0, "Natives", "Native"),
    ("Extracting non-native features...", selected_nonnat, 1, "Non-Natives", "Non-native"),
]
for banner, items, label, bar_title, who in extraction_plan:
    print(banner)
    for p, spk in tqdm(items, desc=bar_title, dynamic_ncols=True):
        try:
            feat = extract_features(p)
        except Exception as e:
            # A single unreadable file should not abort the run; report and skip it.
            warnings.warn(f"{who} file failed: {p} -> {e}")
            continue
        X.append(feat); y.append(label); groups.append(spk); paths.append(p)

# np.vstack raises an unhelpful error on an empty list, so fail with a clear
# message when every file was skipped.
if not X:
    raise RuntimeError("Feature extraction failed for every file; nothing to train on.")

X = np.vstack(X).astype(np.float32)
y = np.asarray(y, dtype=np.int64)
groups = np.asarray(groups)

print(f"\nLoaded {len(y)} items in {time.perf_counter()-t0:.1f}s")
print("Counts -> Native(0):", int((y==0).sum()), " Non-Native(1):", int((y==1).sum()))
print("Unique speakers:", len(np.unique(groups)))


# Manual 2-fold speaker-disjoint CV
# Fold A validates on the first native speaker (sorted) + half of the non-native speakers
# Fold B validates on the second native speaker + the other half
native_spk = sorted(set(groups[y == 0]))
if len(native_spk) < 2:
    raise RuntimeError(f"Need at least 2 native speakers; found {len(native_spk)}")

nonnat_spk = sorted(set(groups[y == 1]))
random.shuffle(nonnat_spk)
half = len(nonnat_spk) // 2
halfA = set(nonnat_spk[:half])
halfB = set(nonnat_spk[half:])

# Validation speaker sets; everything outside a set trains that fold.
valA = {native_spk[0]} | halfA
valB = {native_spk[1]} | halfB

in_val_A = np.isin(groups, list(valA))
in_val_B = np.isin(groups, list(valB))
train_idx_A, val_idx_A = np.flatnonzero(~in_val_A), np.flatnonzero(in_val_A)
train_idx_B, val_idx_B = np.flatnonzero(~in_val_B), np.flatnonzero(in_val_B)

def has_both(idx):
    """True when the rows selected by ``idx`` contain exactly two distinct labels."""
    return np.unique(y[idx]).size == 2

if not all(has_both(ix) for ix in (train_idx_A, val_idx_A, train_idx_B, val_idx_B)):
    raise RuntimeError("A fold lost a class; adjust the per-speaker caps to ensure both classes in train/val.")

for fold_name, tr_idx, va_idx in (("A", train_idx_A, val_idx_A), ("B", train_idx_B, val_idx_B)):
    lead = "\n" if fold_name == "A" else ""
    print(f"{lead}Fold {fold_name} class counts (train / val):",
          np.bincount(y[tr_idx], minlength=2).tolist(),
          np.bincount(y[va_idx], minlength=2).tolist())

cv_folds = [(train_idx_A, val_idx_A), (train_idx_B, val_idx_B)]


# Grid search (LogReg) with 2-fold CV
# Standardise the features, then fit a class-balanced logistic regression.
scaler_step = ('scaler', StandardScaler())
clf_step = ('clf', LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_SEED,
))
pipe = Pipeline(steps=[scaler_step, clf_step])

print("\nFitting GridSearchCV...")
grid = GridSearchCV(
    estimator=pipe,
    param_grid=PARAM_GRID,
    scoring='f1_macro',   # macro F1 to balance both classes
    cv=cv_folds,          # the hand-built speaker-disjoint folds
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)
grid.fit(X, y)

best = grid.best_estimator_
print("\nBest params:", grid.best_params_)
print("Best CV macro-F1 (2-fold):", grid.best_score_)

# Validate on each fold with the best hyper-parameters.
# `best` is grid.best_estimator_, which GridSearchCV refits on ALL of X, y.
# Predicting a validation fold with it would score rows the model was trained
# on, so an unfitted clone is trained on each fold's training split instead.
from sklearn.base import clone

def _fit_and_report_fold(fold_name, val_speakers, train_idx, val_idx):
    """Fit a fresh copy of `best` on one fold's training rows and report on its validation rows.

    Args:
        fold_name: Label used in the printed header (e.g. "A").
        val_speakers: Speaker ids held out in this fold (printed for reference).
        train_idx: Row indices used for fitting.
        val_idx: Row indices used for evaluation.

    Returns:
        The predictions for ``X[val_idx]``.
    """
    fold_model = clone(best).fit(X[train_idx], y[train_idx])
    y_pred = fold_model.predict(X[val_idx])
    print(f"\n=== Validation on Fold {fold_name} (val speakers:", sorted(list(val_speakers)), ") ===")
    print(classification_report(y[val_idx], y_pred, target_names=["Native", "Non-Native"]))
    print("Confusion matrix:\n", confusion_matrix(y[val_idx], y_pred))
    return y_pred

y_pred_A = _fit_and_report_fold("A", valA, train_idx_A, val_idx_A)
y_pred_B = _fit_and_report_fold("B", valB, train_idx_B, val_idx_B)

# Refit final model on ALL data and save
# Same pipeline shape as the grid search, with C fixed to the winning value.
best_C = best.get_params()['clf__C']
final_clf = LogisticRegression(
    C=best_C,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_SEED,
)
final = Pipeline(steps=[('scaler', StandardScaler()), ('clf', final_clf)])

print("\nRefitting final model on ALL data...")
final.fit(X, y)

# Persist the fitted pipeline. os.path.dirname() returns "" when MODEL_OUT is a
# bare filename, and os.makedirs("") raises, so only create the directory when
# the path actually has one.
model_dir = os.path.dirname(MODEL_OUT)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
joblib.dump(final, MODEL_OUT)
print(f"\nSaved model to: {MODEL_OUT}")

def predict_file(path, model=final):
    """Classify one audio file and print the verdict with class probabilities.

    The file goes through the same Opus round-trip loader and feature
    extractor as the training data.

    Args:
        path: Audio file to classify.
        model: Fitted pipeline exposing a ``'clf'`` step; defaults to ``final``.

    Returns:
        tuple: ``(pred, (p_native, p_non_native))`` with ``pred`` 0 for native
        and 1 for non-native.
    """
    waveform, rate = opus_roundtrip_load(path, SR_TARGET)
    row = extract_features_320_from_array(waveform, rate).reshape(1, -1)

    proba = model.predict_proba(row)[0]
    # Look probabilities up by class label rather than assuming column order.
    class_order = list(model.named_steps['clf'].classes_)
    p_native = float(proba[class_order.index(0)])
    p_non = float(proba[class_order.index(1)])

    pred = int(model.predict(row)[0])
    if pred == 0:
        label = "Native"
    else:
        label = "Non-Native"

    summary = f"(p_native={p_native:.2f}, p_non_native={p_non:.2f})"
    print(os.path.basename(path), "->", label, summary)
    return pred, (p_native, p_non)



Planned natives: 160 (from 2 speakers)
Planned non-natives: 160 (from 24 speakers)

Extracting native features...


Natives:   0%|          | 0/160 [00:00<?, ?it/s]

Extracting non-native features...


Non-Natives:   0%|          | 0/160 [00:00<?, ?it/s]


Loaded 320 items in 157.2s
Counts -> Native(0): 160  Non-Native(1): 160
Unique speakers: 26

Fold A class counts (train / val): [80, 78] [80, 82]
Fold B class counts (train / val): [80, 82] [80, 78]

Fitting GridSearchCV...
Fitting 2 folds for each of 5 candidates, totalling 10 fits

Best params: {'clf__C': 4}
Best CV macro-F1 (2-fold): 0.9715595047527616

=== Validation on Fold A (val speakers: ['ASI', 'BWC', 'ERMS', 'HJK', 'HQTV', 'LXC', 'MBMPS', 'NCC', 'PNV', 'SVBI', 'YBAA', 'YKWK', 'bdl'] ) ===
              precision    recall  f1-score   support

      Native       1.00      1.00      1.00        80
  Non-Native       1.00      1.00      1.00        82

    accuracy                           1.00       162
   macro avg       1.00      1.00      1.00       162
weighted avg       1.00      1.00      1.00       162

Confusion matrix:
 [[80  0]
 [ 0 82]]

=== Validation on Fold B (val speakers: ['ABA', 'EBVS', 'HKK', 'NJS', 'RRBI', 'SKA', 'THV', 'TLV', 'TNI', 'TXHC', 'YDCK', 'ZHAA',

In [8]:
import joblib, numpy as np, os

def extract_features(path, sr_target=16000, n_mfcc=20):
    """Load an audio file with librosa and summarise it as a feature vector.

    The signal is resampled to ``sr_target`` (mono), silence-trimmed at 30 dB,
    padded to at least 0.5 s and peak-normalised. MFCCs, their first and
    second deltas, zero-crossing rate, spectral centroid, bandwidth and 95 %
    roll-off are each reduced to mean, std, median, 5th and 95th percentile.

    NOTE(review): this reads the file directly with ``librosa.load``, whereas
    the training cell loads audio through ``opus_roundtrip_load`` — confirm the
    saved model is meant to be used with features produced this way.

    Returns:
        np.ndarray: ``float32`` vector of length ``15 * n_mfcc + 20``;
        non-finite entries are replaced by 0.
    """
    import librosa, numpy as np

    signal, _ = librosa.load(path, sr=sr_target, mono=True)
    signal, _ = librosa.effects.trim(signal, top_db=30)

    # Right-pad with zeros up to half a second.
    shortfall = int(0.5 * sr_target) - len(signal)
    if shortfall > 0:
        signal = np.pad(signal, (0, shortfall))

    peak = np.max(np.abs(signal)) if signal.size else 0.0
    if peak > 0:
        signal = signal / peak

    mfcc = librosa.feature.mfcc(y=signal, sr=sr_target, n_mfcc=n_mfcc)
    delta1 = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)
    zcr = librosa.feature.zero_crossing_rate(signal)[0]
    centroid = librosa.feature.spectral_centroid(y=signal, sr=sr_target)[0]
    bandwidth = librosa.feature.spectral_bandwidth(y=signal, sr=sr_target)[0]
    rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr_target, roll_percent=0.95)[0]

    def summarize(track):
        """Five summary statistics of one per-frame track."""
        return np.array(
            [np.mean(track), np.std(track), np.median(track),
             np.percentile(track, 5), np.percentile(track, 95)],
            dtype=np.float32,
        )

    def summarize_rows(mat):
        """Concatenate summarize() over every row of a matrix."""
        return np.concatenate([summarize(row) for row in mat], axis=0)

    pieces = [
        summarize_rows(mfcc),
        summarize_rows(delta1),
        summarize_rows(delta2),
        summarize(zcr),
        summarize(centroid),
        summarize(bandwidth),
        summarize(rolloff),
    ]
    feat = np.concatenate(pieces, 0).astype(np.float32)
    feat[~np.isfinite(feat)] = 0.0
    return feat

# Load the trained pipeline from the current working directory.
# NOTE(review): joblib files are pickles — only load artifacts you produced yourself.
# NOTE(review): the training cell writes to MODEL_OUT, not ./model.joblib — confirm
# this is the same artifact.
m = joblib.load("model.joblib")

def predict_file(path):
    """Classify one audio file with the loaded model ``m`` and print the result.

    Args:
        path: Audio file to classify.

    Returns:
        tuple: ``(pred, p_native, p_non_native)``. ``pred`` is 0 for native and
        1 for non-native; a probability is ``None`` when the model has no such
        class.
    """
    x = extract_features(path).reshape(1, -1)
    proba = m.predict_proba(x)[0]

    # A Pipeline keeps classes_ on its 'clf' step; a bare estimator has it directly.
    steps = getattr(m, 'named_steps', {})
    cls = list(steps['clf'].classes_) if 'clf' in steps else list(m.classes_)
    p_native = float(proba[cls.index(0)]) if 0 in cls else None
    p_non    = float(proba[cls.index(1)]) if 1 in cls else None

    pred = int(m.predict(x)[0])
    label = "Native" if pred == 0 else "Non-Native"

    def _fmt(p):
        # A missing class yields None, which cannot take the ':.2f' format spec.
        return "n/a" if p is None else f"{p:.2f}"

    print(os.path.basename(path), "->", label,
          f"(p_native={_fmt(p_native)}, p_non_native={_fmt(p_non)})")
    return pred, p_native, p_non


predict_file("cmu_us_bdl_arctic/wav/arctic_a0009.wav")  # file from a native-speaker corpus directory
predict_file("nonnative/LXC/wav/arctic_a0003.wav")  # file from the non-native corpus


arctic_a0009.wav -> Native (p_native=1.00, p_non_native=0.00)
arctic_a0003.wav -> Non-Native (p_native=0.00, p_non_native=1.00)


(1, 7.675481947904457e-08, 0.9999999232451806)