In [90]:
"""
Benchmarking 11 ML Algorithms for Anomaly Detection in Survey Research (v3)
===========================================================================
- AE: trained on normals only, deeper architecture, 97.5th pct threshold
- 7 of the 8 transductive algorithms: uniform 97.5th percentile threshold
- Stray (the 8th transductive algorithm): EVT p-value < 0.05
- OCSVM & IF: sensitivity analysis on contamination parameter
- 5-fold stratified CV for model-based algorithms
- Parameter tuning for all algorithms
"""
import numpy as np, pandas as pd
from scipy.spatial.distance import cdist
from scipy.stats import gumbel_r
from sklearn.neighbors import NearestNeighbors, LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (roc_auc_score, precision_score, recall_score,
                             f1_score, confusion_matrix, average_precision_score)
import warnings
warnings.filterwarnings('ignore')


In [91]:
# Cell 2

# ============================================================
# 1. DATA
# ============================================================
# Banner for this stage of the console log.
print("="*70, "STEP 1: Data", "="*70, sep="\n")

# Load the simulated survey file; the adjacent raw-string literals are
# concatenated into a single path.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
df = pd.read_csv(
    r'G:\\My Drive\\WorkingFolder\\AI and Machine learning\\'
    r'AI in detecting aberrant response patterns\\2 Machine learning comparisons for anomaly\\'
    r'complex_survey_sim_3000.csv'
  # r'complex_survey_sim_1800.csv'
)

# Item responses are all columns whose name starts with 'Item'.
item_cols = [col for col in df.columns if col.startswith('Item')]
X_raw = df[item_cols].values.astype(np.float64)

# Per-row anomaly flag (0/1) and response-style label.
labels = df['is_anomaly'].values
styles = df['style'].values

# n rows by p item columns.
n, p = X_raw.shape
anomaly_types = [
    'acquiescence', 'extreme', 'careless',
    'random', 'straightline', 'alternating',
]


STEP 1: Data


In [92]:
# Cell 3
# Data dimensions and the normal / anomaly split.
print(f"Data: {n} x {p}, Normal={sum(labels==0)}, Anomaly={sum(labels==1)}")

# One line per anomaly style with its row count.
for s in anomaly_types:
    print(f"  {s}: {sum(styles==s)}")

Data: 3000 x 85, Normal=2700, Anomaly=300
  acquiescence: 72
  extreme: 60
  careless: 60
  random: 48
  straightline: 36
  alternating: 24


In [93]:
# Cell 4
# Impute + scale
# Column-wise median imputation: each NaN is replaced by the median of the
# non-NaN values in its own column (the medians broadcast across rows).
nan_mask = np.isnan(X_raw)
X_imp = np.where(nan_mask, np.nanmedian(X_raw, axis=0), X_raw)

# Two scaled copies of the imputed matrix: min-max scaled and standardised.
X_01 = MinMaxScaler().fit_transform(X_imp)
X_std = StandardScaler().fit_transform(X_imp)

# Observed-entry indicator: 1.0 where the raw value was present, 0.0 where it was NaN.
M_raw = (~nan_mask).astype(float)
print(f"Missing: {nan_mask.sum()}")

Missing: 0


In [94]:
# Cell 5
# ============================================================
# 2. TUNE k
# ============================================================
print("\n"+"="*70, "STEP 2: Tune k", "="*70, sep="\n")

# Candidate neighbourhood sizes; the one with the highest LOF AUC-ROC wins.
# NOTE(review): the selection uses the ground-truth labels of the same data
# that is evaluated later.
k_cands = [5, 10, 15, 20]
best_k, best_auc_k = 10, 0
for k in k_cands:
    lof_t = LocalOutlierFactor(n_neighbors=k, contamination='auto', novelty=False)
    lof_t.fit_predict(X_std)
    # Sign-flip so that larger values mean "more outlying".
    s = -lof_t.negative_outlier_factor_
    a = roc_auc_score(labels, s)
    print(f"  k={k:2d}: LOF AUC={a:.4f}")
    if a > best_auc_k:
        best_auc_k = a
        best_k = k

K = best_k
print(f"Selected k={K}")


STEP 2: Tune k
  k= 5: LOF AUC=0.9390
  k=10: LOF AUC=0.9516
  k=15: LOF AUC=0.9532
  k=20: LOF AUC=0.9535
Selected k=20


In [95]:
# Cell 6
# Compute each row's nearest neighbours once; the detectors below read their
# neighbourhoods from dist_all / idx_all.
K_C = K + 5
nn = NearestNeighbors(n_neighbors=K_C + 1, metric='euclidean', n_jobs=-1)
nn.fit(X_std)
dist_all, idx_all = nn.kneighbors(X_std)

# Drop column 0 of both arrays (one extra neighbour was requested above).
# NOTE(review): this presumes column 0 is the query row itself; with exact
# duplicate rows that ordering is not guaranteed — confirm.
dist_all, idx_all = dist_all[:, 1:], idx_all[:, 1:]

In [96]:
# Cell 7
# Percentile used as the flagging cut-off: it is the default `pct` of
# classify_pct and the percentile applied to the autoencoder's training errors.
THR_PCT = 97.5
print(f"Threshold: {THR_PCT}th percentile")

Threshold: 97.5th percentile


In [97]:
# Cell 8
# ============================================================
# 3. TRANSDUCTIVE ALGORITHMS
# ============================================================
print("\n"+"="*70, "STEP 3: 8 Transductive algorithms", "="*70, sep="\n")

# Result stores keyed by algorithm name: anomaly scores and 0/1 flags.
scores = {}
predictions = {}


STEP 3: 8 Transductive algorithms


In [98]:
# Cell 9
def classify_pct(s, pct=THR_PCT):
    """Flag the scores lying strictly above their own `pct`-th percentile.

    Parameters
    ----------
    s : numpy array of anomaly scores.
    pct : percentile used as the cut-off. The default is the value THR_PCT
        holds at the moment this function is defined.

    Returns
    -------
    (flags, t) : `flags` is an int array with 1 where ``s > t`` and 0
        elsewhere; `t` is the percentile value itself.
    """
    t = np.percentile(s, pct)
    flags = (s > t).astype(int)
    return flags, t

In [99]:
# Cell 10
# 1. KNN-AGG
print("  [1] KNN-AGG")
K_MIN = max(1, K - 5)

# Score = sum of the distances held in columns K_MIN..K-1 of dist_all.
# NOTE(review): column 0 is the first retained neighbour, so this slice covers
# the (K_MIN+1)-th through K-th neighbours; if k_min is meant to be inclusive
# the slice would start at K_MIN-1 — confirm the intended range.
knn_s = dist_all[:, K_MIN:K].sum(axis=1)
scores['KNN-AGG'] = knn_s
predictions['KNN-AGG'], t = classify_pct(knn_s)
print(f"      k_min={K_MIN},k_max={K}, thr={t:.3f}, flagged={predictions['KNN-AGG'].sum()}")

  [1] KNN-AGG
      k_min=15,k_max=20, thr=56.749, flagged=75


In [100]:
# Cell 11
# 2. LOF
print("  [2] LOF")
lof = LocalOutlierFactor(
    n_neighbors=K,
    contamination='auto',
    novelty=False,
    n_jobs=-1,
)
lof.fit_predict(X_std)

# Sign-flip negative_outlier_factor_ so that higher = more outlying.
lof_s = -lof.negative_outlier_factor_
scores['LOF'] = lof_s
predictions['LOF'], t = classify_pct(lof_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['LOF'].sum()}")

  [2] LOF
      k=20, thr=1.423, flagged=75


In [101]:
# Cell 12
# 3. COF
# Connectivity-style outlier score: for every row, a weighted chaining cost
# over its K nearest neighbours divided by the average K-NN distance of those
# neighbours.
print("  [3] COF")
cof_s = np.zeros(n)
for i in range(n):
    # rem = neighbours not yet linked into the chain; vis = rows already in it
    # (the chain starts at the row itself).
    nbs = idx_all[i,:K]; cost=0.0; rem=list(nbs); vis=[i]
    for step in range(K):
        # Greedy step: pick the remaining neighbour that is closest to any
        # already-visited row; ties keep the first candidate found.
        md=np.inf; best=-1
        for r in rem:
            for v in vis:
                d=np.linalg.norm(X_std[v]-X_std[r])
                if d<md: md=d; best=r
        # Earlier links weigh more: the factor (K-step)/K falls from 1 to 1/K.
        if best>=0: cost+=md*(K-step)/K; rem.remove(best); vis.append(best)
    # Normaliser: mean, over the neighbours, of each neighbour's own mean
    # distance to its K nearest rows; 1e-10 below guards the division.
    nc = np.mean([np.mean(dist_all[nb,:K]) for nb in nbs])
    cof_s[i] = cost/(nc+1e-10)
scores['COF']=cof_s; predictions['COF'],t=classify_pct(cof_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['COF'].sum()}")

  [3] COF
      k=20, thr=10.542, flagged=75


In [102]:
# Cell 13
# 4. INFLO
print("  [4] INFLO")
inflo_s = np.zeros(n)

# Reverse neighbours: rnn[j] collects every row i that lists j among its K nearest.
rnn = [set() for _ in range(n)]
for i in range(n):
    for j in idx_all[i, :K]:
        rnn[j].add(i)

for i in range(n):
    # Influence space = the K nearest neighbours united with the reverse neighbours.
    inf_set = set(idx_all[i, :K]) | rnn[i]
    if not inf_set:
        inflo_s[i] = 1.0
        continue
    # Density proxy: reciprocal of the mean K-NN distance (1e-10 avoids 1/0).
    di = 1.0 / (np.mean(dist_all[i, :K]) + 1e-10)
    dn = [1.0 / (np.mean(dist_all[j, :K]) + 1e-10) for j in inf_set]
    # Mean density of the influence space relative to the row's own density.
    inflo_s[i] = np.mean(dn) / (di + 1e-10)

scores['INFLO'] = inflo_s
predictions['INFLO'], t = classify_pct(inflo_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['INFLO'].sum()}")

  [4] INFLO
      k=20, thr=1.467, flagged=75


In [103]:
# Cell 14
# 5. KDEOS
print("  [5] KDEOS")
kde_r = np.zeros(n)
for i in range(n):
    # Bandwidth = distance stored in column K-1, floored at 1e-10.
    bw = max(dist_all[i, K-1], 1e-10)
    # Mean Gaussian-kernel value over the K neighbours, divided by the bandwidth.
    kde_r[i] = np.mean(np.exp(-0.5 * (dist_all[i, :K] / bw)**2)) / (bw + 1e-10)

# Standardise the densities and negate them: low density -> high outlier score.
mu_k = np.mean(kde_r)
std_k = np.std(kde_r) + 1e-10
kdeos_s = -(kde_r - mu_k) / std_k

scores['KDEOS'] = kdeos_s
predictions['KDEOS'], t = classify_pct(kdeos_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['KDEOS'].sum()}")

  [5] KDEOS
      k=20, thr=2.776, flagged=75


In [104]:
# Cell 15
# 6. LDF
print("  [6] LDF")

# Pass 1: per-row density estimate from a Gaussian kernel over its K neighbour
# distances, with the distance in column K-1 (+1e-10) as bandwidth.
lde = np.zeros(n)
for i in range(n):
    bw = dist_all[i, K-1] + 1e-10
    lde[i] = np.mean(np.exp(-0.5 * (dist_all[i, :K] / bw)**2)) / bw

# Pass 2: score = mean density of the K neighbours relative to the row's own density.
ldf_s = np.zeros(n)
for i in range(n):
    ldf_s[i] = np.mean(lde[idx_all[i, :K]]) / (lde[i] + 1e-10)

scores['LDF'] = ldf_s
predictions['LDF'], t = classify_pct(ldf_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['LDF'].sum()}")

  [6] LDF
      k=20, thr=1.473, flagged=75


In [105]:
# Cell 16
# 7. LDOF
print("  [7] LDOF")
ldof_s = np.zeros(n)
for i in range(n):
    nbs = idx_all[i, :K]
    # di: mean distance from the row to its K neighbours.
    di = np.mean(dist_all[i, :K])
    pts = X_std[nbs]

    # Dn: mean pairwise distance among the neighbours themselves (upper
    # triangle, diagonal excluded); stays at 1.0 when there are no pairs.
    Dn = 1.0
    if len(nbs) > 1:
        pw = cdist(pts, pts, 'euclidean')
        tri = np.triu_indices(len(nbs), k=1)
        if len(tri[0]) > 0:
            Dn = np.mean(pw[tri])

    ldof_s[i] = di / (Dn + 1e-10)

scores['LDOF'] = ldof_s
predictions['LDOF'], t = classify_pct(ldof_s)
print(f"      k={K}, thr={t:.3f}, flagged={predictions['LDOF'].sum()}")

  [7] LDOF
      k=20, thr=1.237, flagged=75


In [106]:
# Cell 17
# 8. Stray
print("  [8] Stray (EVT)")

# Per row: the largest gap between consecutive neighbour distances.
mg = np.array([np.max(np.diff(dist_all[i, :K])) if K > 1 else 0 for i in range(n)])

# Moment-based Gumbel parameters: scale from the standard deviation, location
# from the mean (0.5772 approximates the Euler-Mascheroni constant).
mu_g = np.mean(mg)
sig_g = np.std(mg) + 1e-10
beta_g = sig_g * np.sqrt(6) / np.pi
mu_gum = mu_g - 0.5772 * beta_g

# Upper-tail p-value under that Gumbel; the score is -log(p), with 1e-15
# keeping the logarithm finite.
stray_pv = 1.0 - gumbel_r.cdf(mg, loc=mu_gum, scale=beta_g)
stray_s = -np.log(stray_pv + 1e-15)
scores['Stray'] = stray_s

# Flag rows whose p-value falls below alpha.
stray_alpha = 0.05
predictions['Stray'] = (stray_pv < stray_alpha).astype(int)
print(f"      k={K}, alpha={stray_alpha}, flagged={predictions['Stray'].sum()}")

  [8] Stray (EVT)
      k=20, alpha=0.05, flagged=158


In [107]:
# Cell 18
# ============================================================
# 4. AUTOENCODER (normals only, CV)
# ============================================================
print(
    "\n"+"="*70,
    "STEP 4: Autoencoder (normals only, 5-fold CV)",
    "="*70,
    sep="\n",
)
# Reminder that the model used below is a stand-in for the final implementation.
print("  NOTE: sklearn approximation. Replace with PyTorch for final paper.")


STEP 4: Autoencoder (normals only, 5-fold CV)
  NOTE: sklearn approximation. Replace with PyTorch for final paper.


# Cell 19
## PYTORCH SWAP POINT
Key features that the PyTorch implementation must preserve:
  - Architecture: D->64->32->16->8->16->32->64->D, Sigmoid output
  - Train on normals ONLY
  - Denoising: 15% dropout of observed values  
  - Masked MSE (only score observed entries)
  - Early stopping patience=12
  - Threshold: 97.5th pct of TRAINING reconstruction errors

In [108]:
# Cell 20
# Out-of-fold containers for the autoencoder: scores start as NaN and flags
# as 0 until a fold fills in its held-out rows.
ae_scores = np.full(n, np.nan)
ae_preds = np.zeros(n, dtype=int)
# Shared 5-fold stratified splitter; shuffling is seeded (random_state=42).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [109]:
# Cell 21
# Architecture tuning
# Each candidate is scored by its mean held-out AUC-ROC across the folds.
print("  Tuning architecture...")
arch_cands = [
    ((64, 32, 16, 8, 16, 32, 64), 'deep-8'),
    ((64, 32, 8, 32, 64), 'deep-8-skip'),
    ((64, 16, 64), 'moderate-16'),
    ((128, 32, 8, 32, 128), 'wide-8'),
]

# Start from the first candidate; any strictly positive mean AUC replaces it.
(best_arch, best_arch_name), best_arch_auc = arch_cands[0], 0

for arch, name in arch_cands:
    fold_aucs = []
    for tr_i, te_i in skf.split(X_01, labels):
        # Train on the normal (label 0) rows of the training fold only.
        tr_norm = tr_i[labels[tr_i] == 0]
        ae_t = MLPRegressor(
            hidden_layer_sizes=arch, activation='relu', solver='adam',
            max_iter=300, random_state=42, early_stopping=True,
            validation_fraction=0.15, n_iter_no_change=12,
            learning_rate_init=0.001, batch_size=128, tol=1e-5,
        )
        ae_t.fit(X_01[tr_norm], X_01[tr_norm])

        # Anomaly score = per-row mean squared reconstruction error.
        recon = ae_t.predict(X_01[te_i])
        err = np.mean((X_01[te_i] - recon)**2, axis=1)

        # AUC is only defined when the held-out fold holds both classes.
        if len(np.unique(labels[te_i])) > 1:
            fold_aucs.append(roc_auc_score(labels[te_i], err))

    avg = np.mean(fold_aucs) if fold_aucs else 0
    print(f"    {name:15s} {str(arch):35s} AUC={avg:.4f}")
    if avg > best_arch_auc:
        best_arch_auc = avg
        best_arch = arch
        best_arch_name = name

  Tuning architecture...
    deep-8          (64, 32, 16, 8, 16, 32, 64)         AUC=0.9385
    deep-8-skip     (64, 32, 8, 32, 64)                 AUC=0.9366
    moderate-16     (64, 16, 64)                        AUC=0.9385
    wide-8          (128, 32, 8, 32, 128)               AUC=0.9406


In [131]:
# Cell 22
# Report the architecture with the highest mean held-out AUC from the tuning loop.
print(f"  Selected: {best_arch_name} {best_arch} (AUC={best_arch_auc:.4f})")

  Selected: wide-8 (128, 32, 8, 32, 128) (AUC=0.9406)


In [132]:
# Cell 23
# Final CV run
# Per fold: fit on the training fold's normal rows, derive the threshold from
# their reconstruction errors, then score and flag the held-out rows.
print("  Running final CV...")
for fold, (tr_i, te_i) in enumerate(skf.split(X_01, labels)):
    tr_norm = tr_i[labels[tr_i] == 0]
    ae = MLPRegressor(
        hidden_layer_sizes=best_arch, activation='relu', solver='adam',
        max_iter=500, random_state=42, early_stopping=True,
        validation_fraction=0.15, n_iter_no_change=12,
        learning_rate_init=0.001, batch_size=128, tol=1e-5,
    )
    ae.fit(X_01[tr_norm], X_01[tr_norm])

    # Threshold: THR_PCT-th percentile of the training (normal-only) errors.
    recon_train = ae.predict(X_01[tr_norm])
    err_train = np.mean((X_01[tr_norm] - recon_train)**2, axis=1)
    thr_ae = np.percentile(err_train, THR_PCT)

    # Held-out rows: the mean squared reconstruction error is the anomaly score.
    recon_test = ae.predict(X_01[te_i])
    err_test = np.mean((X_01[te_i] - recon_test)**2, axis=1)
    ae_scores[te_i] = err_test
    ae_preds[te_i] = (err_test > thr_ae).astype(int)

    print(f"    Fold {fold+1}: train_n={len(tr_norm)}, thr={thr_ae:.4f}, "
          f"flagged={ae_preds[te_i].sum()}/{len(te_i)}")

  Running final CV...
    Fold 1: train_n=2160, thr=0.0476, flagged=61/600
    Fold 2: train_n=2160, thr=0.0469, flagged=77/600
    Fold 3: train_n=2160, thr=0.0492, flagged=63/600
    Fold 4: train_n=2160, thr=0.0498, flagged=58/600
    Fold 5: train_n=2160, thr=0.0494, flagged=70/600


In [133]:
# Cell 24
# Register the out-of-fold autoencoder results alongside the other algorithms.
scores['Autoencoder'] = ae_scores
predictions['Autoencoder'] = ae_preds
print(f"  Total flagged: {ae_preds.sum()}")

  Total flagged: 329


In [134]:
# Cell 25
# ============================================================
# 5. OCSVM & ISOLATION FOREST
# ============================================================
print("\n"+"="*70, "STEP 5: OCSVM & Isolation Forest", "="*70, sep="\n")


STEP 5: OCSVM & Isolation Forest


In [135]:
# Cell 26
# Grids for the sensitivity analysis: OneClassSVM `nu` and IsolationForest
# `contamination`.
nu_values = [0.05, 0.10, 0.15, 0.20, 0.25]
contam_values = [0.05, 0.10, 0.15, 0.20, 0.25]

In [136]:
# Cell 27
# OCSVM
# Choose the RBF `gamma` setting by mean held-out AUC-ROC at a fixed nu=0.15.
print("\n  OCSVM...")
best_gamma = 'scale'
best_ga = 0
for g in ['scale', 'auto']:
    fa = []
    for tr_i, te_i in skf.split(X_std, labels):
        try:
            m = OneClassSVM(kernel='rbf', gamma=g, nu=0.15)
            m.fit(X_std[tr_i])
            # decision_function is negated so that larger = more anomalous.
            fa.append(roc_auc_score(labels[te_i], -m.decision_function(X_std[te_i])))
        except Exception as exc:
            # Still best-effort (a failing fold is skipped), but the failure is
            # reported instead of silently swallowed. Unlike a bare `except`,
            # KeyboardInterrupt and SystemExit propagate, so the cell can be
            # interrupted.
            print(f"    gamma='{g}': fold skipped ({type(exc).__name__}: {exc})")
    # A gamma for which every fold failed scores 0 and can never be selected.
    avg = np.mean(fa) if fa else 0
    if avg > best_ga:
        best_ga = avg
        best_gamma = g
print(f"  gamma='{best_gamma}'")


  OCSVM...
  gamma='auto'


In [137]:
# Cell 28
# OCSVM sensitivity to nu: out-of-fold scores and decision-boundary flags for
# every nu in the grid.
ocsvm_sens = {}
for nu in nu_values:
    ns = np.zeros(n)
    np_ = np.zeros(n, dtype=int)
    for tr_i, te_i in skf.split(X_std, labels):
        m = OneClassSVM(kernel='rbf', gamma=best_gamma, nu=nu)
        m.fit(X_std[tr_i])
        d = m.decision_function(X_std[te_i])
        # Negated decision value is the anomaly score; d < 0 is the flag.
        ns[te_i] = -d
        np_[te_i] = (d < 0).astype(int)
    # AUC over the pooled out-of-fold scores of all folds.
    a = roc_auc_score(labels, ns)
    ocsvm_sens[nu] = {'scores': ns, 'preds': np_, 'n_flagged': np_.sum(), 'auc': a}
    print(f"    nu={nu:.2f}: flagged={np_.sum():4d}, AUC={a:.4f}")

# Keep the nu with the highest pooled AUC as the headline OCSVM result.
best_nu = max(nu_values, key=lambda v: ocsvm_sens[v]['auc'])
scores['OCSVM'] = ocsvm_sens[best_nu]['scores']
predictions['OCSVM'] = ocsvm_sens[best_nu]['preds']
print(f"  Best nu={best_nu}")

    nu=0.05: flagged= 182, AUC=0.8918
    nu=0.10: flagged= 339, AUC=0.8950
    nu=0.15: flagged= 483, AUC=0.8998
    nu=0.20: flagged= 631, AUC=0.9037
    nu=0.25: flagged= 771, AUC=0.9064
  Best nu=0.25


In [138]:
# Cell 29
# Isolation Forest
# Choose n_estimators by mean held-out AUC-ROC at a fixed contamination=0.15.
print("\n  Isolation Forest...")
best_ne, best_ia = 200, 0
for ne in [100, 200, 300]:
    fa = []
    for tr_i, te_i in skf.split(X_std, labels):
        m = IsolationForest(
            n_estimators=ne,
            contamination=0.15,
            random_state=42,
            n_jobs=-1,
        )
        m.fit(X_std[tr_i])
        # Negated decision value: larger = more anomalous.
        fa.append(roc_auc_score(labels[te_i], -m.decision_function(X_std[te_i])))
    avg = np.mean(fa)
    print(f"    n_est={ne}: AUC={avg:.4f}")
    if avg > best_ia:
        best_ia = avg
        best_ne = ne
print(f"  n_estimators={best_ne}")


  Isolation Forest...
    n_est=100: AUC=0.9002
    n_est=200: AUC=0.9140
    n_est=300: AUC=0.9131
  n_estimators=200


In [139]:
# Cell 30
# Isolation Forest sensitivity to contamination: out-of-fold scores and
# decision-boundary flags for every value in the grid.
if_sens = {}
for c in contam_values:
    cs = np.zeros(n)
    cp = np.zeros(n, dtype=int)
    for tr_i, te_i in skf.split(X_std, labels):
        m = IsolationForest(
            n_estimators=best_ne,
            contamination=c,
            random_state=42,
            n_jobs=-1,
        )
        m.fit(X_std[tr_i])
        d = m.decision_function(X_std[te_i])
        # Negated decision value is the anomaly score; d < 0 is the flag.
        cs[te_i] = -d
        cp[te_i] = (d < 0).astype(int)
    # AUC over the pooled out-of-fold scores of all folds.
    a = roc_auc_score(labels, cs)
    if_sens[c] = {'scores': cs, 'preds': cp, 'n_flagged': cp.sum(), 'auc': a}
    print(f"    contam={c:.2f}: flagged={cp.sum():4d}, AUC={a:.4f}")

# Keep the contamination with the highest pooled AUC as the headline result.
best_contam = max(contam_values, key=lambda v: if_sens[v]['auc'])
scores['IsolationForest'] = if_sens[best_contam]['scores']
predictions['IsolationForest'] = if_sens[best_contam]['preds']
print(f"  Best contam={best_contam}")

    contam=0.05: flagged= 155, AUC=0.9132
    contam=0.10: flagged= 323, AUC=0.9137
    contam=0.15: flagged= 489, AUC=0.9135
    contam=0.20: flagged= 629, AUC=0.9138
    contam=0.25: flagged= 784, AUC=0.9137
  Best contam=0.2


In [140]:
# Cell 31
# ============================================================
# 6. EVALUATION
# ============================================================
print("\n"+"="*70, "STEP 6: Evaluation", "="*70, sep="\n")

# Reporting order of the algorithms; it fixes the row order of every table below.
alg_names = [
    'KNN-AGG', 'LOF', 'COF', 'INFLO', 'KDEOS', 'LDF', 'LDOF',
    'Autoencoder', 'OCSVM', 'IsolationForest', 'Stray',
]


STEP 6: Evaluation


In [142]:
# Cell 32
# Threshold summary: how many rows each algorithm flagged (sum of its 0/1 predictions).
print("\n--- Threshold Summary ---")
for a in alg_names:
    print(f"  {a:20s}: flagged={predictions[a].sum():4d}")


--- Threshold Summary ---
  KNN-AGG             : flagged=  75
  LOF                 : flagged=  75
  COF                 : flagged=  75
  INFLO               : flagged=  75
  KDEOS               : flagged=  75
  LDF                 : flagged=  75
  LDOF                : flagged=  75
  Autoencoder         : flagged= 329
  OCSVM               : flagged= 771
  IsolationForest     : flagged= 629
  Stray               : flagged= 158


In [143]:
# Cell 33
# Overall
# Ranking metrics (AUC-ROC, AP) come from the continuous scores; precision,
# recall, F1 and specificity come from the 0/1 flags.
print("\n--- Overall Performance ---")
overall = {}
for a in alg_names:
    s = scores[a]
    pr = predictions[a]

    # Ranking metrics skip rows whose score is NaN.
    v = ~np.isnan(s)
    se = s[v]
    le = labels[v]
    auc_r = roc_auc_score(le, se)
    ap = average_precision_score(le, se)

    # Threshold metrics use every row.
    prec = precision_score(labels, pr, zero_division=0)
    rec = recall_score(labels, pr, zero_division=0)
    f1 = f1_score(labels, pr, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(labels, pr).ravel()
    sp = tn / (tn + fp) if (tn + fp) > 0 else 0

    overall[a] = {
        'AUC-ROC': auc_r, 'AP': ap, 'Precision': prec, 'Recall': rec,
        'F1': f1, 'Specificity': sp, 'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn,
    }
    print(f"  {a:20s} AUC={auc_r:.3f} AP={ap:.3f} P={prec:.3f} R={rec:.3f} "
          f"F1={f1:.3f} Sp={sp:.3f} (TP={tp} FP={fp} FN={fn})")


--- Overall Performance ---
  KNN-AGG              AUC=0.927 AP=0.842 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  LOF                  AUC=0.954 AP=0.902 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  COF                  AUC=0.833 AP=0.623 P=0.987 R=0.247 F1=0.395 Sp=1.000 (TP=74 FP=1 FN=226)
  INFLO                AUC=0.941 AP=0.894 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  KDEOS                AUC=0.929 AP=0.847 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  LDF                  AUC=0.954 AP=0.906 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  LDOF                 AUC=0.951 AP=0.894 P=1.000 R=0.250 F1=0.400 Sp=1.000 (TP=75 FP=0 FN=225)
  Autoencoder          AUC=0.940 AP=0.867 P=0.748 R=0.820 F1=0.782 Sp=0.969 (TP=246 FP=83 FN=54)
  OCSVM                AUC=0.906 AP=0.785 P=0.331 R=0.850 F1=0.476 Sp=0.809 (TP=255 FP=516 FN=45)
  IsolationForest      AUC=0.914 AP=0.763 P=0.397 R=0.833 F1=0.538 Sp=0.860 (TP=250 FP=379 FN=50)
  Stra

In [145]:
# Cell 34
# Per-type detection rate
# Share of each anomaly style's rows that an algorithm flagged.
print("\n--- Detection Rate by Type ---")
det_m = pd.DataFrame(index=alg_names, columns=anomaly_types, dtype=float)
for a in alg_names:
    for at in anomaly_types:
        idx = np.where(styles == at)[0]
        # A style with no rows is recorded as 0 rather than NaN.
        if len(idx) > 0:
            det_m.loc[a, at] = predictions[a][idx].mean()
        else:
            det_m.loc[a, at] = 0
print(det_m.round(3).to_string())


--- Detection Rate by Type ---
                 acquiescence  extreme  careless  random  straightline  alternating
KNN-AGG                 0.000    0.367     0.067   0.479         0.139        0.875
LOF                     0.000    0.450     0.100   0.417         0.139        0.708
COF                     0.056    0.300     0.200   0.521         0.056        0.542
INFLO                   0.000    0.450     0.117   0.500         0.139        0.500
KDEOS                   0.000    0.350     0.083   0.500         0.111        0.875
LDF                     0.000    0.450     0.100   0.438         0.139        0.667
LDOF                    0.000    0.550     0.117   0.417         0.083        0.500
Autoencoder             0.764    0.950     0.900   1.000         0.222        1.000
OCSVM                   0.764    1.000     0.867   1.000         0.444        1.000
IsolationForest         0.736    0.967     0.867   1.000         0.417        1.000
Stray                   0.014    0.033     0

In [146]:
# Cell 35
# Per-type AUC
# Each anomaly style is contrasted with the normal rows only.
print("\n--- AUC by Type ---")
auc_m = pd.DataFrame(index=alg_names, columns=anomaly_types, dtype=float)
for a in alg_names:
    s = scores[a]
    for at in anomaly_types:
        # Keep rows of this style plus all normals, minus NaN scores.
        mask = (styles == at) | (labels == 0)
        v = mask & ~np.isnan(s)
        y_s = labels[v]
        s_s = s[v]
        # AUC needs both classes; otherwise the entry is NaN.
        if len(np.unique(y_s)) > 1:
            auc_m.loc[a, at] = roc_auc_score(y_s, s_s)
        else:
            auc_m.loc[a, at] = np.nan
print(auc_m.round(3).to_string())


--- AUC by Type ---
                 acquiescence  extreme  careless  random  straightline  alternating
KNN-AGG                 0.888    0.985     0.976   1.000         0.683        1.000
LOF                     0.983    1.000     0.978   1.000         0.684        1.000
COF                     0.756    0.849     0.885   0.989         0.626        0.898
INFLO                   0.980    1.000     0.983   1.000         0.575        1.000
KDEOS                   0.896    0.985     0.980   1.000         0.674        1.000
LDF                     0.987    1.000     0.984   1.000         0.666        1.000
LDOF                    0.980    1.000     0.983   1.000         0.663        0.990
Autoencoder             0.960    0.994     0.987   1.000         0.613        1.000
OCSVM                   0.878    1.000     0.922   0.997         0.598        1.000
IsolationForest         0.874    0.984     0.936   0.995         0.674        1.000
Stray                   0.394    0.534     0.404   0.45

In [147]:
# Cell 36
# Per-type F1
# F1 on the subset made of one anomaly style plus all normal rows, so flags
# raised on normal rows count as false positives for every style.
print("\n--- F1 by Type ---")
f1_m = pd.DataFrame(index=alg_names, columns=anomaly_types, dtype=float)
for a in alg_names:
    for at in anomaly_types:
        mask = (styles == at) | (labels == 0)
        f1_m.loc[a, at] = f1_score(
            labels[mask],
            predictions[a][mask],
            zero_division=0,
        )
print(f1_m.round(3).to_string())


--- F1 by Type ---
                 acquiescence  extreme  careless  random  straightline  alternating
KNN-AGG                 0.000    0.537     0.125   0.648         0.244        0.933
LOF                     0.000    0.621     0.182   0.588         0.244        0.829
COF                     0.104    0.456     0.329   0.676         0.103        0.684
INFLO                   0.000    0.621     0.209   0.667         0.244        0.667
KDEOS                   0.000    0.519     0.154   0.667         0.200        0.933
LDF                     0.000    0.621     0.182   0.609         0.244        0.800
LDOF                    0.000    0.710     0.209   0.588         0.154        0.667
Autoencoder             0.524    0.570     0.548   0.536         0.126        0.366
OCSVM                   0.171    0.189     0.166   0.157         0.056        0.085
IsolationForest         0.210    0.233     0.212   0.202         0.070        0.112
Stray                   0.009    0.020     0.039   0.021

In [148]:
# Cell 37
# ============================================================
# 7. SAVE
# ============================================================
import os
print("\n" + "="*70, "STEP 7: Save", "="*70, sep="\n")

# Destination folder for every artefact; created if it does not exist yet.
# NOTE(review): absolute, machine-specific path.
output_dir = os.path.join(
    r"G:\My Drive\WorkingFolder",
    "AI and Machine learning",
    "AI in detecting aberrant response patterns",
    "2 Machine learning comparisons for anomaly"
)
os.makedirs(output_dir, exist_ok=True)

# Per-type tables, written in the same order as before.
for fname, frame in (
    ('v3_detection_rates.csv', det_m),
    ('v3_auc_by_type.csv', auc_m),
    ('v3_f1_by_type.csv', f1_m),
):
    frame.to_csv(os.path.join(output_dir, fname))

# Overall metrics: transposed so that each algorithm is one row.
pd.DataFrame(overall).T.to_csv(os.path.join(output_dir, 'v3_overall.csv'))



STEP 7: Save


In [149]:
# Cell 38
# Row-level export: one score column per algorithm, then the ground truth,
# the respondent id and one `<algorithm>_pred` flag column per algorithm.
sdf = pd.DataFrame(scores)
sdf['is_anomaly'] = labels
sdf['style'] = styles
sdf['respondent_id'] = df['respondent_id'].values
for a in alg_names:
    sdf[f'{a}_pred'] = predictions[a]
sdf.to_csv(os.path.join(output_dir, 'v3_raw_scores.csv'), index=False)

In [150]:
# Cell 39
# Sensitivity
# One output row per (algorithm, parameter value): flagged count, pooled AUC
# and the detection rate for each anomaly style. OCSVM rows come first, then
# the Isolation Forest rows, each in grid order.
sr = []
for alg, param, grid, sens in (
    ('OCSVM', 'nu', nu_values, ocsvm_sens),
    ('IF', 'contam', contam_values, if_sens),
):
    for val in grid:
        r = {
            'alg': alg, 'param': param, 'val': val,
            'flagged': sens[val]['n_flagged'], 'AUC': sens[val]['auc'],
        }
        for at in anomaly_types:
            r[f'det_{at}'] = sens[val]['preds'][styles == at].mean()
        sr.append(r)
pd.DataFrame(sr).to_csv(os.path.join(output_dir,'v3_sensitivity.csv'), index=False)

In [151]:
# Cell 40
# Parameters
# One row per algorithm, in alg_names order: tuned parameters, thresholding
# rule, training data, CV usage and number of flagged rows.
pd.DataFrame({
    'Algorithm':alg_names,
    # KNN-AGG records K_MIN, the clamped value actually used for scoring
    # (max(1, K-5)); the raw K-5 would be written as 0 or negative when K <= 5.
    'Params':[f'k_min={K_MIN},k_max={K}',f'k={K}',f'k={K}',f'k={K}',f'k={K}',f'k={K}',f'k={K}',
              f'arch={best_arch}',f'nu={best_nu},g={best_gamma}',
              f'c={best_contam},ne={best_ne}',f'k={K},a={stray_alpha}'],
    'Threshold':[f'{THR_PCT}pct']*7+[f'{THR_PCT}pct(train-normals)','Dec.bndry','Dec.bndry',f'EVT p<{stray_alpha}'],
    'TrainOn':['All']*7+['Normals','All(CV)','All(CV)','All'],
    'CV':['No']*7+['5-fold']*3+['No'],
    'Flagged':[predictions[a].sum() for a in alg_names]
}).to_csv(os.path.join(output_dir, 'v3_params.csv'), index=False)

In [129]:
# Cell 41
# Save objects for plotting script
# NOTE(review): in this export the cell shows execution count 129 while the
# cells before it show 131-152, so the pickle on disk may predate their last
# run — re-run this cell after a clean Restart & Run All.
import pickle
pkl_path = os.path.join(output_dir, 'v3_data.pkl')

with open(pkl_path, 'wb') as f:
    payload = {
        # Results
        'scores': scores,
        'predictions': predictions,
        'overall': overall,
        'det_m': det_m,
        'auc_m': auc_m,
        'f1_m': f1_m,
        # Naming / ordering
        'alg_names': alg_names,
        'anomaly_types': anomaly_types,
        # Sensitivity analysis
        'ocsvm_sens': ocsvm_sens,
        'if_sens': if_sens,
        'nu_values': nu_values,
        'contam_values': contam_values,
        # Ground truth
        'styles': styles,
        'labels': labels,
        # Selected settings
        'K': K,
        'THR_PCT': THR_PCT,
        'best_nu': best_nu,
        'best_contam': best_contam,
        'stray_alpha': stray_alpha,
        'best_arch': best_arch,
        'best_gamma': best_gamma,
        'best_ne': best_ne,
    }
    pickle.dump(payload, f)

print(f"Saved pickle: {pkl_path}")

Saved pickle: G:\My Drive\WorkingFolder\AI and Machine learning\AI in detecting aberrant response patterns\2 Machine learning comparisons for anomaly\v3_data.pkl


In [152]:
# Cell 42
# Closing message and end-of-run banner.
print("\nAll CSVs and data saved. Run v3_plots.py for figures.")
print("="*70, "DONE", "="*70, sep="\n")


All CSVs and data saved. Run v3_plots.py for figures.
DONE
