In [1]:
# Cell 1
"""v3 Visualization script - run after v3_main.py"""
import numpy as np, pandas as pd, pickle
import matplotlib; matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi

In [39]:
# Cell 2
from pathlib import Path

candidates = [
	Path('/home/claude/v3_data.pkl'),
	Path.cwd() / 'v3_data.pkl',
	Path.cwd().parent / 'v3_data.pkl',
	Path(
		'g:/My Drive/WorkingFolder/AI and Machine learning/'
		'AI in detecting aberrant response patterns/'
		'2 Machine learning comparisons for anomaly/v3_data.pkl'
	)
]

data_path = next((p for p in candidates if p.exists()), None)
if data_path is None:
	raise FileNotFoundError(
		"v3_data.pkl not found. Checked:\n" + "\n".join(str(p) for p in candidates)
	)

with open(data_path, 'rb') as f: D = pickle.load(f)

scores=D['scores']; preds=D['predictions']; overall=D['overall']
det_m=D['det_m']; auc_m=D['auc_m']; f1_m=D['f1_m']
alg_names=D['alg_names']; atypes=D['anomaly_types']
ocsvm_s=D['ocsvm_sens']; if_s=D['if_sens']
nu_vals=D['nu_values']; c_vals=D['contam_values']
styles=D['styles']; labels=D['labels']

In [40]:
# Cell 3
sns.set_theme(style="whitegrid", font_scale=1.05)
plt.rcParams['figure.dpi'] = 150
OUT = '/home/claude/'

In [41]:
# Cell 4
# Fig 1: Detection Rate Heatmap
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(det_m.astype(float), annot=True, fmt='.2f', cmap='YlOrRd',
            vmin=0, vmax=1, linewidths=0.5, ax=ax, cbar_kws={'label':'Detection Rate'})
ax.set_title('Detection Rate by Algorithm and Anomaly Type', fontsize=13, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir / 'v3_fig1_detection.png', bbox_inches='tight'); plt.close()
print("Fig 1")

Fig 1


In [42]:
# Cell 5
# Fig 2: AUC Heatmap
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(auc_m.astype(float), annot=True, fmt='.3f', cmap='RdYlGn',
            vmin=0.0, vmax=1.0, linewidths=0.5, ax=ax, center=0.7,
            cbar_kws={'label':'AUC-ROC'})
ax.set_title('AUC-ROC by Algorithm and Anomaly Type (One-Type vs Normal)',
             fontsize=13, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir / 'v3_fig2_auc.png', bbox_inches='tight'); plt.close()
print("Fig 2")

Fig 2


In [43]:
# Cell 6
# Fig 3: F1 Heatmap
fig, ax = plt.subplots(figsize=(10,7))
sns.heatmap(f1_m.astype(float), annot=True, fmt='.3f', cmap='RdYlGn',
            vmin=0, vmax=1, linewidths=0.5, ax=ax, center=0.5,
            cbar_kws={'label':'F1'})
ax.set_title('F1 Score by Algorithm and Anomaly Type', fontsize=13, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir /'v3_fig3_f1.png', bbox_inches='tight'); plt.close()
print("Fig 3")

Fig 3


In [44]:
# Cell 7
# Fig 4: Ranked bars by type
fig, axes = plt.subplots(2,3, figsize=(18,11))
cpal = dict(zip(alg_names, sns.color_palette("husl", len(alg_names))))
for i, at in enumerate(atypes):
    ax = axes[i//3, i%3]
    vals = det_m[at].astype(float).sort_values(ascending=True)
    ax.barh(range(len(vals)), vals.values,
            color=[cpal[a] for a in vals.index], edgecolor='white')
    ax.set_yticks(range(len(vals))); ax.set_yticklabels(vals.index, fontsize=9)
    ax.set_xlim(0,1.1); ax.set_title(f'{at.capitalize()} (n={sum(styles==at)})',
                                       fontsize=12, fontweight='bold')
    ax.axvline(0.5, color='gray', ls='--', alpha=0.4)
    for j, v in enumerate(vals.values):
        ax.text(v+0.01, j, f'{v:.2f}', va='center', fontsize=8)

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.suptitle('Algorithm Ranking by Anomaly Type', fontsize=14, fontweight='bold')
plt.tight_layout(); plt.savefig(out_dir /'v3_fig4_ranking.png', bbox_inches='tight'); plt.close()
print("Fig 4")

Fig 4


In [45]:
# Cell 8
# Fig 5: Overall performance bars
fig, ax = plt.subplots(figsize=(12,6))
metrics = ['AUC-ROC','AP','F1','Specificity']
x = np.arange(len(alg_names)); w=0.2
cols = ['#2196F3','#4CAF50','#FF9800','#9C27B0']
for i, m in enumerate(metrics):
    ax.bar(x+i*w, [overall[a][m] for a in alg_names], w, label=m, color=cols[i], alpha=0.85)
ax.set_xticks(x+1.5*w); ax.set_xticklabels(alg_names, rotation=45, ha='right', fontsize=9)
ax.set_ylim(0,1.1); ax.legend(fontsize=9); ax.axhline(0.5, color='gray', ls='--', alpha=0.4)
ax.set_title('Overall Performance', fontsize=13, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir /'v3_fig5_overall.png', bbox_inches='tight'); plt.close()
print("Fig 5")

Fig 5


In [46]:
# Cell 9
# Fig 6: Radar top 5
top5 = sorted(alg_names, key=lambda a: overall[a]['AUC-ROC'], reverse=True)[:5]
fig, ax = plt.subplots(figsize=(9,9), subplot_kw=dict(polar=True))
cr = sns.color_palette("husl", 5)
Nt = len(atypes)
angles = [i/Nt*2*pi for i in range(Nt)] + [0]
for r, alg in enumerate(top5):
    vals = [float(auc_m.loc[alg,at]) for at in atypes] + [float(auc_m.loc[alg,atypes[0]])]
    ax.plot(angles, vals, 'o-', lw=2, label=alg, color=cr[r])
    ax.fill(angles, vals, alpha=0.08, color=cr[r])
ax.set_xticks(angles[:-1]); ax.set_xticklabels([t.capitalize() for t in atypes], fontsize=11)
ax.set_ylim(0,1.05); ax.set_title('Top 5: AUC Profile', fontsize=13, fontweight='bold', pad=30)
ax.legend(loc='upper right', bbox_to_anchor=(1.35,1.1), fontsize=10)

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir /'v3_fig6_radar.png', bbox_inches='tight'); plt.close()
print("Fig 6")

Fig 6


In [47]:
# Cell 10
# Fig 7: Sensitivity
fig, axes = plt.subplots(1,2, figsize=(16,6))
for at in atypes:
    axes[0].plot(nu_vals, [ocsvm_s[nu]['preds'][styles==at].mean() for nu in nu_vals],
                 'o-', label=at.capitalize(), lw=2)
    axes[1].plot(c_vals, [if_s[c]['preds'][styles==at].mean() for c in c_vals],
                 'o-', label=at.capitalize(), lw=2)
axes[0].set_title('OCSVM: Sensitivity to nu', fontsize=13, fontweight='bold')
axes[0].set_xlabel('nu'); axes[0].set_ylabel('Detection Rate')
axes[0].legend(fontsize=8); axes[0].set_ylim(-0.05,1.05)
axes[1].set_title('IF: Sensitivity to contamination', fontsize=13, fontweight='bold')
axes[1].set_xlabel('contamination'); axes[1].set_ylabel('Detection Rate')
axes[1].legend(fontsize=8); axes[1].set_ylim(-0.05,1.05)

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.suptitle('Sensitivity Analysis', fontsize=14, fontweight='bold')
plt.tight_layout(); plt.savefig(out_dir /'v3_fig7_sensitivity.png', bbox_inches='tight'); plt.close()
print("Fig 7")

Fig 7


In [48]:
# Cell 11
# Fig 8: Agreement Jaccard
agree = pd.DataFrame(index=alg_names, columns=alg_names, dtype=float)
for a1 in alg_names:
    for a2 in alg_names:
        s1=set(np.where(preds[a1]==1)[0]); s2=set(np.where(preds[a2]==1)[0])
        u=len(s1|s2); agree.loc[a1,a2]=len(s1&s2)/u if u>0 else 0
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(agree.astype(float), annot=True, fmt='.2f', cmap='Blues',
            vmin=0, vmax=1, linewidths=0.5, ax=ax, cbar_kws={'label':'Jaccard Similarity'})
ax.set_title('Algorithm Agreement (Jaccard)', fontsize=13, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir /'v3_fig8_agreement.png', bbox_inches='tight'); plt.close()
print("Fig 8")

Fig 8


In [49]:
# Cell 12
# Fig 9: N flagged
fig, ax = plt.subplots(figsize=(10,5))
nf = [preds[a].sum() for a in alg_names]
colors_bar = ['#2196F3']*7+['#4CAF50']+['#FF9800']*2+['#E91E63']
bars = ax.bar(alg_names, nf, color=colors_bar, edgecolor='white', alpha=0.85)
ax.axhline(300, color='red', ls='--', lw=2, label='True anomalies (n=300)')
n_total = len(labels)
n_at_thr = int(n_total * (1 - D['THR_PCT']/100))
ax.axhline(n_at_thr, color='gray', ls=':', lw=1.5, label=f'Expected at {D["THR_PCT"]}th pct (n={n_at_thr})')
ax.set_ylabel('Cases Flagged'); ax.set_title('Number Flagged by Algorithm', fontsize=13, fontweight='bold')
ax.set_xticklabels(alg_names, rotation=45, ha='right', fontsize=9); ax.legend(fontsize=9)
for b, v in zip(bars, nf):
    ax.text(b.get_x()+b.get_width()/2., b.get_height()+5, str(v), ha='center', fontsize=9, fontweight='bold')

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.tight_layout(); plt.savefig(out_dir /'v3_fig9_nflagged.png', bbox_inches='tight'); plt.close()
print("Fig 9")

Fig 9


  ax.set_xticklabels(alg_names, rotation=45, ha='right', fontsize=9); ax.legend(fontsize=9)


In [50]:
# Cell 13
# Fig 10: Sensitivity AUC (overall) across parameter values
fig, axes = plt.subplots(1,2, figsize=(14,5))
axes[0].plot(nu_vals, [ocsvm_s[nu]['auc'] for nu in nu_vals], 'o-', lw=2, color='#2196F3')
axes[0].set_title('OCSVM: AUC-ROC vs nu', fontsize=13, fontweight='bold')
axes[0].set_xlabel('nu'); axes[0].set_ylabel('AUC-ROC'); axes[0].set_ylim(0.7,0.95)
n_flag_o = [ocsvm_s[nu]['n_flagged'] for nu in nu_vals]
ax2 = axes[0].twinx(); ax2.bar(nu_vals, n_flag_o, width=0.015, alpha=0.3, color='gray')
ax2.set_ylabel('N Flagged', color='gray')

Text(0, 0.5, 'N Flagged')

In [36]:
# Cell 14
axes[1].plot(c_vals, [if_s[c]['auc'] for c in c_vals], 'o-', lw=2, color='#FF9800')
axes[1].set_title('IF: AUC-ROC vs contamination', fontsize=13, fontweight='bold')
axes[1].set_xlabel('contamination'); axes[1].set_ylabel('AUC-ROC'); axes[1].set_ylim(0.7,0.95)
n_flag_i = [if_s[c]['n_flagged'] for c in c_vals]
ax3 = axes[1].twinx(); ax3.bar(c_vals, n_flag_i, width=0.015, alpha=0.3, color='gray')
ax3.set_ylabel('N Flagged', color='gray')

Text(0, 0.5, 'N Flagged')

In [51]:
# Cell 15

out_dir = Path(OUT)
if not out_dir.exists():
    out_dir = data_path.parent
out_dir.mkdir(parents=True, exist_ok=True)

plt.suptitle('Sensitivity: AUC-ROC Stability Across Parameter Values', fontsize=14, fontweight='bold')
plt.tight_layout(); plt.savefig(out_dir /'v3_fig10_sens_auc.png', bbox_inches='tight'); plt.close()
print("Fig 10")

Fig 10


In [52]:
# Cell 16
print("\nAll 10 figures saved.")


All 10 figures saved.


In [None]:
# Cell 17: Anomaly Score Distributions with 97% Cutoff
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Use the loaded data dictionary D
scores_data = D['scores']
alg_list = D['alg_names']

# Calculate number of rows/cols for subplots
n_algs = len(alg_list)
n_cols = 3
n_rows = (n_algs + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
axes = axes.flatten()

cutoff_percent = 97.0

for i, alg_name in enumerate(alg_list):
    ax = axes[i]
    if alg_name in scores_data:
        s_vals = scores_data[alg_name]
        # Filter out NaNs
        s_vals = s_vals[~np.isnan(s_vals)]
        
        # Plot distribution
        sns.histplot(s_vals, kde=True, ax=ax, color='steelblue', stat='density', alpha=0.6)
        
        # Calculate and plot cutoff line
        cutoff_val = np.percentile(s_vals, cutoff_percent)
        ax.axvline(cutoff_val, color='red', linestyle='--', linewidth=2, label=f'{cutoff_percent}% Cutoff: {cutoff_val:.2f}')
        
        ax.set_title(f'{alg_name} Distribution', fontsize=12, fontweight='bold')
        ax.set_xlabel('Anomaly Score')
        ax.legend()
    else:
        ax.text(0.5, 0.5, 'No score data available', ha='center', va='center')

# Hide unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

# Save the figure
out_path_dist = out_dir / 'v3_fig11_Score_Distributions.png'
fig.savefig(out_path_dist, dpi=300, bbox_inches='tight')
print(f"Saved: {out_path_dist}")


  plt.show()


Saved: g:\My Drive\WorkingFolder\AI and Machine learning\AI in detecting aberrant response patterns\2 Machine learning comparisons for anomaly\11_Score_Distributions.png


In [None]:
# Cell 18: Response Scale Distributions (NOTE: Requires raw data, currently not in pickle)
# To plot expected response scale distributions, we need the raw item response data (df).
# The current v3_data.pkl only contains processed results (scores, predictions, labels).

# 1. Load the original CSV to get raw responses
csv_path = data_path.parent / 'complex_survey_sim_3000.csv' 
# Fallback if 3000 not found, try 1800 or standard name
if not csv_path.exists():
    csv_path = data_path.parent / 'complex_survey_sim_1800.csv'

if csv_path.exists():
    print(f"Loading raw data from: {csv_path}")
    df_raw = pd.read_csv(csv_path)
    item_cols = [c for c in df_raw.columns if c.startswith('Item')]
    X_raw = df_raw[item_cols].values
    
    # 2. Plotting
    # We will plot the distribution of ALL item values for:
    # - Normal respondents (labels == 0)
    # - Each anomaly type (style == 'acquiescence', etc.)
    
    unique_styles = sorted(list(set(styles)))
    # Ensure 'Normal' is first if distinct, or handled via labels
    # style array usually has 'normal' or similar for normal cases, 
    # but we can use labels==0 to be sure.
    
    plot_groups = ['Normal'] + atypes
    
    n_groups = len(plot_groups)
    n_cols = 3
    n_rows = (n_groups + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
    axes = axes.flatten()
    
    # Define a consistent x-axis range (assuming 5-point Likert 1-5 or similar)
    # We check min/max from data
    min_val = np.nanmin(X_raw)
    max_val = np.nanmax(X_raw)
    bins = np.linspace(min_val - 0.5, max_val + 0.5, int(max_val - min_val + 2))
    
    for i, group_name in enumerate(plot_groups):
        ax = axes[i]
        
        if group_name == 'Normal':
            mask = (labels == 0)
            color = 'green'
        else:
            mask = (styles == group_name)
            color = 'red'
            
        if np.sum(mask) > 0:
            # Flatten all responses for this group into one big array
            group_responses = X_raw[mask].flatten()
            # Remove NaNs if any
            group_responses = group_responses[~np.isnan(group_responses)]
            
            sns.histplot(group_responses, bins=bins, ax=ax, color=color, stat='density', alpha=0.6, discrete=True)
            ax.set_title(f'{group_name.capitalize()} (n={np.sum(mask)})', fontsize=12, fontweight='bold')
            ax.set_xlabel('Response Value')
            ax.set_ylabel('Density')
            ax.set_xlim(min_val - 0.5, max_val + 0.5)
            ax.set_xticks(np.arange(min_val, max_val + 1))
        else:
            ax.text(0.5, 0.5, f'No data for {group_name}', ha='center', va='center')
            
    # Hide unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
        
    plt.tight_layout()
    plt.suptitle('Response Scale Value Distributions by Type', fontsize=16, fontweight='bold', y=1.02)
    plt.show() # Show briefly if running interactively
    
    # Save
    out_path_resp = out_dir / 'v3_fig12_Response_Distributions.png'
    fig.savefig(out_path_resp, dpi=300, bbox_inches='tight')
    print(f"Saved: {out_path_resp}")

else:
    print(f"Original CSV not found at {csv_path}. Cannot plot response distributions.")


Loading raw data from: g:\My Drive\WorkingFolder\AI and Machine learning\AI in detecting aberrant response patterns\2 Machine learning comparisons for anomaly\complex_survey_sim_3000.csv


  plt.show() # Show briefly if running interactively


Saved: g:\My Drive\WorkingFolder\AI and Machine learning\AI in detecting aberrant response patterns\2 Machine learning comparisons for anomaly\12_Response_Distributions.png
