In [None]:
# Make the repository root the working directory so that the relative paths used
# in the cells below ('src', 'data/...', 'results/...') resolve.
# NOTE(review): this absolute path is specific to one machine; adjust it before
# running the notebook elsewhere.
cd '/Users/Max/Documents/GitHub/theory-kernel-analysis/'

In [None]:
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

sys.path.append('src')
from models import KernelClassifier

### Part I changed ###

# I couldn't import a custom function from a different file in a .py script, 
# so I just ran then file containing the function in a notebook
%run '/Users/Max/Documents/GitHub/theory-kernel-analysis/src/sim_mappings.py'

# Here, I just substituted the original "MAPPINGS" with the simulated ones (either configs or aus)
MAPPINGS, list_configs = simulate_configs(500, 10)
#MAPPINGS, list_aus = simulate_aus(500, 10)

### No more changes (except data file path and .tsv file names) ###

# Define parameter names (AUs) and target label (EMOTIONS)
PARAM_NAMES = np.loadtxt('data/au_names_new.txt', dtype=str).tolist()
EMOTIONS = ['anger', 'disgust', 'fear', 'happy', 'sadness', 'surprise']

# One-hot encode target label
ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(np.arange(6)[:, np.newaxis])

# Define analysis parameters
beta = 1
kernel = 'linear'
subs = [str(s).zfill(2) for s in range(1, 61)]
scores_all, preds_all = [], []

# Directory holding the per-subject rating files (one TSV per subject).
# NOTE(review): absolute, machine-specific path; change it here only.
RATINGS_DIR = '/Users/Max/Documents/Studium/RA/AU mappings/data/ratings'

# Load and filter every subject's ratings once. The data do not depend on the
# mapping, so re-reading the files inside the mapping loop (as before) repeated
# the same disk I/O for every single mapping.
sub_data = {}
for sub in subs:
    data = pd.read_csv(f'{RATINGS_DIR}/sub-{sub}_ratings.tsv', sep='\t', index_col=0)
    sub_data[sub] = data.query("emotion != 'other'")

# ktype = kernel type (infer from kernel name); depends only on `kernel`, so it
# is computed once rather than on every iteration.
ktype = 'similarity' if kernel in ['cosine', 'sigmoid', 'linear'] else 'distance'

# Loop across mappings (Darwin, Ekman, etc.)
for mapp_name, mapp in tqdm(MAPPINGS.items()):
    # Initialize model!
    model = KernelClassifier(au_cfg=mapp, param_names=PARAM_NAMES, kernel=kernel, ktype=ktype,
                             binarize_X=False, normalization='softmax', beta=beta)

    #model = GridSearchCV(model, param_grid={'normalization': ['softmax', 'linear']})
    # Initialize scores (one score per subject and per emotion)
    scores = np.zeros((len(subs), len(EMOTIONS)))
    preds = []

    # Compute model performance per subject!
    for i, sub in enumerate(subs):
        data = sub_data[sub]
        # All columns except the last two are used as features; the second-to-last
        # column is the target label.
        X, y = data.iloc[:, :-2], data.iloc[:, -2]

        # Technically, we're not "fitting" anything, but this will set up the mapping matrix (self.Z_)
        # NOTE(review): assumes fit/predict_proba do not modify X in place — confirm in KernelClassifier.
        model.fit(X, y)

        # Predict data + compute performance (AUROC), one score per emotion column
        y_pred = pd.DataFrame(model.predict_proba(X), index=X.index, columns=EMOTIONS)
        scores[i, :] = roc_auc_score(pd.get_dummies(y), y_pred, average=None)

        # Save results
        y_pred['sub'] = sub
        y_pred['intensity'] = data['intensity']
        y_pred['y_true'] = data['emotion']
        preds.append(y_pred)

    # Store scores in long format: one row per (emotion, subject), emotion-major,
    # i.e. len(subs) * len(EMOTIONS) rows per mapping.
    scores = (
        pd.DataFrame(scores, columns=EMOTIONS, index=subs)
        .reset_index()
        .melt(id_vars='index', value_name='score', var_name='emotion')
        .rename({'index': 'sub'}, axis=1)
    )
    scores['mapping'] = mapp_name
    scores['kernel'] = kernel
    scores['beta'] = beta
    scores_all.append(scores)

    # Store raw predictions for this mapping
    preds = pd.concat(preds, axis=0)
    preds['mapping'] = mapp_name
    preds_all.append(preds)

# Make sure the output directory exists: the loop above is long-running, and a
# missing 'results/' folder would otherwise discard its output at the very end.
from pathlib import Path
Path('results').mkdir(parents=True, exist_ok=True)

# Save scores (all mappings stacked; the per-mapping row index is written as the first column)
scores = pd.concat(scores_all, axis=0)
scores.to_csv('results/scores_sim.tsv', sep='\t')
#print(scores.groupby(['emotion', 'mapping']).mean())

# Save predictions (takes a while). Not really necessary, but maybe useful for
# follow-up analyses
preds = pd.concat(preds_all)
preds.to_csv('results/predictions_sim.tsv', sep='\t')

In [None]:
# Plot the config results
# Requires the `simulate_configs` line in the analysis cell to be the active one
# (it defines `list_configs`).
import matplotlib.pyplot as plt
import seaborn as sns

# `scores` holds one block of len(subs) * len(EMOTIONS) rows per mapping, stacked
# in the order the mappings were processed, so each entry of list_configs is
# repeated once per row of its mapping's block.
rows_per_mapping = len(subs) * len(EMOTIONS)
# The column name must match the `x=` argument of the plots below (it was
# previously created as 'n_config', which the plots could not find).
scores['n_configs'] = np.repeat(list_configs, rows_per_mapping)
plt.figure(figsize=(9,5))
sns.set_style("white", {'font.family':'serif', 'ytick.left': True})
f = sns.barplot(x='n_configs', y='score', data=scores)
sns.stripplot(x='n_configs', y='score', data=scores, color='black')

In [None]:
# Plot the aus results
import matplotlib.pyplot as plt
import seaborn as sns

# `list_aus` only exists when the `simulate_aus` line in the analysis cell is the
# active one. Skip with a message instead of raising NameError so that
# "Restart & Run All" completes when the config simulation was run instead.
if 'list_aus' not in globals():
    print("Skipping AU plot: 'list_aus' is not defined "
          "(enable the simulate_aus line and re-run the analysis cell).")
else:
    # `scores` holds one block of len(subs) * len(EMOTIONS) rows per mapping,
    # stacked in the order the mappings were processed.
    rows_per_mapping = len(subs) * len(EMOTIONS)
    scores['n_aus'] = np.repeat(list_aus, rows_per_mapping)
    plt.figure(figsize=(9,5))
    sns.set_style("white", {'font.family':'serif', 'ytick.left': True})
    f = sns.barplot(x='n_aus', y='score', data=scores)
    sns.stripplot(x='n_aus', y='score', data=scores, color='black')