## Figural: Supervised Learning

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import clip
import pandas as pd
import numpy as np
import itertools
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from scipy.stats import pearsonr
from tqdm.auto import tqdm
from figural.utils import autoset_device, load_data_and_gt, load_config, load_scorers, print_metrics, get_classification_metrics, get_regression_metrics

In [3]:
# Pick the compute device with the project helper, then load the CLIP
# ViT-B/32 model together with its matching image preprocessing transform.
device = autoset_device()
model, preprocess = clip.load("ViT-B/32", device=device)

  elif torch.has_mps:


CLIP doesn't work on M1 GPUs yet; check here for updates: https://github.com/openai/CLIP/issues/247


In [4]:
from pathlib import Path
# Project configuration (passed to the data loader and, later, the scorers).
meta = load_config('../../config.yaml')

# Alternative selection kept for reference:
#data = pd.concat([data[(data.test == 'ttct') & data.crop_bottom],
#                  data[(data.test == 'audra') & ~data.crop_bottom]])

# Load the metrics table, keep only rows that are not flagged `crop_bottom`,
# drop the columns this notebook does not use, and turn `path` into Path
# objects.
data = (
    load_data_and_gt(meta, results_path='../../data/metrics/all_data.csv')
    .loc[lambda frame: ~frame.crop_bottom]
    .drop(columns=['crop_bottom', 'contrast_factor', 'reg_err', 'img_path'])
    .assign(path=lambda frame: frame['path'].apply(Path))
)

# Non-null counts per test/task as a quick overview of what is available.
data.groupby(['test', 'task']).count()

Ground Truth size:  (22018, 17)


Unnamed: 0_level_0,Unnamed: 1_level_0,path,blank_dist,id,activity,avg_dist,zlist_least_dist,zlist_most_dist,zlist_mean_dist,zlist_3least_dist,elaboration_raw,...,E,R,C,name,pdf_path,error,B,participant,O_raw,testset
test,task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
audra,far,679,679,679,679,679,0,0,0,0,679,...,0,0,0,0,0,0,0,679,679,679
audra,general1,670,670,670,670,670,0,0,0,0,670,...,0,0,0,0,0,0,0,670,670,670
audra,primary,11075,11075,11075,11075,11075,0,0,0,0,11075,...,0,0,0,0,0,0,0,11075,11075,11075
ttct,bookleta,4935,4935,4935,4935,4935,4935,4935,4935,4935,4935,...,3630,3247,3564,3884,0,0,0,0,0,4935
ttct,bookletb,4730,4730,4730,4730,4730,4730,4730,4730,4730,4730,...,3044,2700,3027,3465,4620,0,513,0,0,4730


In [5]:
# Build the list of loaded tasks with the project's `load_scorers` helper.
# Later cells read the keys 'scorer', 'paths', 'test', 'task' and 'activity'
# from each entry.
# NOTE(review): presumably `load_features=True` reuses image features cached
# under `save_dir` instead of re-encoding the images — confirm in figural.utils.
loadedtasks = load_scorers(model, preprocess, meta,
                           load_features=True, include_cropped=False,
                           save_dir='../../data/img_features/')

ttct/bookleta:   0%|          | 0/11 [00:00<?, ?it/s]

ttct/bookletb:   0%|          | 0/11 [00:00<?, ?it/s]

audra/primary:   0%|          | 0/13 [00:00<?, ?it/s]

audra/general1:   0%|          | 0/7 [00:00<?, ?it/s]

audra/far:   0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
# Attach the matching ground-truth rows to every loaded task and, where the
# drawings have titles (`name`), add L2-normalised CLIP text features for
# those titles. Features are cached per test/task/activity under `save_dir`.
new_loadedtasks = []
save_dir = Path('../../data/txt_features/')
# parents=True so a fresh checkout without the intermediate folders still works
save_dir.mkdir(parents=True, exist_ok=True)

for loadedtask in tqdm(loadedtasks):
    scorer = loadedtask['scorer']
    assert len(loadedtask['paths']) == len(scorer.impaths)
    # get rows where data['path'] is in scorer.impaths, following the same order
    datasubset = (
        pd.DataFrame(scorer.impaths, columns=['path'])
        .merge(data, on='path', how='left')
        .drop_duplicates()
    )
    loadedtask['data'] = datasubset

    # when applicable, add text features for titles.
    # The mask is computed on the de-duplicated frame that is actually stored
    # in loadedtask['data'], so the text features stay row-aligned with it.
    hasName = ~datasubset['name'].isna()
    n_titles = int(hasName.sum())
    if n_titles == 0:
        loadedtask['txt_features'] = False
        continue

    save_location = save_dir / f'{loadedtask["test"]}_{loadedtask["task"]}_{loadedtask["activity"]}'
    if save_location.exists():
        # NOTE: torch.load unpickles the file; only load caches you created yourself.
        state_dict = torch.load(save_location)
        # Use the cache only if it still matches the current number of titles;
        # otherwise fall through and recompute (the stale file is overwritten).
        if len(state_dict['txt_features']) == n_titles:
            loadedtask['txt_features_data'] = state_dict['data']
            loadedtask['txt_features'] = state_dict['txt_features']
            continue

    text = clip.tokenize(datasubset[hasName].name.astype(str)).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text)
    # unit-normalise each embedding
    text_features /= text_features.norm(dim=-1, keepdim=True)
    # set the same keys as the cache-hit branch so both paths look identical
    loadedtask['txt_features_data'] = datasubset[hasName]
    loadedtask['txt_features'] = text_features.cpu().numpy()
    state_dict = dict(data=loadedtask['txt_features_data'], txt_features=loadedtask['txt_features'])
    torch.save(state_dict, save_location)

  0%|          | 0/49 [00:00<?, ?it/s]

Reorder loadedtasks (which are by activity) to a by_task listing.

In [7]:
# Show the pandas version this notebook was run with.
pd.__version__

'1.5.3'

In [8]:
by_task = {}
for l in loadedtasks:
    name = l['test'] + '_' + l['task']
    if name not in by_task:
        by_task[name] = {
            'name': name,
            'test': l['test'],
            'task': l['task'],
            'paths': l['paths'],
            'activities': [l['activity']],
            'scorers': [l['scorer']],
            'data': l['data']
        }
    else:
        by_task[name]['scorers'].append(l['scorer'])
        by_task[name]['paths'] += l['paths']
        by_task[name]['activities'].append(l['activity'])
        by_task[name]['data'] = pd.concat([by_task[name]['data'], l['data']])

    if l['txt_features'] is not False:
        if 'txt_features' not in by_task[name]:
            by_task[name]['txt_features'] = l['txt_features']
        else:
            by_task[name]['txt_features'] = np.vstack([by_task[name]['txt_features'], l['txt_features']])
    

## Training a different classifier for each activity

For binary targets (i.e. originality with two classes), treat the problem as classification. For all other targets, treat it as regression.

In [9]:
def select_data(task_def, targetvar, condition):
    ''' Select data and features for a given task, target variable, and condition

    Parameters
    ----------
    task_def : dict
        One entry of `by_task`. Reads 'data' (DataFrame with a 'name' column
        and the target column), 'scorers' (objects exposing
        `get_image_features()`), optionally 'txt_features' (one row per
        titled drawing) and optionally 'name' (used in messages only).
    targetvar : str
        Column of `task_def['data']` holding the target.
    condition : str
        'image', 'text' or 'image+txt'.

    Returns
    -------
    (datasubset, class_n, embeds), or (None, None, None) when the condition
    needs text features that are unavailable or the target has no values.
    `class_n` is the number of distinct non-null target values counted
    before any text filtering.

    Raises
    ------
    ValueError
        If `condition` is not one of the three supported values.
    '''
    datasubset = task_def['data']
    # Built from the arguments only, so the function does not depend on
    # notebook-level loop variables being defined.
    run_label = f"{task_def.get('name', '?')}/{targetvar}/{condition}"
    class_n = len(datasubset[targetvar].dropna().unique())

    image_features = np.vstack([scorer.get_image_features() for scorer in task_def['scorers']])
    assert len(image_features) == len(datasubset), f"Mismatch in dataframe {len(datasubset)} and image features {len(image_features)}"

    # rows that have a title; a plain boolean array is used so that masking
    # is positional and never subject to pandas index alignment
    hasText = (~datasubset['name'].isna()).to_numpy()

    text_features = None
    if 'txt_features' in task_def:
        text_features = task_def['txt_features']
        assert hasText.sum() == len(text_features), f"Text features expected to align with hasText count, but don't ({hasText.sum()}, {len(text_features)})"

    if condition in ['text', 'image+txt']:
        if not hasText.any() or text_features is None:
            print(f'no text features for {run_label}')
            return None, None, None
        # restrict both the rows and the image features to titled drawings
        datasubset = datasubset[hasText]
        image_features = image_features[hasText]

    if datasubset[targetvar].isna().all():
        print(f"No values for target variable '{targetvar}' ({run_label})")
        return None, None, None

    if condition == 'text':
        embeds = text_features
    elif condition == 'image':
        embeds = image_features
    elif condition == 'image+txt':
        embeds = np.hstack([image_features, text_features])
    else:
        raise ValueError(f"bad condition {condition!r}; expected 'text', 'image' or 'image+txt'")

    assert len(embeds) == len(datasubset), "Mismatch in embed size"

    return datasubset, class_n, embeds

# test that data loading works okay
# Dry run over every task/target/classifier/condition combination: only
# `select_data` is called, no model is trained here.
target_vars = ['O']
classifiers = ['rf']
conditions = ['text', 'image', 'image+txt']
activities = data.activity.unique().tolist() #["all"]
run_permutations = list(itertools.product(by_task.keys(), target_vars, classifiers, conditions))
pbar = tqdm(run_permutations)
# NOTE: the loop variable names matter — `select_data` as originally written
# reads `name` and `classifier` from the notebook namespace for its messages.
for name, targetvar, classifier, condition in pbar:
    pbar.set_description(f"{name}/{targetvar}/{classifier}/{condition}")
    task_def = by_task[name]
    datasubset, class_n, embeds = select_data(task_def, targetvar, condition)


  0%|          | 0/15 [00:00<?, ?it/s]

no text features for audra_primary/O/rf/text
no text features for audra_primary/O/rf/image+txt
no text features for audra_general1/O/rf/text
no text features for audra_general1/O/rf/image+txt
no text features for audra_far/O/rf/text
no text features for audra_far/O/rf/image+txt


In [34]:
# Random seed passed to every estimator created below.
seed = 12345

# One summary row (dict) per evaluated run; turned into a DataFrame afterwards.
result_collector = []
# Experiment grid: targets x estimators x feature conditions.
target_vars = ['O'] #+ ['R', 'E', 'T']
classifiers = ['rf', 'ada', 'xgboost']
conditions = ['text', 'image', 'image+txt']
# One model is trained per activity; adding 'all' would also train a single
# combined model per task.
activities = data.activity.unique().tolist() #+ ['all']

# Per-activity ground truth, predictions, activity labels and run metadata,
# kept as parallel lists for the per-item export.
all_ytrue = []
all_ypred = []
all_labels = []
all_meta = []

# TODO add elaboration to the model and see how much variance it explains
#include_elab = [True, False]

run_permutations = list(itertools.product(by_task.keys(), target_vars, classifiers, conditions))
# progress bar total: one tick per (permutation, activity) pair
total_progress = len(run_permutations)*len(activities)

def prep_row(test_y, y_pred, class_n, classifier_approach='individual'):
    """Build one result row for the current run.

    The run description (`task_def`, `targetvar`, `condition`, `classifier`)
    is read from the notebook namespace, so this must be called from inside
    the training loop. Two-class targets are scored with
    `get_classification_metrics`, everything else with
    `get_regression_metrics`; the resulting metrics are merged into the row.
    """
    row = {
        'test': task_def['test'],
        'task': task_def['task'],
        'targetvar': targetvar,
        'class_n': class_n,
        'condition': condition,
        'classifier': classifier,
        'approach': classifier_approach,
        'support': len(test_y),
    }

    score_fn = get_classification_metrics if class_n == 2 else get_regression_metrics
    row.update(score_fn(test_y, y_pred))
    return row

def get_data_per_activity(datasubset, embeds, activity, targetvar):
    """Split the rows of one activity (or of all activities) into train/test.

    Parameters
    ----------
    datasubset : DataFrame
        Needs the columns 'activity', 'testset' (boolean) and `targetvar`.
    embeds : array
        Feature matrix, row-aligned with `datasubset`.
    activity : str
        Activity to select, or "all" to use every activity.
    targetvar : str
        Target column; rows whose value is not numeric are ignored.

    Returns
    -------
    dict with 'train' ('data', 'labels') and 'test' ('data', 'labels',
    'activities'), or None when there is nothing to train on or nothing to
    evaluate. Returning None for an empty train or test side lets the caller
    skip the activity instead of failing inside `fit`/`predict`.
    """
    # rows with a usable (numeric) target value
    matches = pd.to_numeric(datasubset[targetvar], errors='coerce').notna().to_numpy()
    if activity != "all":
        matches = matches & (datasubset.activity == activity).to_numpy()

    # plain boolean arrays: masking is positional for both frame and features
    in_testset = datasubset.testset.to_numpy().astype(bool)
    train_mask = matches & ~in_testset
    test_mask = matches & in_testset

    if not train_mask.any() or not test_mask.any():
        return None

    return {
        'train': {
            'data': embeds[train_mask],
            'labels': datasubset.loc[train_mask, targetvar]
        },
        'test': {
            'data': embeds[test_mask],
            'labels': datasubset.loc[test_mask, targetvar].values,
            'activities': datasubset.loc[test_mask, 'activity'].values
        }
    }

def instantiate_classifier(classifier, class_n, seed):
    """Create an unfitted estimator for the given model key.

    Parameters
    ----------
    classifier : str
        One of 'rf', 'xgboost', 'ada', 'bagg'.
    class_n : int
        Number of distinct target values; exactly 2 selects the classifier
        variant, anything else the regressor variant.
    seed : int
        Passed as `random_state`.

    Raises
    ------
    ValueError
        For an unknown `classifier` key (previously this surfaced as an
        UnboundLocalError on the return statement).
    """
    supported = ('rf', 'xgboost', 'ada', 'bagg')
    if classifier not in supported:
        raise ValueError(f"Unknown classifier {classifier!r}; expected one of {supported}")

    is_binary = class_n == 2

    if classifier == 'rf':
        estimator = RandomForestClassifier if is_binary else RandomForestRegressor
        return estimator(n_estimators=300, random_state=seed)

    if classifier == 'xgboost':
        if is_binary:
            return XGBClassifier(n_estimators=300, learning_rate=0.2, objective='binary:logistic', random_state=seed)
        return XGBRegressor(n_estimators=300, learning_rate=0.2, objective='reg:squarederror', random_state=seed)

    if classifier == 'ada':
        estimator = AdaBoostClassifier if is_binary else AdaBoostRegressor
        return estimator(n_estimators=300, random_state=seed)

    # classifier == 'bagg'
    estimator = BaggingClassifier if is_binary else BaggingRegressor
    return estimator(n_estimators=300, random_state=seed)
    
# Train and evaluate one estimator per activity for every
# task/target/classifier/condition permutation. Per-activity predictions are
# pooled into a single 'individual' row per permutation; an 'all' activity
# (if present in `activities`) is reported on its own as 'combined'.
with tqdm(total=total_progress) as tbar:
    for name, targetvar, classifier, condition in run_permutations:
        task_def = by_task[name]
        ytrue_collector = []
        ypred_collector = []
        label_collector = []
        # Last estimator fitted for THIS permutation. Reset here so the
        # feature-importance report can never pick up a model left over from
        # a previous permutation.
        clf = None

        datasubset, class_n, embeds = select_data(task_def, targetvar, condition)
        if datasubset is None:
            tbar.update(len(activities))
            continue

        assert len(embeds) == len(datasubset), "Mismatch in input sizes"
        metric_kind = 'class' if class_n == 2 else 'regression'

        # For 'combined' classifier, I initially one-hot encoded the activities into a set of special features
        # I'm commenting this out, however, because the tasks already have the black pixels in the same place identifying the
        # activity, so I'm just using the image features as-is

        #enc = OneHotEncoder().fit(datasubset.activity.values.reshape(-1, 1))
        #cat_one_hot = enc.transform(datasubset.activity.values.reshape(-1, 1)).toarray()
        #embeds = np.hstack([cat_one_hot, embeds])
        train_examples = 0
        for activity in activities:
            tbar.set_description(f'{name}/{targetvar}/{classifier}/{condition}/{activity}/{"classifier" if class_n == 2 else "regressor"}')

            gt = get_data_per_activity(datasubset, embeds, activity, targetvar)
            if gt is None:
                tbar.update()
                continue
            activity_train_count = len(gt['train']['labels'])

            clf = instantiate_classifier(classifier, class_n, seed)
            clf.fit(gt['train']['data'], gt['train']['labels'])
            y_pred = clf.predict(gt['test']['data'])

            if activity != 'all':
                ypred_collector.append(y_pred)
                ytrue_collector.append(gt['test']['labels'])
                label_collector.append(gt['test']['activities'].tolist())
                train_examples += activity_train_count
            else:
                # save these results alone, not aggregated
                row = prep_row(gt['test']['labels'], y_pred, class_n, classifier_approach='combined')
                row['train_count'] = activity_train_count
                result_collector.append(row)
                print(name, targetvar, classifier, condition, 'combined\n-----')
                # report against this run's own ground truth (the original
                # passed `test_y`, which is undefined or stale at this point)
                print_metrics(gt['test']['labels'], y_pred, metric_kind)
                print('\n\n')
            tbar.update()

        # combine activities (except all, which was already added to result_collector)
        if len(ytrue_collector):
            test_y = np.hstack(ytrue_collector)
            y_pred = np.hstack(ypred_collector)
            row = prep_row(test_y, y_pred, class_n, classifier_approach='individual')
            row['train_count'] = train_examples
            result_collector.append(row)
            # printed only when something was evaluated; otherwise the metrics
            # shown would belong to an earlier permutation
            print(name, targetvar, classifier, condition, 'individual\n-----')
            print_metrics(test_y, y_pred, metric_kind)
            print('\n\n')

        if condition == 'image+txt' and clf is not None and hasattr(clf, 'feature_importances_'):
            print("Relative importance of features")
            # Determine the relative importance of each set of features.
            # NOTE: `clf` is the estimator of the last activity that was
            # fitted, so this reflects that one activity only.
            # Block sizes are derived from the data: `select_data` stacks
            # image features first, then text features.
            n_txt = task_def['txt_features'].shape[1]
            n_img = embeds.shape[1] - n_txt
            featnames = ['img'] * n_img + ['txt'] * n_txt
            x = pd.DataFrame(zip(featnames, clf.feature_importances_), columns=['label', 'importance'])
            display(x.groupby('label').aggregate(['mean', 'sum']))

        all_ytrue += ytrue_collector
        all_ypred += ypred_collector
        all_labels += label_collector
        # one independent metadata dict per collected activity
        all_meta += [
            dict(test=name, targetvar=targetvar, classifier=classifier, condition=condition)
            for _ in ytrue_collector
        ]

# Summary table: one row per evaluated run.
results = pd.DataFrame(result_collector)
# `mse` only exists when at least one regression run was evaluated
if 'mse' in results.columns:
    results['rmse'] = results.mse.apply(np.sqrt)
first_cols = ['test', 'task', 'targetvar', 'condition', 'classifier', 'approach']
col_order = first_cols + [col for col in results.columns if col not in first_cols]
results = results[col_order].sort_values(first_cols)

# keep dataframe from truncating display horizontally
pd.set_option('display.max_columns', None)

# add a date string to filename
from datetime import datetime
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d-%H-%M-%S")
results.round(4).to_csv(f'../../data/supervised_results_{dt_string}.csv', index=False)

# save all_ytrue, all_ypred, all_labels, all_meta to a csv
item_columns = ['item', 'truth', 'prediction']
df_collector = []
for i, row in pd.DataFrame(all_meta).iterrows():
    # Column names follow the zip order: labels, ground truth, prediction.
    # (The original labelled the ground truth 'prediction' and vice versa.)
    df = pd.DataFrame(list(zip(all_labels[i], all_ytrue[i], all_ypred[i])), columns=item_columns)
    df['test'] = row['test']
    df['targetvar'] = row['targetvar']
    df['classifier'] = row['classifier']
    df['condition'] = row['condition']
    df_collector.append(df)
# pd.concat raises on an empty list; fall back to an empty frame with the same columns
if df_collector:
    all_results = pd.concat(df_collector)
else:
    all_results = pd.DataFrame(columns=item_columns + ['test', 'targetvar', 'classifier', 'condition'])
all_results.to_csv(f'../../data/supervised_results_all_{dt_string}.csv', index=False)

results.round(2).fillna('')

  0%|          | 0/1215 [00:00<?, ?it/s]

ttct_bookleta O rf text individual
-----
Accuracy: 0.78
Precision: 0.73
Recall: 0.93
F1: 0.82
Pearson_r: 0.57
Roc_auc: 0.77
Mcc: 0.57
Support: 303



ttct_bookleta O rf image individual
-----
Accuracy: 0.79
Precision: 0.75
Recall: 0.92
F1: 0.83
Pearson_r: 0.58
Roc_auc: 0.77
Mcc: 0.58
Support: 329



ttct_bookleta O rf image+txt individual
-----
Accuracy: 0.81
Precision: 0.75
Recall: 0.94
F1: 0.84
Pearson_r: 0.62
Roc_auc: 0.79
Mcc: 0.62
Support: 303



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000853,0.436518
txt,0.001101,0.563482


ttct_bookleta O ada text individual
-----
Accuracy: 0.76
Precision: 0.74
Recall: 0.85
F1: 0.79
Pearson_r: 0.51
Roc_auc: 0.75
Mcc: 0.51
Support: 303



ttct_bookleta O ada image individual
-----
Accuracy: 0.80
Precision: 0.78
Recall: 0.88
F1: 0.83
Pearson_r: 0.59
Roc_auc: 0.79
Mcc: 0.59
Support: 329



ttct_bookleta O ada image+txt individual
-----
Accuracy: 0.84
Precision: 0.81
Recall: 0.91
F1: 0.86
Pearson_r: 0.68
Roc_auc: 0.83
Mcc: 0.68
Support: 303



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000716,0.366667
txt,0.001237,0.633333


ttct_bookleta O xgboost text individual
-----
Accuracy: 0.77
Precision: 0.74
Recall: 0.88
F1: 0.81
Pearson_r: 0.55
Roc_auc: 0.76
Mcc: 0.55
Support: 303



ttct_bookleta O xgboost image individual
-----
Accuracy: 0.78
Precision: 0.76
Recall: 0.88
F1: 0.82
Pearson_r: 0.56
Roc_auc: 0.77
Mcc: 0.56
Support: 329



ttct_bookleta O xgboost image+txt individual
-----
Accuracy: 0.82
Precision: 0.77
Recall: 0.94
F1: 0.85
Pearson_r: 0.65
Roc_auc: 0.81
Mcc: 0.65
Support: 303



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000812,0.415834
txt,0.001141,0.584166


ttct_bookletb O rf text individual
-----
Accuracy: 0.85
Precision: 0.85
Recall: 0.95
F1: 0.90
Pearson_r: 0.64
Roc_auc: 0.79
Mcc: 0.64
Support: 299



ttct_bookletb O rf image individual
-----
Accuracy: 0.82
Precision: 0.82
Recall: 0.94
F1: 0.88
Pearson_r: 0.56
Roc_auc: 0.75
Mcc: 0.56
Support: 318



ttct_bookletb O rf image+txt individual
-----
Accuracy: 0.87
Precision: 0.87
Recall: 0.96
F1: 0.91
Pearson_r: 0.68
Roc_auc: 0.81
Mcc: 0.68
Support: 299



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000815,0.417191
txt,0.001138,0.582809


ttct_bookletb O ada text individual
-----
Accuracy: 0.83
Precision: 0.88
Recall: 0.87
F1: 0.87
Pearson_r: 0.59
Roc_auc: 0.80
Mcc: 0.59
Support: 299



ttct_bookletb O ada image individual
-----
Accuracy: 0.76
Precision: 0.82
Recall: 0.85
F1: 0.83
Pearson_r: 0.44
Roc_auc: 0.71
Mcc: 0.44
Support: 318



ttct_bookletb O ada image+txt individual
-----
Accuracy: 0.85
Precision: 0.88
Recall: 0.90
F1: 0.89
Pearson_r: 0.63
Roc_auc: 0.81
Mcc: 0.63
Support: 299



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000638,0.326667
txt,0.001315,0.673333


ttct_bookletb O xgboost text individual
-----
Accuracy: 0.86
Precision: 0.87
Recall: 0.93
F1: 0.90
Pearson_r: 0.65
Roc_auc: 0.81
Mcc: 0.65
Support: 299



ttct_bookletb O xgboost image individual
-----
Accuracy: 0.78
Precision: 0.81
Recall: 0.89
F1: 0.85
Pearson_r: 0.45
Roc_auc: 0.71
Mcc: 0.45
Support: 318



ttct_bookletb O xgboost image+txt individual
-----
Accuracy: 0.87
Precision: 0.87
Recall: 0.95
F1: 0.91
Pearson_r: 0.68
Roc_auc: 0.81
Mcc: 0.68
Support: 299



Relative importance of features


Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
img,0.000867,0.443786
txt,0.001086,0.556214


no text features for audra_primary/O/rf/text
audra_primary O rf image individual
-----
Mse: 0.01
Rmse: 0.09
R2: 0.61
Pearson_r: 0.78
R_pval: 0.00
Support: 1150



no text features for audra_primary/O/rf/image+txt
no text features for audra_primary/O/ada/text
audra_primary O ada image individual
-----
Mse: 0.01
Rmse: 0.09
R2: 0.62
Pearson_r: 0.80
R_pval: 0.00
Support: 1150



no text features for audra_primary/O/ada/image+txt
no text features for audra_primary/O/xgboost/text
audra_primary O xgboost image individual
-----
Mse: 0.01
Rmse: 0.10
R2: 0.58
Pearson_r: 0.76
R_pval: 0.00
Support: 1150



no text features for audra_primary/O/xgboost/image+txt
no text features for audra_general1/O/rf/text
audra_general1 O rf image individual
-----
Mse: 0.02
Rmse: 0.15
R2: 0.40
Pearson_r: 0.64
R_pval: 0.00
Support: 68



no text features for audra_general1/O/rf/image+txt
no text features for audra_general1/O/ada/text
audra_general1 O ada image individual
-----
Mse: 0.02
Rmse: 0.15
R2: 0.43
Pearson_

AttributeError: 'list' object has no attribute 'iterrows'

## Analysis

In [42]:
data_dir = Path('../../data/')
# File names carry a sortable timestamp, so the lexicographically last file is
# the newest. The '2*' in the pattern keeps the per-item
# 'supervised_results_all_*' exports out of the match.
result_files = sorted(data_dir.glob('supervised_results_2*.csv'))
if not result_files:
    raise FileNotFoundError(f"No 'supervised_results_2*.csv' file found in {data_dir}; run the training cell first")
newest_result = result_files[-1]
print(newest_result)
excel_path = newest_result.with_suffix('.xlsx')
results = pd.read_csv(newest_result)

# Tidying
results['rmse'] = results['mse'].apply(np.sqrt)
firstcols = ['task', 'condition', 'classifier', 'train_count', 'support']
rename_cols = {'support': 'test_count', 'r2':'R2', 'f1': 'F1', 'pearson_r': 'r'}
cols = firstcols + [col for col in results.columns if col not in firstcols]
results = results[cols].rename(columns=rename_cols).round(2)
# fixed seed and bounded size: the preview is the same on every run and does
# not fail when fewer than two rows exist
results.fillna('').sample(min(2, len(results)), random_state=0)

../../data/supervised_results_2023-11-21-14-43-26.csv


Unnamed: 0,task,condition,classifier,train_count,test_count,test,targetvar,approach,class_n,accuracy,precision,recall,F1,r,roc_auc,mcc,mse,rmse,R2,r_pval
2,far,image,xgboost,614,65,audra,O,individual,88,,,,,0.49,,,0.04,0.21,0.21,0.0
8,primary,image,xgboost,9925,1150,audra,O,individual,7762,,,,,0.76,,,0.01,0.09,0.58,0.0


In [47]:
# Collect DataFrames keyed by sheet name; they are written to a single Excel
# file further down. Re-running this cell discards everything collected so far.
sheet_collector = {}

In [48]:
# Queue the full table plus one trimmed table per test for the Excel export.
from pandas import ExcelWriter

sheet_collector['all_supervised'] = results

bookkeeping_cols = ['test', 'targetvar', 'approach', 'class_n']
for test in ('ttct', 'audra'):
    banner = test.center(80, '=').upper()
    print(banner)

    # rows of this test, without any column that has a missing value there
    subset = results.loc[results.test == test].dropna(axis=1)
    subset = subset.drop(columns=bookkeeping_cols).round(4)
    if test == 'audra':
        # only looking at the primary task
        subset = subset.loc[subset.task == 'primary']

    sheet_collector[f'{test}_results'] = subset
    display(subset.fillna(''))



Unnamed: 0,task,condition,classifier,train_count,test_count,accuracy,precision,recall,F1,r,roc_auc,mcc
9,bookleta,image,ada,3177,329,0.8,0.78,0.88,0.83,0.59,0.79,0.59
10,bookleta,image,rf,3177,329,0.79,0.75,0.92,0.83,0.58,0.77,0.58
11,bookleta,image,xgboost,3177,329,0.78,0.76,0.88,0.82,0.56,0.77,0.56
12,bookleta,image+txt,ada,2953,303,0.84,0.81,0.91,0.86,0.68,0.83,0.68
13,bookleta,image+txt,rf,2953,303,0.8,0.76,0.94,0.84,0.62,0.79,0.62
14,bookleta,image+txt,xgboost,2953,303,0.82,0.77,0.94,0.85,0.65,0.81,0.65
15,bookleta,text,ada,2953,303,0.76,0.74,0.85,0.79,0.51,0.75,0.51
16,bookleta,text,rf,2953,303,0.78,0.73,0.93,0.82,0.57,0.77,0.57
17,bookleta,text,xgboost,2953,303,0.77,0.74,0.88,0.81,0.55,0.76,0.55
18,bookletb,image,ada,2722,318,0.76,0.82,0.85,0.83,0.44,0.71,0.44




Unnamed: 0,task,condition,classifier,train_count,test_count,r,mse,rmse,R2,r_pval
6,primary,image,ada,9925,1150,0.8,0.01,0.09,0.62,0.0
7,primary,image,rf,9925,1150,0.78,0.01,0.09,0.61,0.0
8,primary,image,xgboost,9925,1150,0.76,0.01,0.09,0.58,0.0


### All Data

Classification performance on originality:

In [49]:
# All TTCT results (for appendix)
ttct_tasks = ['bookleta', 'bookletb']
# ROC AUC and MCC are left out for succinctness
appendix_cols = ['task', 'condition', 'classifier', 'train_count', 'test_count',
                 'accuracy', 'precision', 'recall', 'F1', 'r']
results.loc[results.task.isin(ttct_tasks), appendix_cols]

Unnamed: 0,task,condition,classifier,train_count,test_count,accuracy,precision,recall,F1,r
9,bookleta,image,ada,3177,329,0.8,0.78,0.88,0.83,0.59
10,bookleta,image,rf,3177,329,0.79,0.75,0.92,0.83,0.58
11,bookleta,image,xgboost,3177,329,0.78,0.76,0.88,0.82,0.56
12,bookleta,image+txt,ada,2953,303,0.84,0.81,0.91,0.86,0.68
13,bookleta,image+txt,rf,2953,303,0.8,0.76,0.94,0.84,0.62
14,bookleta,image+txt,xgboost,2953,303,0.82,0.77,0.94,0.85,0.65
15,bookleta,text,ada,2953,303,0.76,0.74,0.85,0.79,0.51
16,bookleta,text,rf,2953,303,0.78,0.73,0.93,0.82,0.57
17,bookleta,text,xgboost,2953,303,0.77,0.74,0.88,0.81,0.55
18,bookletb,image,ada,2722,318,0.76,0.82,0.85,0.83,0.44


In [26]:
# Pivot the originality results so that the conditions sit side by side:
# one table per classification metric, and one table with the regression metrics.
pivot_index = ['test', 'task', 'classifier']

print("Classification results")
originality = results.drop(columns=['approach']).query('targetvar == "O"')
binary_runs = originality.query('class_n == 2')
for value in ['F1', 'accuracy', 'r', 'precision', 'recall', 'roc_auc']:
    o_class = binary_runs.pivot(index=pivot_index, columns=['condition'], values=[value]).round(2)
    sheet_collector[f'study1_{value}'] = o_class
    display(o_class)

print("Regression Results")
regression_runs = originality.query('class_n != 2')
o_reg = regression_runs.pivot(index=pivot_index, columns=['condition'], values=['R2', 'rmse', 'r']).round(2)
sheet_collector['study2_metrics'] = o_reg
display(o_reg)

Classification results


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1,F1,F1
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.83,0.86,0.79
ttct,bookleta,rf,0.83,0.84,0.82
ttct,bookleta,xgboost,0.84,0.86,0.81
ttct,bookletb,ada,0.83,0.89,0.88
ttct,bookletb,rf,0.88,0.91,0.9
ttct,bookletb,xgboost,0.86,0.88,0.89


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,accuracy,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.8,0.84,0.76
ttct,bookleta,rf,0.79,0.8,0.78
ttct,bookleta,xgboost,0.8,0.83,0.78
ttct,bookletb,ada,0.76,0.85,0.83
ttct,bookletb,rf,0.82,0.87,0.85
ttct,bookletb,xgboost,0.79,0.83,0.84


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,r,r,r
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.59,0.68,0.52
ttct,bookleta,rf,0.58,0.62,0.57
ttct,bookleta,xgboost,0.6,0.67,0.56
ttct,bookletb,ada,0.44,0.63,0.6
ttct,bookletb,rf,0.56,0.68,0.64
ttct,bookletb,xgboost,0.49,0.58,0.61


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,precision,precision
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.78,0.81,0.74
ttct,bookleta,rf,0.75,0.76,0.73
ttct,bookleta,xgboost,0.78,0.79,0.75
ttct,bookletb,ada,0.82,0.88,0.88
ttct,bookletb,rf,0.82,0.87,0.85
ttct,bookletb,xgboost,0.82,0.85,0.87


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,recall,recall,recall
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.88,0.91,0.86
ttct,bookleta,rf,0.92,0.94,0.93
ttct,bookleta,xgboost,0.91,0.94,0.89
ttct,bookletb,ada,0.85,0.9,0.87
ttct,bookletb,rf,0.94,0.96,0.95
ttct,bookletb,xgboost,0.9,0.91,0.91


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ttct,bookleta,ada,0.79,0.83,0.75
ttct,bookleta,rf,0.77,0.79,0.77
ttct,bookleta,xgboost,0.79,0.82,0.77
ttct,bookletb,ada,0.71,0.81,0.8
ttct,bookletb,rf,0.75,0.81,0.79
ttct,bookletb,xgboost,0.73,0.77,0.79


Regression Results


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,R2,rmse,r
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image,image
test,task,classifier,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
audra,far,ada,0.42,0.68,0.65
audra,far,rf,0.34,0.73,0.59
audra,far,xgboost,0.2,0.8,0.49
audra,general1,ada,0.44,0.65,0.66
audra,general1,rf,0.4,0.67,0.64
audra,general1,xgboost,0.34,0.71,0.61
audra,primary,ada,0.62,0.56,0.8
audra,primary,rf,0.61,0.57,0.79
audra,primary,xgboost,0.59,0.58,0.77


Notes:

- Whereas `far` and `general1` were much better than ttct on unsupervised, here they're comparable
    - Why??? To check - distribution - check entropy of Audra tasks
    - Check sample size effect

In [18]:
# Save Excel Results: one sheet per collected DataFrame, index included
with ExcelWriter(excel_path) as writer:
    for sheet_name in sheet_collector:
        sheet_collector[sheet_name].to_excel(writer, sheet_name=sheet_name, index=True)

Simplified view, since the 'approach' is really overkill for measuring and reporting.

In [52]:
# Regression metrics for every target other than originality, per classifier,
# with the conditions side by side.
# The tidying step renames the `r2` column to `R2`, so the pivot has to ask
# for 'R2' (asking for 'r2' fails with a KeyError on the tidied frame).
x = results.query('targetvar != "O"').query("approach == 'individual'").pivot(index=['targetvar', 'classifier'], columns=['condition'], values=['rmse', 'R2'])
display(x.round(2))

targetvar,classifier


In [None]:
# Same view restricted to the random-forest runs.
# The tidying step renames the `r2` column to `R2`, so the pivot has to ask
# for 'R2' (asking for 'r2' fails with a KeyError on the tidied frame).
x = results.query('targetvar != "O"').query("approach == 'individual'").query('classifier=="rf"').pivot(index=['targetvar'], columns=['condition'], values=['rmse', 'R2'])
display(x.round(2))

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
targetvar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
E,3.36,3.38,4.14,0.54,0.53,0.3
R,0.76,0.76,0.83,0.17,0.17,0.0
T,0.98,0.9,0.9,0.19,0.31,0.32


### Narrowing down conditions

Comparing regression conditions by observing average performance across classifiers

In [None]:
# Average each metric per target variable.
# NOTE(review): the cell directly above redefines `x` restricted to
# classifier == "rf" with `targetvar` as the only index level, so on a fresh
# top-to-bottom run this mean is over a single row per target, not an average
# across classifiers. The displayed numbers presumably come from the earlier
# `x` indexed by (targetvar, classifier) — confirm which `x` is intended.
x.groupby('targetvar').mean().round(2)

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
targetvar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
E,3.8,3.76,4.42,0.39,0.4,0.2
R,0.78,0.78,0.85,0.12,0.12,-0.05
T,1.01,0.9,0.92,0.14,0.31,0.28


Basically: an image-only model is all that's needed for elaboration and resistance to premature closure, and a text-only model is all that's needed for abstractness of titles. These aren't particularly surprising, but good to confirm, and interesting to see the slight $R^2$ for text on E and image on T.

---

Q: Which style of regressor/classifier is better - individual ones per task, a single classifier with one hot, or are they similar? I would expect the first or last case.

In [None]:
# NOTE(review): `o` is not defined in any visible cell of this notebook, so
# this fails on Restart & Run All. The displayed output suggests a pivot of
# f1/accuracy by condition with `approach` in the index — TODO restore that
# definition or remove this cell.
o.groupby('approach').mean().round(2)

Unnamed: 0_level_0,f1,f1,f1,accuracy,accuracy,accuracy
condition,image,image+txt,text,image,image+txt,text
approach,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
combined,0.83,0.83,0.79,0.78,0.78,0.72
individual,0.83,0.85,0.83,0.79,0.81,0.77


In [None]:
# NOTE(review): neither visible definition of `x` keeps `approach` in its
# index (both filter to approach == 'individual' before pivoting), so this
# groupby has no 'approach' key to group on. The output presumably came from
# an earlier version of `x` — TODO restore that definition or remove this cell.
x.groupby('approach').mean().round(2)

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
approach,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
combined,1.94,1.88,2.07,0.19,0.26,0.14
individual,1.79,1.75,2.05,0.24,0.3,0.15


A: Individual (per-activity) classifiers/regressors typically have a slight edge over a single combined model. The gap is smaller for classification than for regression.

 ---

 Q: What classifier works best?



In [None]:
# Average the headline metrics per classifier. The metric columns are selected
# BEFORE aggregating: `results` also holds text columns (task, condition, ...),
# and taking the mean over those is deprecated in pandas 1.5 and raises in
# pandas 2.x. The resulting table is unchanged.
results.groupby('classifier')[['F1', 'accuracy', 'rmse', 'r']].mean().round(2)

Unnamed: 0_level_0,F1,accuracy,rmse,r
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ada,0.82,0.79,0.63,0.62
rf,0.84,0.81,0.66,0.64
xgboost,0.83,0.8,0.7,0.61


A: RF.

-----

Q: What's the `RF/individual` performance for originality?

In [None]:
# Random-forest runs of the per-activity ('individual') approach, with the
# columns that are constant in this view removed and blanks for missing metrics.
is_rf_individual = (results.classifier == 'rf') & (results.approach == 'individual')
constant_cols = ['approach', 'classifier', 'class_n', 'targetvar']
results.loc[is_rf_individual].drop(columns=constant_cols).fillna('')

Unnamed: 0,task,condition,train_count,test_count,test,accuracy,precision,recall,F1,r,roc_auc,mcc,mse,mae,R2,evs,rmse
1,far,image,614,65,audra,,,,,0.59,,,0.53,0.6,0.34,0.34,0.73
4,general1,image,602,68,audra,,,,,0.64,,,0.45,0.52,0.4,0.4,0.67
7,primary,image,9925,1150,audra,,,,,0.79,,,0.33,0.43,0.61,0.61,0.57
10,bookleta,image,3177,329,ttct,0.79,0.75,0.92,0.83,0.58,0.77,0.58,,,,,
13,bookleta,image+txt,2953,303,ttct,0.8,0.76,0.94,0.84,0.62,0.79,0.62,,,,,
16,bookleta,text,2953,303,ttct,0.78,0.73,0.93,0.82,0.57,0.77,0.57,,,,,
19,bookletb,image,934,115,ttct,0.82,0.81,0.89,0.85,0.63,0.81,0.63,,,,,
22,bookletb,image+txt,867,104,ttct,0.86,0.83,0.93,0.88,0.71,0.84,0.71,,,,,
25,bookletb,text,867,104,ttct,0.81,0.81,0.86,0.84,0.61,0.8,0.61,,,,,


In [None]:
# NOTE(review): `o` is not defined in any visible cell of this notebook; the
# lookup assumes an index of (classifier, approach) — TODO restore the
# definition of `o` or remove this cell.
o.loc[('rf', 'individual')]

          condition
f1        image        0.844262
          image+txt    0.877953
          text         0.850485
accuracy  image        0.801567
          image+txt     0.83812
          text         0.798956
Name: (rf, individual), dtype: object

### Errata

In [None]:
#import statsmodels.formula.api as smf
#stats = smf.ols('rmse ~ condition + approach + condition', data=results.query('targetvar=="E"')).fit()
#print(stats.summary())

- O - Originality
- R - Resistance to Premature Closure
- E - Elaboration
- T - Abstractness of Titles
- F - Fluency (doesn't concern us because we're looking by prompt)
- C - Creativity Index (an additional metric)