## Figural: Supervised Learning

In [141]:
import torch
import clip
import pandas as pd
import numpy as np
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, RandomForestRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from tqdm.auto import tqdm
from figural.scoring import autoset_device, FiguralImage, preprocess_imlist, image_loader, collage, get_avg_sims, similarity_to_target

In [2]:
device = autoset_device()
model, preprocess = clip.load("ViT-B/32", device=device)

CLIP doesn't work on M1 GPUs yet; check here for updates: https://github.com/openai/CLIP/issues/247


In [109]:
from pathlib import Path
data_dir = Path('../../data')
seed = 1234
rng = np.random.default_rng(seed=seed)
data = pd.read_csv(data_dir / 'figural_ground_truth.csv', index_col=0)
data = data[['img_path', 'booklet', 'id', 'pdf_path', 'titlepage', 'F', 'O', 'T', 'E', 'R', 'C', 'Name']]
# 10% for test set
test_prop = 0.1
data['img_path'] = data.img_path.apply(lambda x: Path(x))
data['testset'] = (rng.random(size=len(data), seed=seed) < test_prop)
data['activity'] = data.img_path.apply(lambda x: x.parent.stem)
for measure_data in ['sims_to_blank.parquet', 'avg_sims.parquet', 'elaboration.parquet', 'zlist_sims_sketch_of.parquet']:
    x = pd.read_parquet(data_dir / measure_data)
    x = x.drop(columns=[y for y in ['path', 'cropped', 'contrast'] if y in x.columns])
    data = data.merge(x)
# remove some data errors
data.loc[data['F'] > 1, 'F'] = np.NaN
data.loc[data['T'] > 3, 'T'] = np.NaN
data.loc[data['R'] > 2, 'R'] = np.NaN
data.sample(1)

Unnamed: 0,img_path,booklet,id,pdf_path,titlepage,F,O,T,E,R,...,testset,activity,blank_sim,blank_sim_uncropped,avg_sim,avg_sim_uncropped,elaboration_raw,min_zlist,mean_zlist,lowest3_zlist
4693,../data/outputs/test1/activity2j/8e820-65849.jpg,BOOKLETA,8e820-65849,../data/ttct_figural/SS2012 Gumbs Am Post Asse...,3,,,,,,...,False,activity2j,0.97029,0.86569,0.860159,0.830236,0.019324,0.215478,0.258917,0.224041


In [33]:
img_paths = data_dir / 'outputs/bookleta' / data.activity / data.img_path.apply(lambda x: x.name)
imloader = image_loader(img_paths, contrast_factor=4, crop_bottom=True)
image_inputs = preprocess_imlist(imloader, preprocess, device=device)
# todo display example image in code to ensure it's clear whether cropped or uncropped

4935it [00:09, 532.57it/s]


In [34]:
text = clip.tokenize(data.Name.astype(str)).to(device)

with torch.no_grad():
    image_features = model.encode_image(image_inputs)
    text_features = model.encode_text(text)
 # normalize tensors
image_features /= image_features.norm(dim=1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

## Training a single classifier for all activites using one-hot

Using a Random Forest classifier

In [46]:
seed = 1234567

In [47]:
# using one-hot
enc = OneHotEncoder().fit(data.activity.values.reshape(-1, 1))
enc.get_feature_names_out()
cat_one_hot = enc.transform(data.activity.values.reshape(-1, 1)).toarray()

for condition in ['text', 'image', 'image+txt']:
    print(condition.upper().center(80, '='))
    if condition == 'text':
        embeds = text_features
    elif condition == 'image':
        embeds = image_features
    elif condition == 'image+txt':
        embeds = np.hstack([image_features, text_features])
    else:
        raise Exception('bad condition')

    embeds = np.hstack([cat_one_hot, embeds])

    matches = (~data['O'].isna())
    train_X = embeds[(matches & ~data.testset)]
    train_y = data.loc[(matches & ~data.testset), 'O']
    test_X = embeds[(matches & data.testset)]
    test_y = data.loc[(matches & data.testset), 'O']
    print("train n:",train_X.shape[0])

    clf = RandomForestClassifier(n_estimators = 300, random_state=seed)
    clf.fit(train_X, train_y)
    y_pred = clf.predict(test_X)
    print(metrics.classification_report(test_y, y_pred, target_names=['Not Original', 'Original']))

train n: 3141
              precision    recall  f1-score   support

Not Original       0.77      0.46      0.58       120
    Original       0.78      0.93      0.85       245

    accuracy                           0.78       365
   macro avg       0.78      0.70      0.71       365
weighted avg       0.78      0.78      0.76       365

train n: 3141
              precision    recall  f1-score   support

Not Original       0.88      0.58      0.70       120
    Original       0.82      0.96      0.89       245

    accuracy                           0.84       365
   macro avg       0.85      0.77      0.79       365
weighted avg       0.84      0.84      0.83       365

train n: 3141
              precision    recall  f1-score   support

Not Original       0.86      0.58      0.70       120
    Original       0.82      0.96      0.88       245

    accuracy                           0.83       365
   macro avg       0.84      0.77      0.79       365
weighted avg       0.84      0.8

In [161]:
featnames = ['catlabel'] * enc.get_feature_names_out().shape[0] + ['img'] * 512 + ['txt'] * 512
x = pd.DataFrame(zip(featnames, clf.feature_importances_), columns=['label', 'importance'])
x.groupby('label').aggregate(['mean', 'sum'])

Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
catlabel,0.000374,0.004118
img,0.001145,0.586238
txt,0.0008,0.409643


## Training a different classifier for each activity

For binary (i.e. originality), treat as classification. For others, treat as regressor.

In [272]:
result_collector = []
target_vars = ['O', 'R', 'E', 'T']
classifiers = ['rf', 'ada', 'xgboost']
conditions = ['text', 'image', 'image+txt']
activities = ["all"] + data.activity.unique().tolist()

total_progress = len(target_vars)*len(classifiers)*len(conditions)*len(activities)

# For 'combined' classifier, one-hot encode the activities
enc = OneHotEncoder().fit(data.activity.values.reshape(-1, 1))
enc.get_feature_names_out()
cat_one_hot = enc.transform(data.activity.values.reshape(-1, 1)).toarray()

def prep_row(test_y, y_pred, classifier_approach='individual'):
    row = dict(
        targetvar=targetvar, class_n=class_n, condition=condition, classifier=classifier,
        approach=classifier_approach, support=len(test_y),
        accuracy = None, rmse=None, f1=None, f1_micro=None, r2=None
    )
    if class_n == 2:
        row['accuracy'] = metrics.accuracy_score(test_y, y_pred),
        row['f1'] = metrics.f1_score(test_y, y_pred, average='binary')
    else:
        row['rmse'] = metrics.mean_squared_error(test_y, y_pred, squared=False)
        row['r2'] = metrics.r2_score(test_y, y_pred)
    return row

with tqdm(total=total_progress) as tbar:
    for targetvar in target_vars:
        class_n = len(data[targetvar].dropna().unique())
        for classifier in classifiers:
            for condition in conditions:
                ytrue_collector = []
                ypred_collector = []
                label_collector = []

                if condition == 'text':
                    embeds = text_features
                elif condition == 'image':
                    embeds = image_features
                elif condition == 'image+txt':
                    embeds = np.hstack([image_features, text_features])
                else:
                    raise Exception('bad condition')

                for activity in activities:
                    tbar.set_description(f'{targetvar}/{classifier}/{condition}/{activity}')

                    if activity != "all":
                        matches = (data.activity == activity) & (~pd.to_numeric(data[targetvar], errors='coerce').isna())
                    else:
                        matches = (~pd.to_numeric(data[targetvar], errors='coerce').isna())
                        embeds = np.hstack([cat_one_hot, embeds])
                    if matches.sum() == 0:
                        tbar.update()
                        continue # not an activity with tests scored?
                    train_X = embeds[(matches & ~data.testset)]
                    train_y = data.loc[(matches & ~data.testset), targetvar] #.astype(bool)
                    test_X = embeds[(matches & data.testset)]
                    test_y = data.loc[(matches & data.testset), targetvar].values #.astype(bool)

                    if classifier == 'rf':
                        if class_n == 2:
                            clf = RandomForestClassifier(n_estimators = 300, random_state=seed)
                        else:
                            clf = RandomForestRegressor(n_estimators = 300, random_state=seed)
                    elif classifier == 'xgboost':
                        if class_n == 2:
                            objective='binary:logistic'
                            clf = XGBClassifier(n_estimators=300, learning_rate=0.2, objective=objective, random_state=seed)
                        else:
                            objective='reg:squarederror'
                            clf = XGBRegressor(n_estimators=300, learning_rate=0.2, objective=objective, random_state=seed)
                        
                    elif classifier == 'ada':
                        if class_n == 2:
                            clf = AdaBoostClassifier(n_estimators=300, random_state=seed)
                        else:
                            clf = AdaBoostRegressor(n_estimators=300, random_state=seed)
                    elif classifier == 'bagg':
                        if class_n == 2:
                            clf = BaggingClassifier(n_estimators=300, random_state=seed)
                        else:
                            clf = BaggingRegressor(n_estimators=300, random_state=seed)
                    clf.fit(train_X, train_y)
                    y_pred = clf.predict(test_X)

                    
                    if activity != 'all':
                        ypred_collector.append(y_pred)
                        ytrue_collector.append(test_y)
                        label_collector.append(data.loc[(matches & data.testset), 'activity'].tolist())
                    else:
                        # save these results alone, not aggregated
                        row = prep_row(test_y, y_pred, classifier_approach='combined')
                        result_collector.append(row)
                    tbar.update()

                if len(ytrue_collector):
                    test_y = np.hstack(ytrue_collector)
                    y_pred = np.hstack(ypred_collector)
                    row = prep_row(test_y, y_pred, classifier_approach='individual')
                    result_collector.append(row)
                #print(metrics.classification_report(test_y, y_pred))
results = pd.DataFrame(result_collector)
results.loc[~results.accuracy.isna(), 'accuracy'] = results.loc[~results.accuracy.isna(), 'accuracy'].apply(lambda x: x[0]).astype(float)

first_cols = ['targetvar', 'condition', 'classifier', 'approach']
col_order = first_cols + [col for col in results.columns if col not in first_cols]
results = results[col_order].sort_values(['targetvar', 'condition', 'classifier', 'approach'])
results.to_csv(data_dir / ' supervised_results.csv', index=False)
results.sample(10)

  0%|          | 0/396 [00:00<?, ?it/s]

Unnamed: 0,targetvar,condition,classifier,approach,class_n,support,accuracy,rmse,f1,f1_micro,r2
36,O,text,rf,individual,2,383,0.798956,,0.850485,,
57,E,text,ada,individual,43,389,,4.799375,,,0.06122
17,R,image+txt,xgboost,combined,3,343,,0.802735,,,0.062325
9,R,text,rf,combined,3,343,,0.843477,,,-0.035272
40,O,image,ada,individual,2,383,0.770235,,0.81893,,
65,T,image+txt,rf,individual,4,390,,0.902687,,,0.312255
35,T,image+txt,xgboost,combined,4,390,,0.846563,,,0.395117
69,T,text,xgboost,individual,4,390,,0.981215,,,0.187393
26,E,image+txt,xgboost,combined,43,389,,3.371116,,,0.536828
16,R,image,xgboost,combined,3,343,,0.777039,,,0.121395


## Analysis

In [190]:
results = pd.read_csv(data_dir / ' supervised_results.csv')

### All Data

Classification performance on originality:

In [280]:
o = results.query('targetvar == "O"').pivot(index=['classifier', 'approach'], columns=['condition'], values=['f1', 'accuracy'])
display(o.round(3))

Performance on originality


Unnamed: 0_level_0,Unnamed: 1_level_0,f1,f1,f1,accuracy,accuracy,accuracy
Unnamed: 0_level_1,condition,image,image+txt,text,image,image+txt,text
classifier,approach,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ada,combined,0.794179,0.779167,0.755741,0.741514,0.723238,0.694517
ada,individual,0.81893,0.836735,0.802419,0.770235,0.791123,0.744125
rf,combined,0.852652,0.855513,0.798548,0.804178,0.801567,0.710183
rf,individual,0.844262,0.877953,0.850485,0.801567,0.83812,0.798956
xgboost,combined,0.838446,0.864754,0.824458,0.793734,0.827676,0.767624
xgboost,individual,0.830579,0.849593,0.835938,0.785901,0.806789,0.780679


Performance on regression targets:

In [278]:
x = results.query('targetvar != "O"').pivot(index=['targetvar', 'classifier', 'approach'], columns=['condition'], values=['rmse', 'r2'])
display(x.round(2))

Performance on regression targets


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rmse,rmse,rmse,r2,r2,r2
Unnamed: 0_level_1,Unnamed: 1_level_1,condition,image,image+txt,text,image,image+txt,text
targetvar,classifier,approach,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
E,ada,combined,5.38,5.29,5.04,-0.18,-0.14,-0.03
E,ada,individual,3.45,3.43,4.8,0.51,0.52,0.06
E,rf,combined,3.27,3.27,4.09,0.56,0.56,0.32
E,rf,individual,3.36,3.38,4.14,0.54,0.53,0.3
E,xgboost,combined,3.47,3.37,4.23,0.51,0.54,0.27
E,xgboost,individual,3.89,3.84,4.2,0.38,0.4,0.28
R,ada,combined,0.79,0.79,0.86,0.08,0.1,-0.07
R,ada,individual,0.78,0.77,0.82,0.12,0.13,0.03
R,rf,combined,0.74,0.76,0.84,0.2,0.16,-0.04
R,rf,individual,0.76,0.76,0.83,0.17,0.17,0.0


Simplified view, since the 'approach' is really overkill for measuring and reporting.

In [322]:
x = results.query('targetvar != "O"').query("approach == 'individual'").pivot(index=['targetvar', 'classifier'], columns=['condition'], values=['rmse', 'r2'])
display(x.round(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,rmse,rmse,rmse,r2,r2,r2
Unnamed: 0_level_1,condition,image,image+txt,text,image,image+txt,text
targetvar,classifier,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
E,ada,3.45,3.43,4.8,0.51,0.52,0.06
E,rf,3.36,3.38,4.14,0.54,0.53,0.3
E,xgboost,3.89,3.84,4.2,0.38,0.4,0.28
R,ada,0.78,0.77,0.82,0.12,0.13,0.03
R,rf,0.76,0.76,0.83,0.17,0.17,0.0
R,xgboost,0.81,0.79,0.87,0.04,0.08,-0.11
T,ada,1.01,0.89,0.95,0.14,0.33,0.24
T,rf,0.98,0.9,0.9,0.19,0.31,0.32
T,xgboost,1.04,0.98,0.98,0.08,0.19,0.19


In [323]:
x = results.query('targetvar != "O"').query("approach == 'individual'").query('classifier=="rf"').pivot(index=['targetvar'], columns=['condition'], values=['rmse', 'r2'])
display(x.round(2))

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
targetvar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
E,3.36,3.38,4.14,0.54,0.53,0.3
R,0.76,0.76,0.83,0.17,0.17,0.0
T,0.98,0.9,0.9,0.19,0.31,0.32


### Narrowing down conditions

Comparing regression conditions by observing average performance across classifiers

In [279]:
x.groupby('targetvar').mean().round(2)

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
targetvar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
E,3.8,3.76,4.42,0.39,0.4,0.2
R,0.78,0.78,0.85,0.12,0.12,-0.05
T,1.01,0.9,0.92,0.14,0.31,0.28


Basically: an image-only model is all that's needed for elaboration and resistance to premature closure, and a text-only model is all that's needed for abstractness of titles. These aren't particularly surprising, but good to confirm, and interesting to see the slight $R^2$ for text on E and image on T.

---

Q: Which style of regressor/classifier is better - individual ones per task, a single classifier with one hot, or are they similar? I would expect the first or last case.

In [281]:
o.groupby('approach').mean().round(2)

Unnamed: 0_level_0,f1,f1,f1,accuracy,accuracy,accuracy
condition,image,image+txt,text,image,image+txt,text
approach,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
combined,0.83,0.83,0.79,0.78,0.78,0.72
individual,0.83,0.85,0.83,0.79,0.81,0.77


In [276]:
x.groupby('approach').mean().round(2)

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
approach,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
combined,1.94,1.88,2.07,0.19,0.26,0.14
individual,1.79,1.75,2.05,0.24,0.3,0.15


 A: Typically individual classifiers/regressors have a slight edge. Smaller for classifiers.

 ---

 Q: What classifier works best?



In [289]:
o.groupby('classifier').mean().round(2)

Unnamed: 0_level_0,f1,f1,f1,accuracy,accuracy,accuracy
condition,image,image+txt,text,image,image+txt,text
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ada,0.81,0.81,0.78,0.76,0.76,0.72
rf,0.85,0.87,0.82,0.8,0.82,0.75
xgboost,0.83,0.86,0.83,0.79,0.82,0.77


In [290]:
x.groupby('classifier').mean().round(2)

Unnamed: 0_level_0,rmse,rmse,rmse,r2,r2,r2
condition,image,image+txt,text,image,image+txt,text
classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ada,2.07,2.02,2.24,0.13,0.2,0.07
rf,1.68,1.65,1.94,0.31,0.36,0.21
xgboost,1.84,1.77,2.01,0.21,0.28,0.14


A: RF.

-----

Q: What's the `RF/individual` performance for originality?

In [319]:
o.loc[('rf', 'individual')]

          condition
f1        image        0.844262
          image+txt    0.877953
          text         0.850485
accuracy  image        0.801567
          image+txt     0.83812
          text         0.798956
Name: (rf, individual), dtype: object

### Errata

In [326]:
#import statsmodels.formula.api as smf
#stats = smf.ols('rmse ~ condition + approach + condition', data=results.query('targetvar=="E"')).fit()
#print(stats.summary())

- O - Originality
- R - Resistance to Premature Closure
- E - Elaboration
- T - Abstractness of Titles
- F - Fluency (doesn't concern us because we're looking by prompt)
- C - Creativity Index (an additional metric)