## Figural: Supervised Learning

In [1]:
import torch
import clip
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from tqdm import tqdm
from figural.scoring import autoset_device, FiguralImage, preprocess_imlist, image_loader, collage, get_avg_sims, similarity_to_target

In [2]:
device = autoset_device()
model, preprocess = clip.load("ViT-B/32", device=device)

CLIP doesn't work on M1 GPUs yet; check here for updates: https://github.com/openai/CLIP/issues/247


In [3]:
data = pd.read_csv('../data/figural_ground_truth.csv', index_col=0)
# 10% for test set
test_prop = 0.1
data['testset'] = (np.random.random(size=len(data)) < test_prop)
data.sample(1)

Unnamed: 0,img_path,blank_sim,booklet,activity,Act_no,id,avg_sim,elaboration_raw,elaboration,pdf_path,titlepage,F,O,T,E,R,C,Name,testset
2573,../data/outputs/test1/activity1/6dcd4-8159.jpg,0.898965,BOOKLETA,activity1,1,6dcd4-8159,0.879442,5693,0.028185,../data/ttct_figural/Unmatched/Booklets/FS10 T...,1,,,,,,,black eye creature,False


In [8]:
data = data[data.R <= 2]
data['R'].value_counts()

0.0    1155
2.0    1077
1.0    1015
Name: R, dtype: int64

In [9]:
imloader = image_loader(data.img_path, contrast_factor=4, crop_bottom=True)
image_inputs = preprocess_imlist(imloader, preprocess, device=device)

3247it [00:05, 571.63it/s]


In [10]:
text = clip.tokenize(data.Name.astype(str)).to(device)

with torch.no_grad():
    image_features = model.encode_image(image_inputs)
    text_features = model.encode_text(text)
 # normalize tensors
image_features /= image_features.norm(dim=1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

: 

: 

## Training a single classifier for all activites using one-hot

In [170]:
# using one-hot
enc = OneHotEncoder().fit(data.activity.values.reshape(-1, 1))
enc.get_feature_names_out()
cat_one_hot = enc.transform(data.activity.values.reshape(-1, 1)).toarray()

for condition in ['text', 'image', 'image+txt']:
    print(condition.upper().center(80, '='))
    if condition == 'text':
        embeds = text_features
    elif condition == 'image':
        embeds = image_features
    elif condition == 'image+txt':
        embeds = np.hstack([image_features, text_features])
    else:
        raise Exception('bad condition')

    embeds = np.hstack([cat_one_hot, embeds])

    matches = (~data['O'].isna())
    train_X = embeds[(matches & ~data.testset)]
    train_y = data.loc[(matches & ~data.testset), 'O']
    test_X = embeds[(matches & data.testset)]
    test_y = data.loc[(matches & data.testset), 'O']

    clf = RandomForestClassifier(n_estimators = 100, random_state=0)
    clf.fit(train_X, train_y)
    y_pred = clf.predict(test_X)
    print(metrics.classification_report(test_y, y_pred, target_names=['Not Original', 'Original']))

              precision    recall  f1-score   support

Not Original       0.77      0.46      0.57       123
    Original       0.75      0.92      0.83       220

    accuracy                           0.76       343
   macro avg       0.76      0.69      0.70       343
weighted avg       0.76      0.76      0.74       343

              precision    recall  f1-score   support

Not Original       0.77      0.63      0.69       123
    Original       0.81      0.90      0.85       220

    accuracy                           0.80       343
   macro avg       0.79      0.76      0.77       343
weighted avg       0.80      0.80      0.79       343

              precision    recall  f1-score   support

Not Original       0.81      0.54      0.65       123
    Original       0.78      0.93      0.85       220

    accuracy                           0.79       343
   macro avg       0.80      0.74      0.75       343
weighted avg       0.79      0.79      0.78       343



In [161]:
featnames = ['catlabel'] * enc.get_feature_names_out().shape[0] + ['img'] * 512 + ['txt'] * 512
x = pd.DataFrame(zip(featnames, clf.feature_importances_), columns=['label', 'importance'])
x.groupby('label').aggregate(['mean', 'sum'])

Unnamed: 0_level_0,importance,importance
Unnamed: 0_level_1,mean,sum
label,Unnamed: 1_level_2,Unnamed: 2_level_2
catlabel,0.000374,0.004118
img,0.001145,0.586238
txt,0.0008,0.409643


## Training a different classifier for each activity

In [None]:
for targetvar in list("O", "R"):
    print(f"Predicting variable: {targetvar}".center(80,'='))
    for classifier in ['rf']: #'ada', 'xgboost']:
        print(classifier.upper().center(80, '-'))

        for condition in ['image+txt']: #['text', 'image', 'image+txt']:
            ytrue_collector = []
            ypred_collector = []
            label_collector = []

            print(condition.upper().center(80, ' '))
            if condition == 'text':
                embeds = text_features
            elif condition == 'image':
                embeds = image_features
            elif condition == 'image+txt':
                embeds = np.hstack([image_features, text_features])
            else:
                raise Exception('bad condition')

            for activity in tqdm(data.activity.unique()):
                matches = (data.activity == activity) & (~pd.to_numeric(data[targetvar], errors='coerce').isna())
                train_X = embeds[(matches & ~data.testset)]
                train_y = data.loc[(matches & ~data.testset), targetvar] #.astype(bool)
                test_X = embeds[(matches & data.testset)]
                test_y = data.loc[(matches & data.testset), targetvar] #.astype(bool)

                if classifier == 'rf':
                    clf = RandomForestClassifier(n_estimators = 300)
                elif classifier == 'xgboost':
                    clf = XGBClassifier(n_estimators=300, learning_rate=0.2, objective='binary:logistic')
                elif classifier == 'ada':
                    clf = AdaBoostClassifier(n_estimators=100)
                elif classifier == 'bagg':
                    clf = BaggingClassifier(n_estimators=100)
                clf.fit(train_X, train_y)
                y_pred = clf.predict(test_X)

                ypred_collector.append(y_pred)
                ytrue_collector.append(test_y)
                label_collector.append(data.loc[(matches & data.testset), 'activity'].tolist())

            print("Overall".upper().center(80))

            test_y = np.hstack(ytrue_collector)
            y_pred = np.hstack(ypred_collector)
            print(metrics.classification_report(test_y, y_pred))

--------------------------------------BAGG--------------------------------------
                                   IMAGE+TXT                                    


100%|██████████| 11/11 [00:58<00:00,  5.36s/it]

                                    OVERALL                                     
              precision    recall  f1-score   support

         0.0       0.78      0.75      0.76       123
         1.0       0.86      0.88      0.87       220

    accuracy                           0.83       343
   macro avg       0.82      0.81      0.82       343
weighted avg       0.83      0.83      0.83       343






## Similarity to Zero-Originality Lists

O - Originality
R - Resistance to Premature Closure
E - Elaboration
T - Abstractness of Titles
F - Fluency (doesn't concern us because we're looking by prompt)
C - Creativity Index (an additional metric)

In [2]:
data['F'].value_counts()

NameError: name 'data' is not defined