In [None]:
import pandas as pd

In [2]:
desired_columns = ["Physical transformation","Future self","Past self","Current self","Physical wishes"]
data = data[desired_columns]

In [3]:
data['Looks/Body'] = data[desired_columns].astype(str).agg(' '.join, axis=1)
data['Looks/Body'] = data['Looks/Body'].apply(lambda x: 1 if '1' in x else 0)

In [4]:
desired_columns = ["Looks/Body"]
data = data[desired_columns]

In [5]:
desired_columns = ["all_text"]
text = text.rename_axis("POSTING ID")
text = text[desired_columns]

In [7]:
df = pd.merge(text, data, on='POSTING ID')
df.rename(columns={'Looks/Body': 'label'}, inplace=True)

In [8]:
df.reset_index(drop=True)

In [8]:
import logging
import datasets
import matplotlib.pyplot as plt
import gc
import torch
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report, PrecisionRecallDisplay
import numpy as np
import os
from small_text.base import LABEL_IGNORED

from small_text import (
    EmptyPoolException,
    PoolBasedActiveLearner,
    PoolExhaustedException,
    BreakingTies,
    EmbeddingKMeans,
    SetFitClassificationFactory,
    SetFitModelArguments,
    TextDataset,
    random_initialization_balanced,
    SubsamplingQueryStrategy
)

import pandas as pd

import pigeonXT as pixt
from typing import List, Optional, Any, Tuple, Dict

class Annotation:
    @staticmethod
    def run_annotation(df: pd.DataFrame, labels: List[str], column_name: str)->pixt.annotate:
        # This will only setup the annotation, and needs to be confirmed via UI interaction
        return pixt.annotate(
            examples=df[[column_name]].rename(columns={column_name: 'example'}),
            options=labels,
            task_type='classification',
            buttons_in_a_row=3,
            reset_buttons_after_click=True,
            include_next=True
        )
    
def samp(dat,n):
    if len(dat) < n:
        return dat.drop(columns=["label"])
    return dat.sample(n).drop(columns=["label"])
    
# disables the progress bar for notebooks: https://github.com/huggingface/datasets/issues/2651
datasets.logging.get_verbosity = lambda: logging.NOTSET

POSSIBLE_LABELS = [0,1]

target_labels = np.arange(len(POSSIBLE_LABELS)-1).astype(int)

In [10]:
all_unlabeled = text.iloc[1000:1200]

In [8]:
init_labels = np.array([1 if s == '1' else 0 for s in df.label.values.tolist()])
init_dataset = TextDataset.from_arrays(df.all_text.values.tolist(),                        
                                        init_labels,
                                        target_labels=target_labels)

In [8]:
full_labeled_sample = df[['all_text','label']].rename(columns={"all_text":"modeling_text"})

In [8]:
all_unlabeled = all_unlabeled[~all_unlabeled.all_text.isin(full_labeled_sample.modeling_text.values.tolist())]

In [14]:
full_labeled_sample
full_labeled_sample_ones = full_labeled_sample[full_labeled_sample['label'] == 1]
full_labeled_sample_zeros = full_labeled_sample[full_labeled_sample['label'] == 0]
sampled_ones = full_labeled_sample_ones.sample(n=50, random_state=42)
sampled_zeros = full_labeled_sample_zeros.sample(n=50, random_state=42)
balanced_df = pd.concat([sampled_ones, sampled_zeros])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
validation_data = balanced_df

In [16]:
init_labeled = full_labeled_sample.groupby("label").apply(samp,n=200).reset_index()
#validation_data = full_labeled_sample[~full_labeled_sample.modeling_text.isin(init_labeled.modeling_text)].groupby("label").apply(samp,n=100).reset_index()
len(init_labeled), len(validation_data)

(365, 100)

In [17]:
validation_data['label'].nunique() 

2

In [18]:
target_labels = [0,1]

In [8]:
init_dataset = TextDataset.from_arrays(init_labeled.modeling_text.values.tolist(),                        
                                       init_labeled.label.values,
                                       target_labels=target_labels)

In [20]:
print("Labels in data:", init_labeled.label.unique())
print("Target labels:", target_labels)

Labels in data: [0 1]
Target labels: [0, 1]


In [21]:
validation_dataset = TextDataset.from_arrays(validation_data.modeling_text.values.tolist(),                        
                                             validation_data.label.values,
                                               target_labels=target_labels)

In [23]:
posts = all_unlabeled.all_text.values.tolist()

In [None]:
len(init_dataset.x)

365

In [8]:
from small_text import LABEL_UNLABELED
model_args = SetFitModelArguments('sentence-transformers/paraphrase-mpnet-base-v2')

clf_factory = SetFitClassificationFactory(model_args,
                                          len(POSSIBLE_LABELS),
                                          classification_kwargs=dict({
                                              'device': 'cuda',
                                              'mini_batch_size': 8
                                          }))

# define a query strategy and initialize a pool-based active learner
query_strategy = SubsamplingQueryStrategy(BreakingTies())

def initialize_with_warmstart(init_dataset):
    

    # Append the initial labeled data to our train dataset. This is only necessary because the logistic regression head 
    #   implicitly obtains the number of classes from the training data. If we omitted this and the first query 
    #   would not return all four labels, the model head would predict three classes instead of four.
    labeled_indices = np.arange(len(init_dataset.y))

    train = TextDataset.from_arrays(init_dataset.x + posts, 
                                    np.append(init_dataset.y, np.array([LABEL_UNLABELED]*len(posts))), 
                                    target_labels=target_labels)
    
    # suppress progress bars in jupyter notebook
    setfit_train_kwargs = {'show_progress_bar': False}

    active_learner = PoolBasedActiveLearner(clf_factory, query_strategy, train, 
                                            fit_kwargs={'setfit_train_kwargs': setfit_train_kwargs})
    active_learner._clf = clf_factory.new()
    active_learner._clf.fit(init_dataset, setfit_train_kwargs=setfit_train_kwargs)

    active_learner.y = init_dataset.y
    active_learner.indices_labeled = labeled_indices
    active_learner._index_to_position = active_learner._build_index_to_position_dict()
    
    return active_learner, train


active_learner, train = initialize_with_warmstart(init_dataset)

In [None]:
def evaluate(active_learner, train, test):
    
    if len(train) == 0:
        return np.nan
    
    y_pred = active_learner.classifier.predict(train)
    y_score = active_learner.classifier.predict_proba(test)
    y_pred_test = active_learner.classifier.predict(test)
    
    test_acc = accuracy_score(y_pred_test, test.y)
    test_f1 = f1_score(test.y, y_pred_test, average="macro")

    print('Train accuracy: {:.2f}'.format(accuracy_score(y_pred, train.y)))
    print('Test accuracy: {:.2f}'.format(test_acc))
    print('Test F1: {:.2f}'.format(test_f1))
    print(classification_report(test.y,y_pred_test))
    return test_acc


results_setfit = []
results_setfit.append(evaluate(active_learner, train[active_learner.indices_labeled], validation_dataset))

In [26]:
round_v = 1
prefix = "Looks_Body"

In [8]:
train_dat = pd.DataFrame({"tr":train.x})

In [8]:
num_queries = 20

round_v = round_v + 1
# ...where each iteration consists of labeling 20 samples
q_indices = active_learner.query(num_samples=num_queries)

annotations = Annotation.run_annotation(train_dat.iloc[q_indices,:],['0','1'],'tr')

In [75]:
out_fil = f"ann_{prefix}_{round_v}.csv"
if os.path.exists(out_fil):
    print("file exists, change the name")
else:
    annotations.to_csv(out_fil,index=False)

In [77]:
def set_labels(annotations):
    labels = []
    for x in annotations.label:
        if x == '1':
            labels.append(1)
        elif x == '0':
            labels.append(0)
        else:
            labels.append(LABEL_IGNORED)
        
        #lab = get_val(x)
        # if lab is None:
        #     labels.append(LABEL_IGNORED)
        # else:
        #     labels.append(lab)
    return np.array(labels)

In [8]:
active_learner.update(set_labels(annotations))

In [None]:

# memory fix: https://github.com/UKPLab/sentence-transformers/issues/487, https://github.com/UKPLab/sentence-transformers/issues/1793
gc.collect()
torch.cuda.empty_cache()

print('---------------')
print('Iteration #{:d} ({} samples)'.format(0, len(active_learner.indices_labeled)))
results_setfit.append(evaluate(active_learner, train[active_learner.indices_labeled], validation_dataset))

In [8]:
pd.set_option('display.max_colwidth', None)
m = pd.DataFrame({"x":validation_dataset.x, "y":validation_dataset.y, "pred": active_learner.classifier.predict_proba(validation_dataset)[:,1]})
m['pred_bin'] = m['pred'] > .5
m[(m.pred_bin == 0) & (m.y == 0)]