## Score Alternate Uses Task with sT5

<a href="https://colab.research.google.com/github/massivetexts/llm_aut_study/blob/main/notebooks/Sentence-T5 AUT Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

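Sentence-T5 (sT5) is a sentence-embedding model built on the T5 transformer. This notebook scores each response as the cosine distance between the embeddings of the prompt and the response, which allows a comparison to traditional automated approaches for scoring divergent thinking, such as SemDis and OCS.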

This code is related to Organisciak, P., Acar, S., Dumas, D., & Berthiaume, K. (2022). Beyond Semantic Distance: Automated Scoring of Divergent Thinking Greatly Improves with Large Language Models. http://dx.doi.org/10.13140/RG.2.2.32393.31840.

In [None]:
!pip install tensorflow_text

In [16]:
#@title Setup
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text
from pathlib import Path
import numpy as np
import random
import json
from scipy.spatial.distance import cosine
import pandas as pd
from tqdm.auto import tqdm

# Sentence-T5 checkpoint to load; the #@param annotation renders this as a Colab dropdown.
model = "st5-base" #@param ["st5-base", "st5-large", "st5-3b"]
# TF Hub handle for the chosen checkpoint (version 1 of the published module).
hub_url = f"https://tfhub.dev/google/sentence-t5/{model}/1"  
# Wrap the hub module as a Keras layer. Called on a tensor of strings it returns a
# list whose single element is the embedding tensor (see the example cell below).
# NOTE(review): presumably downloads the model on first use -- confirm caching behaviour.
encoder = hub.KerasLayer(hub_url)



In [17]:
# Example usage: embed a single word, a short sentence, and a longer sentence.
english_sentences = tf.constant([
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
])
# The encoder returns a list holding one tensor with a row per input string.
english_embeds = encoder(english_sentences)
print(english_embeds)

[<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[-0.02503787, -0.01858248,  0.01717696, ..., -0.03797331,
        -0.06851538,  0.00777527],
       [-0.01625948, -0.01902418,  0.01037012, ..., -0.00342999,
        -0.02487066, -0.02187493],
       [-0.0161721 , -0.00130474,  0.01163577, ...,  0.02126005,
        -0.03934697,  0.01307131]], dtype=float32)>]


In [18]:
#@title Params
base_dir = Path('drive/MyDrive/Grants/MOTES/') #@param { type: 'raw' }
gt_dir = base_dir / 'Data' / 'aut_ground_truth' #@param { type: 'raw' }
print("GT options", [x.name for x in gt_dir.glob('*tar.gz')])
data_subdir = "gt_main2" #@param ['gt_byparticipant', 'gt_main2', 'gt_byprompt']

!cp "{gt_dir}/{data_subdir}.tar.gz" .
!tar -xf {data_subdir}.tar.gz
data_dir = Path('data') / data_subdir
evaldir = base_dir / 'Data' / 'evaluation' / data_subdir #@param { type: 'raw' }
!mkdir -p {evaldir}
random_seed = 987 #@param {type:'number'}

def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's legacy global RNG.

    Parameters
    ----------
    seed : int
        Value handed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

# Apply the seed chosen in the params above before any sampling happens.
set_seed(random_seed)

GT options ['gt_main.tar.gz', 'gt_bypart3.tar.gz', 'gt_byprompt4.tar.gz', 'gt_byparticipant.tar.gz', 'gt_byprompt.tar.gz', 'all.tar.gz', 'gt_main2.tar.gz', 'gt_main_std.tar.gz']


In [19]:
# Every file under <data_dir>/test is parsed as one JSON record (one row).
# iterdir() yields entries in arbitrary order, so sort the paths: row order, the
# seeded .sample() below, and the index written to the CSV then repeat across runs.
# Non-file entries (e.g. stray sub-directories) are skipped instead of crashing read_text().
test_files = sorted(p for p in (data_dir / 'test').iterdir() if p.is_file())
testset = pd.DataFrame([json.loads(p.read_text()) for p in test_files])
testset.sample()

Unnamed: 0,src,question,prompt,response,id,target,participant,response_num,count
595,betal18,What is a surprising use for a BOX?,box,A derby car,betal18_box-7cc7,2.1,betal182184,,


In [20]:
# Per-checkpoint batch size; any model not listed here falls back to 200.
# NOTE(review): the larger checkpoints presumably need smaller batches to fit in memory -- confirm.
batch_size = {"st5-large": 100, "st5-3b": 25}.get(model, 200)

def unique_embeds_from_series(s, batch_size=100):
    """Embed every distinct value of a Series with the notebook-level ``encoder``.

    Parameters
    ----------
    s : pandas.Series
        Named Series of strings; its ``name`` becomes the key column of the result.
    batch_size : int, default 100
        Maximum number of strings passed to the encoder in a single call.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, with columns ``s.name`` and
        ``f'{s.name}_embedding'`` (a NumPy vector per row). Empty, with the same
        two columns, when ``s`` has no values.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    columns = [s.name, f'{s.name}_embedding']
    vals = s.unique().tolist()
    if not vals:
        # Nothing to encode; tf.constant([]) would also be a float tensor rather
        # than a string tensor, so do not hand it to the encoder.
        return pd.DataFrame(columns=columns)

    # Ceiling division: just enough batches that none exceeds batch_size and none
    # is empty (the former `1 + len // batch_size` added a surplus batch on exact
    # multiples, which became an empty batch when batch_size == 1).
    batches = -(-len(vals) // batch_size)
    val_collector = []
    for i in tqdm(range(batches)):
        # The strided slice deals values round-robin, so batch sizes differ by at most one.
        batch_vals = vals[i::batches]
        # The encoder returns a list; element 0 is the embedding tensor for the batch.
        # Convert it to NumPy once per batch instead of once per row.
        batch_embeds = encoder(tf.constant(batch_vals))[0].numpy()
        val_collector.extend(zip(batch_vals, batch_embeds))
    return pd.DataFrame(val_collector, columns=columns)

# Build one embedding lookup table per text column; each distinct string is encoded once.
prompt_ref = unique_embeds_from_series(testset.prompt, batch_size=batch_size)
response_ref = unique_embeds_from_series(testset.response, batch_size=batch_size)
# Show one random row as a sanity check.
prompt_ref.sample()

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Unnamed: 0,prompt,prompt_embedding
7,knife,"[0.0006218081, -0.0077829, -0.0077463207, 0.03..."


In [21]:
def _prompt_response_distance(row):
    """Return the cosine distance between one row's prompt and response embeddings."""
    return cosine(row['prompt_embedding'], row['response_embedding'])

# Left-join the embedding lookups back onto every test row. No `on` is given, so
# pandas joins on the column names the frames share.
combined = testset.merge(prompt_ref, how='left').merge(response_ref, how='left')
# scipy's `cosine` is a distance (1 - cosine similarity).
combined['predicted'] = combined.apply(_prompt_response_distance, axis=1)
combined['model'] = model

# Keep only the columns written to the evaluation file.
report_columns = ['id', 'model', 'participant', 'prompt', 'target', 'predicted', 'src']
output = combined[report_columns]
output.to_csv(evaldir / f'{model}.csv')