In [1]:
!pip install -q transformers

In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [3]:
# Initialize RoBERTa
model_id = "roberta-base"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model on CPU
model = AutoModel.from_pretrained(model_id)
model.to('cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [4]:
!unzip /content/prepped_stan_small.zip

Archive:  /content/prepped_stan_small.zip
  inflating: prepped_stan_small/ASU-T10_ExpBlock1-Oneatatime.txt  
  inflating: prepped_stan_small/ASU-T10_ExpBlock2-DolphinShow.txt  


In [5]:
# Function to get lagged conversational turns
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

# Function to compute embeddings using RoBERTa, with caching
default_embedding_engine = "RoBERTa"
def get_embedding_with_cache(
    text: str,
    embedding_cache: dict,
    tokenizer,
    model,
    engine: str = default_embedding_engine
) -> list:
    if text is None:
        return None
    if (text, engine) not in embedding_cache.keys():
        # Tokenize and get embeddings from RoBERTa
        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=32)
        input_ids = inputs["input_ids"]

        with torch.no_grad():
            outputs = model(input_ids=input_ids)

        # Extract embeddings from model outputs
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy().tolist()[0]

        embedding_cache[(text, engine)] = embedding
    return embedding_cache[(text, engine)]

# Function to process and get embeddings/cosines for a single file
def process_file(file_path, embedding_cache, tokenizer, model, default_embedding_engine):
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)

    # Create column of embeddings
    for column in ["utter1", "utter2"]:
        df[f"{column}_embedding"] = df[column].apply(lambda x: get_embedding_with_cache(x, embedding_cache, tokenizer, model))

    # Create column of cosine similarity
    df["cosine_similarity"] = df.apply(
        lambda row: cosine_similarity(
            np.array(row["utter1_embedding"]).reshape(1, -1),
            np.array(row["utter2_embedding"]).reshape(1, -1)
        )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
        axis=1
    )

    return df

In [6]:
# Load or initialize the embedding cache for RoBERTa
embedding_cache_path = "./data/roberta_embedding_cache.pkl"
embedding_cache_dir = os.path.dirname(embedding_cache_path)

# Create the directory if it does not exist
if not os.path.exists(embedding_cache_dir):
    os.makedirs(embedding_cache_dir)

try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

# Path to the folder containing the text files
folder_path = "./prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

In [7]:
# Process each file and update the cache
concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache, tokenizer, model, default_embedding_engine)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    # Save the updated embedding cache to disk after processing each file
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# concatenated_df now contains all the processed data
concatenated_df.to_csv("concatenated_df.csv", index=False)

Processing files:   0%|          | 0/2 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Processing files: 100%|██████████| 2/2 [00:13<00:00,  6.87s/it]


In [9]:
df_view = pd.read_csv("concatenated_df.csv")
df_view

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order,utter1_embedding,utter2_embedding,cosine_similarity
0,PC:,yeah let's try that one,"['yeah', 'let', 'us', 'try', 'that', 'one']","['yeah', 'let', 'u', 'try', 'that', 'one']","[('yeah', 'NN'), ('let', 'VBD'), ('us', 'PRP')...","[('yeah', 'NNS'), ('let', 'VBP'), ('u', 'JJ'),...","[('yeah', 'JJ'), ('let', 'VB'), ('us', 'PRP'),...","[('yeah', 'JJ'), ('let', 'VB'), ('u', 'FW'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah let's try that one,this one,PC: PB:,"[-0.05718561261892319, -0.03673159331083298, -...","[-0.09039264917373657, 0.11435879021883011, -0...",0.964272
1,PB:,this one,"['this', 'one']","['this', 'one']","[('this', 'DT'), ('one', 'NN')]","[('this', 'DT'), ('one', 'NN')]","[('this', 'DT'), ('one', 'CD')]","[('this', 'DT'), ('one', 'CD')]",ASU-T10_ExpBlock1-Oneatatime.txt,this one,yeah that one doesn't look too bad,PB: PA:,"[-0.09039264917373657, 0.11435879021883011, -0...","[-0.022548239678144455, 0.03876623883843422, -...",0.938527
2,PA:,yeah that one doesn't look too bad,"['yeah', 'that', 'one', 'does', 'not', 'look',...","['yeah', 'that', 'one', 'do', 'not', 'look', '...","[('yeah', 'NN'), ('that', 'WDT'), ('one', 'CD'...","[('yeah', 'NN'), ('that', 'WDT'), ('one', 'CD'...","[('yeah', 'NN'), ('that', 'IN'), ('one', 'CD')...","[('yeah', 'NN'), ('that', 'IN'), ('one', 'CD')...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah that one doesn't look too bad,dropped something,PA: PB:,"[-0.022548239678144455, 0.03876623883843422, -...","[-0.03447270020842552, 0.08327183127403259, -0...",0.943913
3,PB:,dropped something,"['dropped', 'something']","['drop', 'something']","[('dropped', 'VBD'), ('something', 'NN')]","[('drop', 'NN'), ('something', 'NN')]","[('dropped', 'VBD'), ('something', 'NN')]","[('drop', 'NN'), ('something', 'NN')]",ASU-T10_ExpBlock1-Oneatatime.txt,dropped something,i feel like that's the solution to all of these,PB: PA:,"[-0.03447270020842552, 0.08327183127403259, -0...","[-0.010948986746370792, 0.1288169026374817, 0....",0.888959
4,PA:,i feel like that's the solution to all of these,"['i', 'feel', 'like', 'that', 'is', 'the', 'so...","['i', 'feel', 'like', 'that', 'be', 'the', 'so...","[('i', 'JJ'), ('feel', 'VBP'), ('like', 'IN'),...","[('i', 'JJ'), ('feel', 'VBP'), ('like', 'IN'),...","[('i', 'LS'), ('feel', 'VB'), ('like', 'IN'), ...","[('i', 'LS'), ('feel', 'VB'), ('like', 'IN'), ...",ASU-T10_ExpBlock1-Oneatatime.txt,i feel like that's the solution to all of these,boy yeah just make a thing like a weight like...,PA: PB:,"[-0.010948986746370792, 0.1288169026374817, 0....","[0.01567079871892929, 0.002193540334701538, 0....",0.911543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,PA:,i think we'll have to make platforms or somet...,"['i', 'think', 'we', 'will', 'have', 'to', 'ma...","['i', 'think', 'we', 'will', 'have', 'to', 'ma...","[('i', 'JJ'), ('think', 'VBP'), ('we', 'PRP'),...","[('i', 'JJ'), ('think', 'VBP'), ('we', 'PRP'),...","[('i', 'FW'), ('think', 'VBP'), ('we', 'PRP'),...","[('i', 'FW'), ('think', 'VBP'), ('we', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think we'll have to make platforms or somet...,the bottom one to the left one i don't know ye...,PA: PB:,"[0.03515678644180298, 0.037898652255535126, -0...","[-0.04833640158176422, 0.037174176424741745, 0...",0.921612
61,PB:,the bottom one to the left one i don't know ye...,"['the', 'bottom', 'one', 'to', 'the', 'left', ...","['the', 'bottom', 'one', 'to', 'the', 'left', ...","[('the', 'DT'), ('bottom', 'JJ'), ('one', 'CD'...","[('the', 'DT'), ('bottom', 'JJ'), ('one', 'CD'...","[('the', 'DT'), ('bottom', 'JJ'), ('one', 'CD'...","[('the', 'DT'), ('bottom', 'JJ'), ('one', 'CD'...",ASU-T10_ExpBlock2-DolphinShow.txt,the bottom one to the left one i don't know ye...,okay those would not connect to each other wai...,PB: PC:,"[-0.04833640158176422, 0.037174176424741745, 0...","[0.019083712249994278, -0.013006923720240593, ...",0.967022
62,PC:,okay those would not connect to each other wai...,"['okay', 'those', 'would', 'not', 'connect', '...","['okay', 'those', 'would', 'not', 'connect', '...","[('okay', 'IN'), ('those', 'DT'), ('would', 'M...","[('okay', 'IN'), ('those', 'DT'), ('would', 'M...","[('okay', 'JJ'), ('those', 'DT'), ('would', 'M...","[('okay', 'JJ'), ('those', 'DT'), ('would', 'M...",ASU-T10_ExpBlock2-DolphinShow.txt,okay those would not connect to each other wai...,that might be okay and then if we could jump i...,PC: PA:,"[0.019083712249994278, -0.013006923720240593, ...","[-0.01624179631471634, 0.06377503275871277, 0....",0.950928
63,PA:,that might be okay and then if we could jump i...,"['that', 'might', 'be', 'okay', 'and', 'then',...","['that', 'might', 'be', 'okay', 'and', 'then',...","[('that', 'DT'), ('might', 'MD'), ('be', 'VB')...","[('that', 'DT'), ('might', 'MD'), ('be', 'VB')...","[('that', 'WDT'), ('might', 'MD'), ('be', 'VB'...","[('that', 'WDT'), ('might', 'MD'), ('be', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,that might be okay and then if we could jump i...,okay so,PA: PC:,"[-0.01624179631471634, 0.06377503275871277, 0....","[-0.07056449353694916, 0.03589553385972977, -0...",0.803336


In [11]:
max(df_view.cosine_similarity)

0.9831999001996622

In [14]:
df_view[df_view['cosine_similarity'] > 0.90].shape[0]

49

In [17]:
df_view[df_view['cosine_similarity'] > 0.90].shape[0]/df_view.shape[0]

0.7538461538461538