In [4]:
# !pip install transformers

/Users/nduran4/Dropbox (ASU)/Mac/Desktop/GitProjects/llm-linguistic-alignment/.venv/bin/pip


In [1]:
# !git clone https://github.com/nickduran/llm-linguistic-alignment.git

Cloning into 'llm-linguistic-alignment'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 28 (delta 8), reused 24 (delta 5), pack-reused 0[K
Receiving objects: 100% (28/28), 416.79 KiB | 2.35 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [1]:
import os
from os import listdir
from os.path import isfile, join

import pickle
import random

import pandas as pd
import numpy as np

from tqdm import tqdm  # for progress bars

from sklearn.metrics.pairwise import cosine_similarity

## for integrating with BERT
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [2]:
# Function to get lagged conversational turns
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

In [3]:
def get_embedding_with_cache(text):

  if text is None:
    return None

  if text in embedding_cache:
    return embedding_cache[text]

  tokens = tokenizer.tokenize(text)

  # Convert tokens to token IDs
  token_ids = tokenizer.convert_tokens_to_ids(tokens)

  # Add special tokens [CLS] and [SEP]
  token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]

  # Convert token IDs to tensors
  input_ids = torch.tensor([token_ids])

  # Get BERT embeddings
  with torch.no_grad():
      outputs = model(input_ids)

  # Extract hidden states (last layer)
  last_hidden_states = outputs.last_hidden_state

  # Average pooling to get single embeddings
  embedding = torch.mean(last_hidden_states, dim=1).numpy()

  embedding_cache[text] = embedding

  # print(embedding.shape)

  return embedding

In [4]:
# Function to process and get embeddings/cosines for a single file
def process_file(file_path, embedding_cache):
  df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
  df = process_input_data(df)\

  for column in ["utter1", "utter2"]:
    df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)


  df["cosine_similarity"] = df.apply(
      lambda row: cosine_similarity(
          np.array(row["utter1_embedding"]).reshape(1, -1),
          np.array(row["utter2_embedding"]).reshape(1, -1)
          )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
      axis=1
    )

  return df

In [5]:
# Load or initialize the embedding cache, if none there (first time running), then create empty cache to build
embedding_cache_path = "data/bert_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

##########

# Path to the folder containing the text files
folder_path = "data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Process each file and update the cache
concatenated_df = pd.DataFrame()

for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    # Save the updated embedding cache to disk after processing each file
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)


Processing files: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]


In [6]:
concatenated_df

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order,utter1_embedding,utter2_embedding,cosine_similarity
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:,"[[0.16115263, -0.18967539, -0.091776505, 0.072...","[[0.16965896, -0.09341999, 0.31895387, 0.05972...",0.631972
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:,"[[0.16965896, -0.09341999, 0.31895387, 0.05972...","[[0.16538186, 0.040479492, 0.08581751, 0.03785...",0.730898
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:,"[[0.16538186, 0.040479492, 0.08581751, 0.03785...","[[0.42204922, -0.11065055, 0.2772329, 0.342672...",0.633439
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:,"[[0.42204922, -0.11065055, 0.2772329, 0.342672...","[[0.2932856, -0.49990234, 0.16143925, 0.002446...",0.640285
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:,"[[0.2932856, -0.49990234, 0.16143925, 0.002446...","[[0.295946, 0.26871645, -0.15591066, 0.1497117...",0.505967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,PC:,yeah but i think it might have gotten stuck now,"['yeah', 'but', 'i', 'think', 'it', 'might', '...","['yeah', 'but', 'i', 'think', 'it', 'might', '...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah but i think it might have gotten stuck now,if you hit the space bar i think it restarts it,PC: PA:,"[[0.31347594, -0.051073104, 0.18032749, 0.1860...","[[0.11059667, -0.09000373, 0.13568524, -0.2138...",0.755873
61,PA:,if you hit the space bar i think it restarts it,"['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...",ASU-T10_ExpBlock1-Oneatatime.txt,if you hit the space bar i think it restarts it,we got a gold one,PA: PC:,"[[0.11059667, -0.09000373, 0.13568524, -0.2138...","[[0.20519297, -0.1421821, 0.15854359, 0.135607...",0.657690
62,PC:,we got a gold one,"['we', 'got', 'a', 'gold', 'one']","['we', 'get', 'a', 'gold', 'one']","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,we got a gold one,sweet come on,PC: PB:,"[[0.20519297, -0.1421821, 0.15854359, 0.135607...","[[0.06242614, -0.16263641, 0.55421257, 0.21565...",0.596897
63,PB:,sweet come on,"['sweet', 'come', 'on']","['sweet', 'come', 'on']","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]",ASU-T10_ExpBlock1-Oneatatime.txt,sweet come on,i don't know what that means but gold is good,PB: PA:,"[[0.06242614, -0.16263641, 0.55421257, 0.21565...","[[0.27067685, 0.3821062, 0.1225989, 0.16572182...",0.500318
