### Setup environment

In [None]:
# pip uninstall transformers
# pip install -q git+https://github.com/zphang/transformers@c3dc391
# pip install git+https://github.com/huggingface/peft.git
# pip install bitsandbytes

### Import Necessary Libraries

In [None]:
import os  # Provides a way to use operating system dependent functionality like reading or writing to the file system
import pickle
import pandas as pd  # Powerful data structures for data analysis, time series, and statistics
import numpy as np  # Support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays
from tqdm import tqdm  # For progress bars
from sklearn.metrics.pairwise import cosine_similarity  # Computes the cosine similarity between samples in a matrix

# For W2V specifically
import re  # Provides regular expression matching operations
import ast  # Abstract Syntax Trees, used for parsing and analyzing Python source code
from collections import Counter  # Provides a way to count the frequency of elements in a collection
import gensim  # Library for unsupervised topic modeling and natural language processing
import gensim.downloader as api  # Downloads and loads pre-trained models and datasets
# from gensim.models import KeyedVectors  # Provides efficient word vector representation and storage

# For BERT specifically 
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# For GPT specifically
import openai
openai.api_key = [ENTER HERE]

# For LLAMA specifically
import torch
from transformers import LLaMATokenizer, LLaMAForCausalLM
from transformers import GenerationConfig
token = "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"
tokenizer = LLaMATokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_auth_token=token)

# Begin Word2Vec

### SETUP for W2V

In [4]:
# retrieve the curent working directory where the script is being executed
script_dir = os.getcwd()

# create a directory called "gensim-data" (if doesn't already exist)
local_cache_dir = os.path.join(script_dir, "gensim-data")
os.makedirs(local_cache_dir, exist_ok=True)
print(f"Local cache directory: {local_cache_dir}")

# configure Gensim to use local_cache_dir as base directory for downloading and storing models
api.BASE_DIR = local_cache_dir
print(f"Gensim BASE_DIR set to: {api.BASE_DIR}")

# checks if specified models are already downloaded to cache directory, if not, download them
def download_and_cache_models(models, cache_dir):
    api.BASE_DIR = cache_dir
    for model_name in models:
        model_path = os.path.join(cache_dir, model_name)
        if not os.path.exists(model_path):
            try:
                print(f"Downloading model: {model_name}")
                model = api.load(model_name)
                print(f"Downloaded and cached model: {model_name}")
            except Exception as e:
                print(f"Error downloading {model_name}: {e}")
        else:
            print(f"Model {model_name} already exists at: {model_path}")

# specifies the list of models to be cached locally, invoking the download_and_cache_models function 
models_to_cache = ['word2vec-google-news-300', 'glove-twitter-200']
download_and_cache_models(models_to_cache, local_cache_dir)

# attempts to load a model from specified file path
def load_model_if_not_exists(model_path, binary=True):
    try:
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file {model_path} does not exist.")
        return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=binary)
    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None

# checks if the w2v_google_model is already loaded in the global namespace. if not, attempts to load it from local cache directory.
if 'w2v_google_model' not in globals():
    w2v_google_model_path = os.path.join(local_cache_dir, 'word2vec-google-news-300', 'word2vec-google-news-300.gz')
    w2v_google_model = load_model_if_not_exists(w2v_google_model_path, binary=True)
    if w2v_google_model is not None:
        print("Word2Vec Google News model loaded from local cache successfully.")
    else:
        print("Failed to load Word2Vec Google News model.")

# note: possible todo: is it more efficient to use gensim.downloader.load(model_name)?
# note, downloading model, it downloads properly, but also throws the exception warning for some reason. 
# TODO: instead of just loading google news model into global workspace, load in all within "models_to_cache" list

Local cache directory: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Gensim BASE_DIR set to: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Model word2vec-google-news-300 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/word2vec-google-news-300
Model glove-twitter-200 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/glove-twitter-200


### FUNCTION DEFINITIONS

In [5]:
def aggregate_conversations(folder_path: str) -> pd.DataFrame:
    """
    Aggregates multiple .txt files located in a specified folder 
    into a single pandas DataFrame. Each file is expected to be 
    tab-separated. 
    
    Returns a DataFrame containing the concatenated content of all 
    the .txt files
    """
    text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]
    concatenated_df = pd.DataFrame()

    for file_name in text_files:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)
    
    return concatenated_df

def build_filtered_vocab(data: pd.DataFrame, output_file_directory: str, high_sd_cutoff: float = 3, low_n_cutoff: int = 1):
    """
    Constructs a vocabulary from the ‘lemma’ column of the input DataFrame, 
    applying frequency-based filtering: ords occurring less frequently 
    than low_n_cutoff or more frequently than a certain standard deviation 
    above the mean (high_sd_cutoff) are filtered out. 
    
    Returns: Two lists: one with all vocabulary words and another with filtered words
    Outputs: The vocabulary frequencies to files
    """ 

    all_sentences = [re.sub(r'[^\w\s]+', '', str(row)).split() for row in data['lemma']]
    all_words = [word for sentence in all_sentences for word in sentence]

    frequency = Counter(all_words)

    frequency_filt = {word: freq for word, freq in frequency.items() if len(word) > 1 and freq > low_n_cutoff}
    
    if high_sd_cutoff is not None:
        mean_freq = np.mean(list(frequency_filt.values()))
        std_freq = np.std(list(frequency_filt.values()))
        cutoff_freq = mean_freq + (std_freq * high_sd_cutoff)
        filteredWords = {word: freq for word, freq in frequency_filt.items() if freq < cutoff_freq}
    else:
        filteredWords = frequency_filt
  
    vocabfreq_all = pd.DataFrame(frequency.items(), columns=["word", "count"]).sort_values(by='count', ascending=False)
    vocabfreq_filt = pd.DataFrame(filteredWords.items(), columns=["word", "count"]).sort_values(by='count', ascending=False)
  
    vocabfreq_all.to_csv(os.path.join(output_file_directory, 'vocab_unfilt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    vocabfreq_filt.to_csv(os.path.join(output_file_directory, 'vocab_filt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    
    return list(frequency.keys()), list(filteredWords.keys())

def is_list_like_column(series):
    """
    Checks if a pandas Series contains list-like strings (i.e., strings that 
    look like lists).
    """    

    try:
        return series.apply(lambda x: x.strip().startswith("[")).all()
    except AttributeError:
        return False

def convert_columns_to_lists(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts any columns in a DataFrame that contain list-like strings into 
    actual Python lists using ast.literal_eval.
    """        

    columns_converted = []
    for col in df.columns:
        if is_list_like_column(df[col]):
            df[col] = df[col].apply(ast.literal_eval)
            columns_converted.append(col)
    return df, columns_converted

def pair_columns_lagged(df: pd.DataFrame, include_stan: bool = True) -> pd.DataFrame:
    """
    Creates lagged pairs of specified columns, generating new columns with a 
    suffix of ‘1’ for the original content and ‘2’ for the lagged content. Also 
    adds a new column indicating the order of participants between successive rows.
    """   

    columns_to_lag = ['content', 'token', 'lemma', 'tagged_token', 'tagged_lemma']
    
    if include_stan:
        stan_columns = [col for col in df.columns if 'stan' in col]
        columns_to_lag.extend(stan_columns)
    
    for col in columns_to_lag:
        if col in df.columns:  
            df[f'{col}1'] = df[col]
            df[f'{col}2'] = df[col].shift(-1)
    
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    
    return df

def compute_cosine_similarities(df: pd.DataFrame, columns: list):
    """
    Computes cosine similarities between pairs of vectors in the specified columns 
    and adds the results as new columns in the DataFrame. Handles ‘token’ and ‘lemma’ 
    columns separately.
    """   

    for col1, col2 in columns:
        similarities = []
        for i in range(len(df)):
            vec1 = df.iloc[i][col1]
            vec2 = df.iloc[i][col2]
            if vec1 is not None and vec2 is not None:
                similarity = cosine_similarity([vec1], [vec2])[0][0]
            else:
                similarity = None
            similarities.append(similarity)

        if 'token' in col1:
            similarity_column_name = 'token_cosine_similarity'
        elif 'lemma' in col1:
            similarity_column_name = 'lemma_cosine_similarity'

        df[similarity_column_name] = similarities
    return df

def get_sum_embeddings(token_list, model):
    """
    Calculates the sum of word embeddings for a list of tokens using a 
    pre-trained Word2Vec model.
    """ 

    if token_list is None:
        return None    
    embeddings = []
    for word in token_list:
        if word in model.key_to_index:  
            embeddings.append(model[word])    
    if embeddings:
        sum_embedding = np.sum(embeddings, axis=0)
        return sum_embedding
    else:
        return None  
    
def process_file_for_W2V(file_path, vocab_list: list):     
    """
    Processes a file containing conversation data, filters tokens 
    based on a provided vocabulary list, pairs and lags columns, computes 
    word embeddings, and then calculates cosine similarities between the embeddings.
    """ 

    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    df, columns_converted = convert_columns_to_lists(df)

    columns_to_filter = ['lemma','token']
    for col in columns_to_filter:

        df[col] = df[col].apply(lambda token_list: [word for word in token_list if word in vocab_list])

    df = pair_columns_lagged(df)

    for column in ["lemma", "token"]:
        df[f"{column}1_sum_embedding"] = df[f"{column}1"].apply(lambda tokens: get_sum_embeddings(tokens, w2v_google_model))
        df[f"{column}2_sum_embedding"] = df[f"{column}2"].apply(lambda tokens: get_sum_embeddings(tokens, w2v_google_model))

    embedding_columns = [
        ("lemma1_sum_embedding", "lemma2_sum_embedding"),
        ("token1_sum_embedding", "token2_sum_embedding")
    ] 

    df = compute_cosine_similarities(df, embedding_columns)

    return df


### Main Run

In [6]:
# Define folder paths
folder_path = "./data/prepped_stan_small"
output_file_directory = "outputW2V"

# Check if the output directory exists, create it if not
os.makedirs(output_file_directory, exist_ok=True)

# List all text files in the folder
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Aggregate the conversations into a single DataFrame
concatenated_text_files = aggregate_conversations(folder_path)

# Build the filtered vocabulary and save it to the output directory
vocab_all, vocab_filtered = build_filtered_vocab(concatenated_text_files, output_file_directory)

# Process each file and concatenate the results into a single DataFrame
concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file_for_W2V(file_path, vocab_filtered)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

Processing files: 100%|██████████| 2/2 [00:00<00:00, 34.29it/s]


In [7]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,...,tagged_stan_token2,tagged_stan_lemma1,tagged_stan_lemma2,utter_order,lemma1_sum_embedding,lemma2_sum_embedding,token1_sum_embedding,token2_sum_embedding,lemma_cosine_similarity,token_cosine_similarity
0,PC:,i thought that maybe that would do something,"[that, maybe, that, would, do, something]","[think, that, maybe, that, would, do, something]","[(i, NN), (thought, VBD), (that, IN), (maybe, ...","[(i, NN), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (thought, VBD), (that, IN), (maybe, ...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,...,"[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",PC: PA:,"[0.33276367, 0.18133545, 0.54364014, 1.277832,...","[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.37963867, 0.11444092, 0.53430176, 1.0141602...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...",0.861987,0.826057
1,PA:,i think you have to c maybe close it and it wi...,"[think, you, have, maybe, close, and, will, yo...","[think, you, have, maybe, close, and, will, yo...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,...,"[(should, MD), (i, FW), (start, VB), (this, DT...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(should, MD), (i, FW), (start, VB), (this, DT...",PA: PC:,"[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...",0.749735,0.751777
2,PC:,should i restart this it's like down there,"[start, this, like, down, there]","[start, this, like, down, there]","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,...,"[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",PC: PA:,"[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...",0.777175,0.777175
3,PA:,yeah i would just restart it,"[yeah, would, just, start]","[yeah, would, just, start]","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,...,"[(okay, JJ), (now, RB), (what, WP)]","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(okay, JJ), (now, RB), (what, WP)]",PA: PC:,"[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...",0.720251,0.720251
4,PC:,okay now what,"[okay, now, what]","[okay, now, what]","[(okay, RB), (now, RB), (what, WP)]","[(okay, RB), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,...,"[(you, PRP), (did, VBD), (not, RB), (close, VB...","[(okay, JJ), (now, RB), (what, WP)]","[(you, PRP), (do, VBP), (not, RB), (close, VB)...",PC: PA:,"[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40890503, -0.14001465, 0.30407715, 0.839355...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.28585815, -0.15283203, 0.28466797, 0.608886...",0.661706,0.616577


In [22]:
concatenated_df.shape

(65, 30)

In [23]:
# Identify and rename the semantic specific columns
df = concatenated_df.filter(like='embedding').join(concatenated_df.filter(like='cosine_similarity'))
df = df.rename(columns=lambda x: f"{x}_W2V")

# Merge these renamed columns back to the original dataframe
concatenated_df = concatenated_df.join(df)

In [24]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,...,token1_sum_embedding,token2_sum_embedding,lemma_cosine_similarity,token_cosine_similarity,lemma1_sum_embedding_W2V,lemma2_sum_embedding_W2V,token1_sum_embedding_W2V,token2_sum_embedding_W2V,lemma_cosine_similarity_W2V,token_cosine_similarity_W2V
0,PC:,i thought that maybe that would do something,"[that, maybe, that, would, do, something]","[think, that, maybe, that, would, do, something]","[(i, NN), (thought, VBD), (that, IN), (maybe, ...","[(i, NN), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (thought, VBD), (that, IN), (maybe, ...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,...,"[0.37963867, 0.11444092, 0.53430176, 1.0141602...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...",0.861987,0.826057,"[0.33276367, 0.18133545, 0.54364014, 1.277832,...","[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.37963867, 0.11444092, 0.53430176, 1.0141602...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...",0.861987,0.826057
1,PA:,i think you have to c maybe close it and it wi...,"[think, you, have, maybe, close, and, will, yo...","[think, you, have, maybe, close, and, will, yo...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,...,"[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...",0.749735,0.751777,"[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...",0.749735,0.751777
2,PC:,should i restart this it's like down there,"[start, this, like, down, there]","[start, this, like, down, there]","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,...,"[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...",0.777175,0.777175,"[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...",0.777175,0.777175
3,PA:,yeah i would just restart it,"[yeah, would, just, start]","[yeah, would, just, start]","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,...,"[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...",0.720251,0.720251,"[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...",0.720251,0.720251
4,PC:,okay now what,"[okay, now, what]","[okay, now, what]","[(okay, RB), (now, RB), (what, WP)]","[(okay, RB), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,...,"[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.28585815, -0.15283203, 0.28466797, 0.608886...",0.661706,0.616577,"[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40890503, -0.14001465, 0.30407715, 0.839355...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.28585815, -0.15283203, 0.28466797, 0.608886...",0.661706,0.616577


In [25]:
W2V_concatenated_df = concatenated_df.copy()

# End Word2Vec

# Begin BERT

### FUNCTION DEFINITIONS

In [8]:
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepares a DataFrame for further processing by creating new columns:
	•	utter1: Holds the content of the current utterance.
	•	utter2: Holds the content of the next utterance (shifted by -1).
	•	utter_order: Concatenates the current and next participant labels to track the order of interactions.
    """ 

    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

def get_embedding_with_cache(text):
    """
    Generates a BERT embedding for a given text while utilizing a cache to avoid redundant computations:
	•	  Checks if the embedding for the given text is already in the cache. If so, returns it.
	•	  If not cached, tokenizes the text, converts tokens to IDs, and feeds them to the BERT model to get the embedding.
	•	  The embedding is then averaged over all tokens and stored in the cache for future use.
    """ 

    if text is None:
      return None

    if text in embedding_cache:
      return embedding_cache[text]

    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    embedding = torch.mean(last_hidden_states, dim=1).numpy()
    embedding_cache[text] = embedding

    return embedding

def process_file(file_path, embedding_cache):            
  """
  Processes a single file to compute BERT embeddings for pairs of utterances and 
  calculates the cosine similarity between these embeddings:
	•	  Reads the file into a DataFrame.
	•	  Processes the input data using process_input_data.
	•	  Applies the get_embedding_with_cache function to each utterance pair 
      to compute embeddings.
	•	  Computes the cosine similarity between embeddings of successive utterances.
  """ 
  
  df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
  df = process_input_data(df)\

  for column in ["utter1", "utter2"]:
    df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)

  df["cosine_similarity"] = df.apply(
      lambda row: cosine_similarity(
          np.array(row["utter1_embedding"]).reshape(1, -1),
          np.array(row["utter2_embedding"]).reshape(1, -1)
          )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
      axis=1
    )

  return df


### Main Run

In [9]:
# Attempts to load a pre-existing embedding cache from a file (bert_embedding_cache.pkl).
	# •	If the file exists, it loads the cache using pickle, which allows previously computed 
    #   embeddings to be reused, saving computation time.
	# •	If the file does not exist (e.g., the first time the code is run), it initializes an 
    #   empty dictionary (embedding_cache = {}) to start building the cache from 
    #   scratch.embedding_cache_path = "data/bert_embedding_cache.pkl"
embedding_cache_path = "data/bert_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

# Path to the folder containing the text files
folder_path = "data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# This loop processes each text file one by one:
	# •	File Processing: For each file, it reads the content, generates BERT embeddings 
    #   for the utterances, and computes cosine similarities between consecutive utterances 
    #   using the process_file function.
	# •	Concatenation: The results from each file are concatenated into a single DataFrame 
    #   (concatenated_df) that will hold all the processed data. concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)


Processing files: 100%|██████████| 2/2 [00:02<00:00,  1.49s/it]


In [11]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,...,lemma2_sum_embedding,token1_sum_embedding,token2_sum_embedding,lemma_cosine_similarity,token_cosine_similarity,utter1,utter2,utter1_embedding,utter2_embedding,cosine_similarity
0,PC:,i thought that maybe that would do something,"[that, maybe, that, would, do, something]","[think, that, maybe, that, would, do, something]","[(i, NN), (thought, VBD), (that, IN), (maybe, ...","[(i, NN), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (thought, VBD), (that, IN), (maybe, ...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,...,"[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.37963867, 0.11444092, 0.53430176, 1.0141602...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...",0.861987,0.826057,,,,,
1,PA:,i think you have to c maybe close it and it wi...,"[think, you, have, maybe, close, and, will, yo...","[think, you, have, maybe, close, and, will, yo...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,...,"[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...",0.749735,0.751777,,,,,
2,PC:,should i restart this it's like down there,"[start, this, like, down, there]","[start, this, like, down, there]","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,...,"[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...",0.777175,0.777175,,,,,
3,PA:,yeah i would just restart it,"[yeah, would, just, start]","[yeah, would, just, start]","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,...,"[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...",0.720251,0.720251,,,,,
4,PC:,okay now what,"[okay, now, what]","[okay, now, what]","[(okay, RB), (now, RB), (what, WP)]","[(okay, RB), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,...,"[0.40890503, -0.14001465, 0.30407715, 0.839355...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.28585815, -0.15283203, 0.28466797, 0.608886...",0.661706,0.616577,,,,,


In [18]:
concatenated_df.shape

(3317, 15)

In [19]:
concatenated_df.columns

Index(['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'utter1', 'utter2', 'utter_order', 'utter1_embedding',
       'utter2_embedding', 'cosine_similarity'],
      dtype='object')

In [20]:
BERT_columns = ['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'BERT_utter1', 'BERT_utter2', 'BERT_utter_order', 'BERT_utter1_embedding',
       'BERT_utter2_embedding', 'BERT_cosine_similarity']

In [21]:
concatenated_df.columns = BERT_columns

In [22]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,BERT_utter1,BERT_utter2,BERT_utter_order,BERT_utter1_embedding,BERT_utter2_embedding,BERT_cosine_similarity
0,Operator female,tuscon police xxx,"['tucson', 'police', 'xxx']","['tucson', 'police', 'xxx']","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...",1-TUC_0.txt,tuscon police xxx,this is xxx,Operator female Operator 2 female,"[[-0.10026315, -0.19990648, -0.06480523, -0.07...","[[0.0770198, 0.01935324, 0.46077225, -0.279308...",0.624709
1,Operator 2 female,this is xxx,"['this', 'is', 'xxx']","['this', 'be', 'xxx']","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'JJ')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'JJ')]","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'NN')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'NN')]",1-TUC_0.txt,this is xxx,i need somebody here,Operator 2 female Caller male,"[[0.0770198, 0.01935324, 0.46077225, -0.279308...","[[0.6451, 0.094501376, 0.17763025, -0.04114826...",0.537029
2,Caller male,i need somebody here,"['i', 'need', 'somebody', 'here']","['i', 'need', 'somebody', 'here']","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...",1-TUC_0.txt,i need somebody here,sir hold on one second okay he saw a patient ...,Caller male Operator 2 female,"[[0.6451, 0.094501376, 0.17763025, -0.04114826...","[[-0.00607915, -0.12721325, 0.34245804, -0.006...",0.560584
3,Operator 2 female,sir hold on one second okay he saw a patient ...,"['sir', 'hold', 'on', 'one', 'second', 'okay',...","['sir', 'hold', 'on', 'one', 'second', 'okay',...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...",1-TUC_0.txt,sir hold on one second okay he saw a patient ...,okay hello does it look like he's been stabbed...,Operator 2 female Operator female,"[[-0.00607915, -0.12721325, 0.34245804, -0.006...","[[0.35690638, -0.1165302, 0.14006563, -0.24390...",0.778168
4,Operator female,okay hello does it look like he's been stabbed...,"['okay', 'hello', 'does', 'it', 'look', 'like'...","['okay', 'hello', 'do', 'it', 'look', 'like', ...","[('okay', 'JJ'), ('hello', 'NN'), ('does', 'VB...","[('okay', 'NN'), ('hello', 'NN'), ('do', 'VBP'...","[('okay', 'JJ'), ('hello', 'UH'), ('does', 'VB...","[('okay', 'JJ'), ('hello', 'UH'), ('do', 'VBP'...",1-TUC_0.txt,okay hello does it look like he's been stabbed...,honestly i couldn't i couldn't see anything i ...,Operator female Caller male,"[[0.35690638, -0.1165302, 0.14006563, -0.24390...","[[0.5303155, 0.13550542, 0.1008422, -0.0862038...",0.77086


In [23]:
BERT_concatenated_df = concatenated_df.copy()

# End BERT

# Begin GPT

In [12]:
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

default_embedding_engine = "text-embedding-ada-002"  
def get_embedding_with_cache(
    text: str,
    engine: str = default_embedding_engine
) -> list:

    if text is None:
        return None
    if (text, engine) not in embedding_cache.keys():

        embedding_cache[(text, engine)] = openai.embeddings.create(input=[text], model=engine).data[0].embedding
    return embedding_cache[(text, engine)]

def process_file(file_path, embedding_cache, default_embedding_engine):       
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)

    for column in ["utter1", "utter2"]:
        df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)

    df["cosine_similarity"] = df.apply(
        lambda row: cosine_similarity(
            np.array(row["utter1_embedding"]).reshape(1, -1),
            np.array(row["utter2_embedding"]).reshape(1, -1)
        )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
        axis=1
    )

    return df

##########

embedding_cache_path = "data/gpt_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

##########

folder_path = "./data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache, default_embedding_engine)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)



Processing files: 100%|██████████| 2/2 [00:17<00:00,  8.88s/it]


In [13]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order,utter1_embedding,utter2_embedding,cosine_similarity
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:,"[-0.03237445279955864, 0.0003060059098061174, ...","[0.003450230695307255, 0.003762950887903571, -...",0.80791
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:,"[0.003450230695307255, 0.003762950887903571, -...","[-0.0031581157818436623, -0.018044402822852135...",0.781013
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:,"[-0.0031581157818436623, -0.018044402822852135...","[-0.003446547780185938, -0.007897388190031052,...",0.862412
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:,"[-0.003446547780185938, -0.007897388190031052,...","[0.006002475507557392, -0.016771331429481506, ...",0.766299
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:,"[0.006002475507557392, -0.016771331429481506, ...","[0.00545839499682188, -0.00776617182418704, -0...",0.750938


In [27]:
concatenated_df.columns

Index(['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'utter1', 'utter2', 'utter_order', 'utter1_embedding',
       'utter2_embedding', 'cosine_similarity'],
      dtype='object')

In [28]:
GPT_columns = ['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'GPT_utter1', 'GPT_utter2', 'GPT_utter_order', 'GPT_utter1_embedding',
       'GPT_utter2_embedding', 'GPT_cosine_similarity']

In [29]:
concatenated_df.columns = GPT_columns

In [30]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,GPT_utter1,GPT_utter2,GPT_utter_order,GPT_utter1_embedding,GPT_utter2_embedding,GPT_cosine_similarity
0,Operator female,tuscon police xxx,"['tucson', 'police', 'xxx']","['tucson', 'police', 'xxx']","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...",1-TUC_0.txt,tuscon police xxx,this is xxx,Operator female Operator 2 female,"[-0.009749563410878181, 0.0005670702084898949,...","[-0.010334055870771408, -0.008830797858536243,...",0.808507
1,Operator 2 female,this is xxx,"['this', 'is', 'xxx']","['this', 'be', 'xxx']","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'JJ')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'JJ')]","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'NN')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'NN')]",1-TUC_0.txt,this is xxx,i need somebody here,Operator 2 female Caller male,"[-0.010334055870771408, -0.008830797858536243,...","[-0.03374534100294113, 0.005746633280068636, -...",0.806422
2,Caller male,i need somebody here,"['i', 'need', 'somebody', 'here']","['i', 'need', 'somebody', 'here']","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...",1-TUC_0.txt,i need somebody here,sir hold on one second okay he saw a patient ...,Caller male Operator 2 female,"[-0.03374534100294113, 0.005746633280068636, -...","[-0.014573554508388042, 0.0018402658170089126,...",0.757559
3,Operator 2 female,sir hold on one second okay he saw a patient ...,"['sir', 'hold', 'on', 'one', 'second', 'okay',...","['sir', 'hold', 'on', 'one', 'second', 'okay',...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...",1-TUC_0.txt,sir hold on one second okay he saw a patient ...,okay hello does it look like he's been stabbed...,Operator 2 female Operator female,"[-0.014573554508388042, 0.0018402658170089126,...","[-0.01353168860077858, 0.0013247638707980514, ...",0.77466
4,Operator female,okay hello does it look like he's been stabbed...,"['okay', 'hello', 'does', 'it', 'look', 'like'...","['okay', 'hello', 'do', 'it', 'look', 'like', ...","[('okay', 'JJ'), ('hello', 'NN'), ('does', 'VB...","[('okay', 'NN'), ('hello', 'NN'), ('do', 'VBP'...","[('okay', 'JJ'), ('hello', 'UH'), ('does', 'VB...","[('okay', 'JJ'), ('hello', 'UH'), ('do', 'VBP'...",1-TUC_0.txt,okay hello does it look like he's been stabbed...,honestly i couldn't i couldn't see anything i ...,Operator female Caller male,"[-0.01353168860077858, 0.0013247638707980514, ...","[-0.027948414906859398, 0.004147775005549192, ...",0.820494


In [31]:
GPT_concatenated_df = concatenated_df.copy()

# End GPT

# Begin Llama 2

The following model is Llama 2 version: 7 billion parameters, if you use CPU it would probably warn you (in such case we handeled it by try-except strategy) and if you have GPU it will automatically detect it and run it.

The 2-part model would be downloaded by the following code, and you will only download the models ONCE! it will remember that it has been already downloaded if you run it again. So, you only need to download the pretrained weights once.

In [34]:
try:
    model = LLaMAForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-chat-hf",
        load_in_8bit=True,
        device_map="auto",
        use_auth_token=token
    )
except ValueError as e:
    print(f"An error occurred: {e}")

    # Attempt to handle the error without accessing the model if it wasn't loaded
    print("The model could not be fully loaded on the GPU.")
    
    # Now try to load the model on CPU
    try:
        model = LLaMAForCausalLM.from_pretrained(
            "meta-llama/Llama-2-7b-chat-hf",
            load_in_8bit=False,  # Disable 8-bit loading if it causes issues
            device_map={"": "cpu"},  # Force loading on CPU
            use_auth_token=token
        )
        print("Model successfully loaded on CPU.")
    except Exception as e:
        print(f"Failed to load the model on CPU as well: {e}")

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


An error occurred: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you have set a value for `max_memory` you should increase that. To have
                        an idea of the modules that are set on the CPU or RAM you can print model.hf_device_map.
                        
The model could not be fully loaded on the GPU.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model successfully loaded on CPU.


In [35]:
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

In [36]:
default_embedding_engine = "LLaMA2"
def get_embedding_with_cache(
    text: str,
    embedding_cache: dict,
    tokenizer,
    model,
    engine: str = default_embedding_engine
) -> list:
    if text is None:
        return None
    if (text, engine) not in embedding_cache.keys():

        inputs = tokenizer(text, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()

        with torch.no_grad():
            outputs = model(input_ids=input_ids, output_hidden_states=True)

        hidden_states = outputs.hidden_states[-1]
        embedding = hidden_states.mean(dim=1).cpu().numpy().tolist()[0]

        embedding_cache[(text, engine)] = embedding
    return embedding_cache[(text, engine)]

In [37]:
def process_file(file_path, embedding_cache, tokenizer, model, default_embedding_engine):
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)

    for column in ["utter1", "utter2"]:
        df[f"{column}_embedding"] = df[column].apply(lambda x: get_embedding_with_cache(x, embedding_cache, tokenizer, model))

    df["cosine_similarity"] = df.apply(
        lambda row: cosine_similarity(
            np.array(row["utter1_embedding"]).reshape(1, -1),
            np.array(row["utter2_embedding"]).reshape(1, -1)
        )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
        axis=1
    )

    return df

In [38]:
embedding_cache_path = "./data/llama2_embedding_cache.pkl"
embedding_cache_dir = os.path.dirname(embedding_cache_path)

if not os.path.exists(embedding_cache_dir):
    os.makedirs(embedding_cache_dir)

try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

In [39]:
folder_path = "./data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

In [40]:
concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache, tokenizer, model, default_embedding_engine)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# concatenated_df.to_csv("concatenated_df_llama2.csv", index=False)

Processing files: 100%|██████████| 87/87 [00:28<00:00,  3.08it/s]


In [41]:
concatenated_df

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order,utter1_embedding,utter2_embedding,cosine_similarity
0,Operator female,tuscon police xxx,"['tucson', 'police', 'xxx']","['tucson', 'police', 'xxx']","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...",1-TUC_0.txt,tuscon police xxx,this is xxx,Operator female Operator 2 female,"[0.5087890625, -1.2724609375, 0.76220703125, 0...","[0.34619140625, -0.07781982421875, 0.751953125...",0.645413
1,Operator 2 female,this is xxx,"['this', 'is', 'xxx']","['this', 'be', 'xxx']","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'JJ')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'JJ')]","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'NN')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'NN')]",1-TUC_0.txt,this is xxx,i need somebody here,Operator 2 female Caller male,"[0.34619140625, -0.07781982421875, 0.751953125...","[1.3330078125, -0.22119140625, -0.256591796875...",0.445610
2,Caller male,i need somebody here,"['i', 'need', 'somebody', 'here']","['i', 'need', 'somebody', 'here']","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...",1-TUC_0.txt,i need somebody here,sir hold on one second okay he saw a patient ...,Caller male Operator 2 female,"[1.3330078125, -0.22119140625, -0.256591796875...","[1.45703125, -2.55078125, 0.83984375, -0.18359...",0.584490
3,Operator 2 female,sir hold on one second okay he saw a patient ...,"['sir', 'hold', 'on', 'one', 'second', 'okay',...","['sir', 'hold', 'on', 'one', 'second', 'okay',...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...",1-TUC_0.txt,sir hold on one second okay he saw a patient ...,okay hello does it look like he's been stabbed...,Operator 2 female Operator female,"[1.45703125, -2.55078125, 0.83984375, -0.18359...","[1.8779296875, -2.7109375, 1.240234375, 0.0852...",0.748699
4,Operator female,okay hello does it look like he's been stabbed...,"['okay', 'hello', 'does', 'it', 'look', 'like'...","['okay', 'hello', 'do', 'it', 'look', 'like', ...","[('okay', 'JJ'), ('hello', 'NN'), ('does', 'VB...","[('okay', 'NN'), ('hello', 'NN'), ('do', 'VBP'...","[('okay', 'JJ'), ('hello', 'UH'), ('does', 'VB...","[('okay', 'JJ'), ('hello', 'UH'), ('do', 'VBP'...",1-TUC_0.txt,okay hello does it look like he's been stabbed...,honestly i couldn't i couldn't see anything i ...,Operator female Caller male,"[1.8779296875, -2.7109375, 1.240234375, 0.0852...","[1.5244140625, -2.16015625, 1.0234375, 0.51464...",0.818535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3312,operator female,how many people assaulted him do you know,"['how', 'many', 'people', 'assaulted', 'him', ...","['how', 'many', 'people', 'assault', 'him', 'd...","[('how', 'WRB'), ('many', 'JJ'), ('people', 'N...","[('how', 'WRB'), ('many', 'JJ'), ('people', 'N...","[('how', 'WRB'), ('many', 'JJ'), ('people', 'N...","[('how', 'WRB'), ('many', 'JJ'), ('people', 'N...",9-TUC_0.txt,how many people assaulted him do you know,do not know i was at the front door for this n...,operator female caller male,"[2.9140625, -2.720703125, 1.287109375, -0.5629...","[2.150390625, -1.9609375, 0.298583984375, -0.2...",0.587910
3313,caller male,do not know i was at the front door for this n...,"['do', 'not', 'know', 'i', 'was', 'at', 'the',...","['do', 'not', 'know', 'i', 'be', 'at', 'the', ...","[('do', 'VB'), ('not', 'RB'), ('know', 'VB'), ...","[('do', 'VB'), ('not', 'RB'), ('know', 'VB'), ...","[('do', 'VB'), ('not', 'RB'), ('know', 'VB'), ...","[('do', 'VB'), ('not', 'RB'), ('know', 'VB'), ...",9-TUC_0.txt,do not know i was at the front door for this n...,okay sir we'll get someone out as soon as we can,caller male operator female,"[2.150390625, -1.9609375, 0.298583984375, -0.2...","[2.05078125, -2.91015625, 0.57666015625, 0.415...",0.563802
3314,operator female,okay sir we'll get someone out as soon as we can,"['okay', 'sir', 'we', 'will', 'get', 'someone'...","['okay', 'sir', 'we', 'will', 'get', 'someone'...","[('okay', 'JJ'), ('sir', 'NN'), ('we', 'PRP'),...","[('okay', 'JJ'), ('sir', 'NN'), ('we', 'PRP'),...","[('okay', 'JJ'), ('sir', 'NNP'), ('we', 'PRP')...","[('okay', 'JJ'), ('sir', 'NNP'), ('we', 'PRP')...",9-TUC_0.txt,okay sir we'll get someone out as soon as we can,thank you,operator female caller male,"[2.05078125, -2.91015625, 0.57666015625, 0.415...","[1.6240234375, -1.8408203125, 0.6923828125, -0...",0.372024
3315,caller male,thank you,"['thank', 'you']","['thank', 'you']","[('thank', 'NN'), ('you', 'PRP')]","[('thank', 'NN'), ('you', 'PRP')]","[('thank', 'VB'), ('you', 'PRP')]","[('thank', 'VB'), ('you', 'PRP')]",9-TUC_0.txt,thank you,you're welcome,caller male operator female,"[1.6240234375, -1.8408203125, 0.6923828125, -0...","[1.462890625, -2.310546875, 0.017913818359375,...",0.651379


In [42]:
concatenated_df[concatenated_df['cosine_similarity'] > 0.75].shape[0]

830

In [43]:
concatenated_df.columns

Index(['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'utter1', 'utter2', 'utter_order', 'utter1_embedding',
       'utter2_embedding', 'cosine_similarity'],
      dtype='object')

In [44]:
Llama_columns = ['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'Llama_utter1', 'Llama_utter2', 'Llama_utter_order', 'Llama_utter1_embedding',
       'Llama_utter2_embedding', 'Llama_cosine_similarity']

In [45]:
concatenated_df.columns = Llama_columns

In [46]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,Llama_utter1,Llama_utter2,Llama_utter_order,Llama_utter1_embedding,Llama_utter2_embedding,Llama_cosine_similarity
0,Operator female,tuscon police xxx,"['tucson', 'police', 'xxx']","['tucson', 'police', 'xxx']","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...","[('tucson', 'NN'), ('police', 'NN'), ('xxx', '...",1-TUC_0.txt,tuscon police xxx,this is xxx,Operator female Operator 2 female,"[0.5087890625, -1.2724609375, 0.76220703125, 0...","[0.34619140625, -0.07781982421875, 0.751953125...",0.645413
1,Operator 2 female,this is xxx,"['this', 'is', 'xxx']","['this', 'be', 'xxx']","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'JJ')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'JJ')]","[('this', 'DT'), ('is', 'VBZ'), ('xxx', 'NN')]","[('this', 'DT'), ('be', 'VB'), ('xxx', 'NN')]",1-TUC_0.txt,this is xxx,i need somebody here,Operator 2 female Caller male,"[0.34619140625, -0.07781982421875, 0.751953125...","[1.3330078125, -0.22119140625, -0.256591796875...",0.44561
2,Caller male,i need somebody here,"['i', 'need', 'somebody', 'here']","['i', 'need', 'somebody', 'here']","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'NNS'), ('need', 'VBP'), ('somebody', '...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...","[('i', 'LS'), ('need', 'MD'), ('somebody', 'NN...",1-TUC_0.txt,i need somebody here,sir hold on one second okay he saw a patient ...,Caller male Operator 2 female,"[1.3330078125, -0.22119140625, -0.256591796875...","[1.45703125, -2.55078125, 0.83984375, -0.18359...",0.58449
3,Operator 2 female,sir hold on one second okay he saw a patient ...,"['sir', 'hold', 'on', 'one', 'second', 'okay',...","['sir', 'hold', 'on', 'one', 'second', 'okay',...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NN'), ('hold', 'NN'), ('on', 'IN'), ...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...","[('sir', 'NNP'), ('hold', 'VBP'), ('on', 'IN')...",1-TUC_0.txt,sir hold on one second okay he saw a patient ...,okay hello does it look like he's been stabbed...,Operator 2 female Operator female,"[1.45703125, -2.55078125, 0.83984375, -0.18359...","[1.8779296875, -2.7109375, 1.240234375, 0.0852...",0.748699
4,Operator female,okay hello does it look like he's been stabbed...,"['okay', 'hello', 'does', 'it', 'look', 'like'...","['okay', 'hello', 'do', 'it', 'look', 'like', ...","[('okay', 'JJ'), ('hello', 'NN'), ('does', 'VB...","[('okay', 'NN'), ('hello', 'NN'), ('do', 'VBP'...","[('okay', 'JJ'), ('hello', 'UH'), ('does', 'VB...","[('okay', 'JJ'), ('hello', 'UH'), ('do', 'VBP'...",1-TUC_0.txt,okay hello does it look like he's been stabbed...,honestly i couldn't i couldn't see anything i ...,Operator female Caller male,"[1.8779296875, -2.7109375, 1.240234375, 0.0852...","[1.5244140625, -2.16015625, 1.0234375, 0.51464...",0.818535


In [47]:
Llama_concatenated_df = concatenated_df.copy()

# End Llama 2

# Merging Output
the output would be a single spreadsheet with the cosine/ALIGN scores along four columns, corresponding to each method.

In [48]:
W2V_BERT_GPT_Llama_concatenated_df = pd.concat([W2V_concatenated_df, BERT_concatenated_df, GPT_concatenated_df, Llama_concatenated_df], axis=1)

In [49]:
W2V_BERT_GPT_Llama_concatenated_df = W2V_BERT_GPT_Llama_concatenated_df.loc[:, ~W2V_BERT_GPT_Llama_concatenated_df.columns.duplicated()]

In [50]:
W2V_BERT_GPT_Llama_concatenated_df.columns

Index(['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file',
       'W2V_content1', 'W2V_content2', 'W2V_token1', 'W2V_token2',
       'W2V_lemma1', 'W2V_lemma2', 'W2V_tagged_token1', 'W2V_tagged_token2',
       'W2V_tagged_lemma1', 'W2V_tagged_lemma2', 'W2V_tagged_stan_token1',
       'W2V_tagged_stan_token2', 'W2V_tagged_stan_lemma1',
       'W2V_tagged_stan_lemma2', 'W2V_utter_order', 'W2V_lemma1_sum_embedding',
       'W2V_lemma2_sum_embedding', 'W2V_token1_sum_embedding',
       'W2V_token2_sum_embedding', 'W2V_lemma_cosine_similarity',
       'W2V_token_cosine_similarity', 'BERT_utter1', 'BERT_utter2',
       'BERT_utter_order', 'BERT_utter1_embedding', 'BERT_utter2_embedding',
       'BERT_cosine_similarity', 'GPT_utter1', 'GPT_utter2', 'GPT_utter_order',
       'GPT_utter1_embedding', 'GPT_utter2_embedding', 'GPT_cosine_similarity',
       'Llama_utter1', 'Llama_utter2', 'Llama_utter_order',
       '

In [51]:
W2V_BERT_GPT_Llama_concatenated_df = W2V_BERT_GPT_Llama_concatenated_df[['participant', 'content', 'token', 'lemma', 'tagged_token',
       'tagged_lemma', 'tagged_stan_token', 'tagged_stan_lemma', 'file', 'W2V_lemma_cosine_similarity',
       'W2V_token_cosine_similarity', 'BERT_cosine_similarity', 'GPT_cosine_similarity', 'Llama_cosine_similarity']]

In [52]:
W2V_BERT_GPT_Llama_concatenated_df

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,W2V_lemma_cosine_similarity,W2V_token_cosine_similarity,BERT_cosine_similarity,GPT_cosine_similarity,Llama_cosine_similarity
0,Operator female,tuscon police xxx,"[tucson, police, xxx]","[tucson, police, xxx]","[(tucson, NN), (police, NN), (xxx, NN)]","[(tucson, NN), (police, NN), (xxx, NN)]","[(tucson, NN), (police, NN), (xxx, NN)]","[(tucson, NN), (police, NN), (xxx, NN)]",1-TUC_0.txt,0.718296,0.718296,0.624709,0.808507,0.645413
1,Operator 2 female,this is xxx,"[this, xxx]","[this, xxx]","[(this, DT), (is, VBZ), (xxx, JJ)]","[(this, DT), (be, VB), (xxx, JJ)]","[(this, DT), (is, VBZ), (xxx, NN)]","[(this, DT), (be, VB), (xxx, NN)]",1-TUC_0.txt,0.239677,0.239677,0.537029,0.806422,0.445610
2,Caller male,i need somebody here,"[need, somebody, here]","[need, somebody, here]","[(i, NNS), (need, VBP), (somebody, NN), (here,...","[(i, NNS), (need, VBP), (somebody, NN), (here,...","[(i, LS), (need, MD), (somebody, NN), (here, RB)]","[(i, LS), (need, MD), (somebody, NN), (here, RB)]",1-TUC_0.txt,0.693040,0.669003,0.560584,0.757559,0.584490
3,Operator 2 female,sir hold on one second okay he saw a patient ...,"[sir, hold, one, second, saw, patient, in, roa...","[sir, hold, one, second, saw, patient, in, roa...","[(sir, NN), (hold, NN), (on, IN), (one, CD), (...","[(sir, NN), (hold, NN), (on, IN), (one, CD), (...","[(sir, NNP), (hold, VBP), (on, IN), (one, CD),...","[(sir, NNP), (hold, VBP), (on, IN), (one, CD),...",1-TUC_0.txt,0.675061,0.564028,0.778168,0.774660,0.748699
4,Operator female,okay hello does it look like he's been stabbed...,"[hello, look, like, or, shot, or]","[hello, look, like, or, shot, or, go]","[(okay, JJ), (hello, NN), (does, VBZ), (it, PR...","[(okay, NN), (hello, NN), (do, VBP), (it, PRP)...","[(okay, JJ), (hello, UH), (does, VBZ), (it, PR...","[(okay, JJ), (hello, UH), (do, VBP), (it, PRP)...",1-TUC_0.txt,0.703610,0.670530,0.770860,0.820494,0.818535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3312,operator female,how many people assaulted him do you know,"[how, many, people, him, know]","[how, many, people, assault, him, know]","[(how, WRB), (many, JJ), (people, NNS), (assau...","[(how, WRB), (many, JJ), (people, NNS), (assau...","[(how, WRB), (many, JJ), (people, NNS), (assau...","[(how, WRB), (many, JJ), (people, NNS), (assau...",9-TUC_0.txt,0.534133,0.537692,0.617103,0.721233,0.587910
3313,caller male,do not know i was at the front door for this n...,"[know, at, front, door, for, this, now, they, ...","[know, at, front, door, for, this, now, they, ...","[(do, VB), (not, RB), (know, VB), (i, NN), (wa...","[(do, VB), (not, RB), (know, VB), (i, RB), (be...","[(do, VB), (not, RB), (know, VB), (i, FW), (wa...","[(do, VB), (not, RB), (know, VB), (i, FW), (be...",9-TUC_0.txt,0.616518,0.622951,0.623789,0.755941,0.563802
3314,operator female,okay sir we'll get someone out as soon as we can,"[sir, we, will, get, someone, out, as, soon, a...","[sir, we, will, get, someone, out, as, soon, w...","[(okay, JJ), (sir, NN), (we, PRP), (will, MD),...","[(okay, JJ), (sir, NN), (we, PRP), (will, MD),...","[(okay, JJ), (sir, NNP), (we, PRP), (will, MD)...","[(okay, JJ), (sir, NNP), (we, PRP), (will, MD)...",9-TUC_0.txt,0.211888,0.202332,0.483212,0.768013,0.372024
3315,caller male,thank you,[thank],[thank],"[(thank, NN), (you, PRP)]","[(thank, NN), (you, PRP)]","[(thank, VB), (you, PRP)]","[(thank, VB), (you, PRP)]",9-TUC_0.txt,0.347395,0.347395,0.747736,0.904503,0.651379


In [53]:
W2V_BERT_GPT_Llama_concatenated_df[W2V_BERT_GPT_Llama_concatenated_df['W2V_lemma_cosine_similarity'] > 0.75].shape[0]

395

In [54]:
W2V_BERT_GPT_Llama_concatenated_df[W2V_BERT_GPT_Llama_concatenated_df['W2V_token_cosine_similarity'] > 0.75].shape[0]

353

In [55]:
W2V_BERT_GPT_Llama_concatenated_df[W2V_BERT_GPT_Llama_concatenated_df['BERT_cosine_similarity'] > 0.75].shape[0]

481

In [56]:
W2V_BERT_GPT_Llama_concatenated_df[W2V_BERT_GPT_Llama_concatenated_df['GPT_cosine_similarity'] > 0.75].shape[0]

2495

In [57]:
W2V_BERT_GPT_Llama_concatenated_df[W2V_BERT_GPT_Llama_concatenated_df['Llama_cosine_similarity'] > 0.75].shape[0]

830