In [None]:
import os
from os import listdir
from os.path import isfile, join

import pickle
import random

import pandas as pd
import numpy as np

from tqdm import tqdm  # for progress bars

from sklearn.metrics.pairwise import cosine_similarity

## for integrating with BERT
import torch
from transformers import BertTokenizer, BertModel

# Load the pretrained tokenizer and encoder for the 'bert-base-uncased' checkpoint.
# Both names are module-level so later cells can use them.
# NOTE(review): in the cells visible here nothing references `tokenizer` or `model` yet —
# the embedding helper below still calls the OpenAI API; confirm whether that is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# https://www.geeksforgeeks.org/how-to-generate-word-embedding-using-bert/


## for integrating with LLAMA
# import transformers
# import torch

# model_id = "meta-llama/Meta-Llama-3-8B"

# pipeline = transformers.pipeline( 
#     "text-generation", model=model_id, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto"
# )

# from transformers import AutoTokenizer, AutoModelForCausalLM
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-Guard-2-8B")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-Guard-2-8B")


In [None]:
# Function to get lagged conversational turns
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    """Pair every conversational turn with the turn that follows it.

    Args:
        df: Frame with a 'content' column (the utterance) and a
            'participant' column (the speaker label), one row per turn.

    Returns:
        A new frame holding all original columns plus:
          * 'utter1'      - the row's own 'content'
          * 'utter2'      - the next row's 'content' (missing on the last row)
          * 'utter_order' - "<participant> <next participant>" (missing on the last row)

    The input frame is left untouched; callers must use the return value.
    """
    utterances = df['content']
    speakers = df['participant']
    # assign() works on a copy, so the caller's frame is not mutated as a side effect.
    return df.assign(
        utter1=utterances,
        utter2=utterances.shift(-1),  # shift(-1) pulls the following row up
        utter_order=speakers + ' ' + speakers.shift(-1),
    )

### NEEDS TO BE UPDATED FOR BERT ###
# Function to compute embeddings, but first checks if already in cache and if not, add them there afterward
# NOTE(review): the cache-miss path still calls the OpenAI API, yet no `openai` import is visible
# in this notebook (only BERT is loaded) — confirm, or port the lookup to the BERT model.
default_embedding_engine = "text-embedding-ada-002"  # text-embedding-ada-002 is recommended
def get_embedding_with_cache(
    text: str,
    engine: str = default_embedding_engine
) -> list:
    """Return the embedding for `text`, consulting the module-level cache first.

    Args:
        text: Utterance to embed. Missing values (None or NaN) are skipped.
        engine: Embedding model name; it is also part of the cache key, so the
            same text embedded with different engines is cached separately.

    Returns:
        The embedding stored in / fetched into `embedding_cache`, or None when
        `text` is missing.

    Side effects:
        On a cache miss, adds the new embedding to the module-level
        `embedding_cache` dict under the key `(text, engine)`.
    """
    # Skip if there is no text content for computing embedding.
    # pd.isna also catches the NaN that shift(-1) leaves on the last turn,
    # which a plain `is None` check lets through to the API call.
    if pd.isna(text):
        return None
    cache_key = (text, engine)
    if cache_key not in embedding_cache:
        # if not in cache, call API to get embedding
        embedding_cache[cache_key] = openai.embeddings.create(input=[text], model=engine).data[0].embedding
    return embedding_cache[cache_key]

# Function to process and get embeddings/cosines for a single file
def process_file(file_path, embedding_cache, default_embedding_engine):
    """Load one transcript file and score the similarity of adjacent turns.

    Args:
        file_path: Path to a tab-separated, UTF-8 file that provides the
            columns `process_input_data` reads.
        embedding_cache: Kept for interface compatibility. It is not read here:
            `get_embedding_with_cache` uses the module-level cache of the same name.
        default_embedding_engine: Engine name forwarded to
            `get_embedding_with_cache` for every embedding lookup.

    Returns:
        The frame from `process_input_data` with three extra columns:
        'utter1_embedding', 'utter2_embedding' and 'cosine_similarity'.
        The similarity is None for rows where either embedding is missing.
    """
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)

    # Create column of embeddings, honouring the engine passed by the caller
    # instead of silently falling back to the helper's default.
    for column in ["utter1", "utter2"]:
        df[f"{column}_embedding"] = df[column].apply(
            lambda text: get_embedding_with_cache(text, engine=default_embedding_engine)
        )

    def _pair_similarity(row):
        """Cosine similarity of one row's two embeddings, or None if either is missing."""
        first, second = row["utter1_embedding"], row["utter2_embedding"]
        if first is None or second is None:
            return None
        # cosine_similarity expects 2-D inputs, hence the (1, -1) reshape.
        return cosine_similarity(
            np.array(first).reshape(1, -1),
            np.array(second).reshape(1, -1)
        )[0][0]

    # Create column of cosine similarity
    df["cosine_similarity"] = df.apply(_pair_similarity, axis=1)

    return df

##########

# Load or initialize the embedding cache, if none there (first time running), then create empty cache to build
# The cache is a dict that get_embedding_with_cache fills with (text, engine) -> embedding entries.
embedding_cache_path = "data/bert_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        # NOTE(review): pickle.load can execute arbitrary code — only load cache files
        # written by this notebook, never ones from an untrusted source.
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    # Only a missing file is handled; any other load error will propagate.
    embedding_cache = {}

##########

# Path to the folder containing the text files
folder_path = "./data/prepped_stan_small"
# Sorted so the rows of the combined frame come out in the same order on every run
# (os.listdir returns entries in arbitrary order).
text_files = sorted(
    f for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')
)

# Process each file and update the cache.
# Frames are collected in a list and concatenated once at the end; concatenating inside
# the loop re-copies every previously processed row on each iteration.
processed_frames = []
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache, default_embedding_engine)
    processed_frames.append(df)

    # Save the updated embedding cache to disk after processing each file.
    # Write to a temporary file and swap it in, so an interrupted run cannot leave
    # a truncated pickle at embedding_cache_path.
    embedding_cache_tmp_path = embedding_cache_path + ".tmp"
    with open(embedding_cache_tmp_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)
    os.replace(embedding_cache_tmp_path, embedding_cache_path)

# concatenated_df now contains all the processed data
# (an empty frame when the folder holds no .txt files, since pd.concat rejects an empty list).
concatenated_df = (
    pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()
)

# concatenated_df now contains all the processed data

In [None]:
# Show the combined per-file results as this cell's output.
concatenated_df