In [1]:
import os
from os import listdir
from os.path import isfile, join

import pickle

import pandas as pd
import numpy as np

from tqdm import tqdm  # for progress bars

from sklearn.metrics.pairwise import cosine_similarity

import openai
# openai.api_key = "INSERT HERE"


In [4]:
# Function to get lagged conversational turns
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

# Function to compute embeddings, but first checks if already in cache and if not, add them there afterward
default_embedding_engine = "text-embedding-ada-002"  # text-embedding-ada-002 is recommended
def get_embedding_with_cache(
    text: str,
    engine: str = default_embedding_engine
) -> list:
    # Skip if there is no text content for computing embedding
    if text is None:
        return None
    if (text, engine) not in embedding_cache.keys():
        # if not in cache, call API to get embedding
        embedding_cache[(text, engine)] = openai.embeddings.create(input=[text], model=engine).data[0].embedding
    return embedding_cache[(text, engine)]

# Function to process and get embeddings/cosines for a single file
def process_file(file_path, embedding_cache, default_embedding_engine):       
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)

   # Create column of embeddings
    for column in ["utter1", "utter2"]:
        df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)
    
    # Create column of cosine similiarity
    df["cosine_similarity"] = df.apply(
        lambda row: cosine_similarity(
            np.array(row["utter1_embedding"]).reshape(1, -1),
            np.array(row["utter2_embedding"]).reshape(1, -1)
        )[0][0] if row["utter1_embedding"] is not None and row["utter2_embedding"] is not None else None,
        axis=1
    )

    return df

##########

# Load or initialize the embedding cache, if none there (first time running), then create empty cache to build
embedding_cache_path = "data/gpt_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

##########

# Path to the folder containing the text files
folder_path = "./data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Process each file and update the cache
concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache, default_embedding_engine)
    concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)

    # Save the updated embedding cache to disk after processing each file
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# concatenated_df now contains all the processed data

Processing files: 100%|██████████| 2/2 [00:20<00:00, 10.21s/it]


In [5]:
concatenated_df

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order,utter1_embedding,utter2_embedding,cosine_similarity
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:,"[-0.032679855823516846, 0.00039228188688866794...","[0.0034637455828487873, 0.003785194829106331, ...",0.807814
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:,"[0.0034637455828487873, 0.003785194829106331, ...","[-0.0030778965447098017, -0.018122322857379913...",0.781031
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:,"[-0.0030778965447098017, -0.018122322857379913...","[-0.003490022150799632, -0.007978125475347042,...",0.862541
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:,"[-0.003490022150799632, -0.007978125475347042,...","[0.006083404179662466, -0.016767257824540138, ...",0.766547
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:,"[0.006083404179662466, -0.016767257824540138, ...","[0.005405202973634005, -0.007813462056219578, ...",0.750888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,PC:,yeah but i think it might have gotten stuck now,"['yeah', 'but', 'i', 'think', 'it', 'might', '...","['yeah', 'but', 'i', 'think', 'it', 'might', '...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah but i think it might have gotten stuck now,if you hit the space bar i think it restarts it,PC: PA:,"[-0.03798016533255577, -0.010394293814897537, ...","[-0.015708738937973976, -0.0154748959466815, -...",0.773564
61,PA:,if you hit the space bar i think it restarts it,"['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...",ASU-T10_ExpBlock1-Oneatatime.txt,if you hit the space bar i think it restarts it,we got a gold one,PA: PC:,"[-0.015708738937973976, -0.0154748959466815, -...","[-0.008633834309875965, 0.005616842769086361, ...",0.701603
62,PC:,we got a gold one,"['we', 'got', 'a', 'gold', 'one']","['we', 'get', 'a', 'gold', 'one']","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,we got a gold one,sweet come on,PC: PB:,"[-0.008633834309875965, 0.005616842769086361, ...","[0.011414420790970325, -0.02576693333685398, 0...",0.770138
63,PB:,sweet come on,"['sweet', 'come', 'on']","['sweet', 'come', 'on']","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]",ASU-T10_ExpBlock1-Oneatatime.txt,sweet come on,i don't know what that means but gold is good,PB: PA:,"[0.011414420790970325, -0.02576693333685398, 0...","[-0.004764971323311329, -0.008828151039779186,...",0.774610
