### Setup Environment

In [1]:
# Get the requirements.txt file
# Add the OpenAI API key as an environment variable

API Key loaded successfully!


### Import Necessary Libraries

In [2]:
import os  # Provides a way to use operating system dependent functionality like reading or writing to the file system
import pickle
import pandas as pd  # Powerful data structures for data analysis, time series, and statistics
import numpy as np  # Support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays
from tqdm import tqdm  # For progress bars
from sklearn.metrics.pairwise import cosine_similarity  # Computes the cosine similarity between samples in a matrix

# For W2V specifically
import re  # Provides regular expression matching operations
import ast  # Abstract Syntax Trees, used for parsing and analyzing Python source code
from collections import Counter  # Provides a way to count the frequency of elements in a collection
import gensim  # Library for unsupervised topic modeling and natural language processing
import gensim.downloader as api  # Downloads and loads pre-trained models and datasets
# from gensim.models import KeyedVectors  # Provides efficient word vector representation and storage

# For BERT specifically 
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# For GPT specifically
import openai
# Retrieve the API key from the environment variable
api_key = os.getenv("API_KEY")
openai.api_key = api_key

# For LLAMA specifically
# Going to skip LLAMA for now


  from .autonotebook import tqdm as notebook_tqdm


### Common functions

In [None]:
def pair_and_lag_columns(df: pd.DataFrame, columns_to_lag: list, suffix1: str = '1', suffix2: str = '2') -> pd.DataFrame:
    """
    Pair every row with the row that follows it.

    For each name in `columns_to_lag` that is present in `df`, two columns are
    written: `<name><suffix1>` with the row's own value and `<name><suffix2>`
    with the value taken from the next row (`shift(-1)`), so the last row's
    lagged value is missing. Names that are not columns of `df` are skipped.

    An `utter_order` column is also written: the row's `participant` value and
    the next row's `participant` value joined by a single space.

    The new columns are added to `df` itself and that same object is returned.
    """
    for name in columns_to_lag:
        if name not in df.columns:
            continue
        current_values = df[name]
        df["{}{}".format(name, suffix1)] = current_values
        df["{}{}".format(name, suffix2)] = current_values.shift(-1)

    speakers = df['participant']
    df['utter_order'] = speakers + ' ' + speakers.shift(-1)
    return df

def calculate_cosine_similarity(df: pd.DataFrame, embedding_pairs: list) -> pd.DataFrame:
    """
    Add one cosine-similarity column per pair of embedding columns.

    Parameters
    ----------
    df : DataFrame whose cells in the named columns hold one embedding each
        (any array-like that `np.asarray(...).reshape(1, -1)` can flatten),
        or a missing value.
    embedding_pairs : list of `(col1, col2)` column-name tuples.

    Returns
    -------
    The same `df` object, with a float column named
    `<col1>_<col2>_cosine_similarity` added for every pair. A row gets NaN
    when either embedding is missing.

    A cell counts as missing when it is `None` or a scalar NaN. The NaN case
    matters because lagged columns built with `shift(-1)` end in NaN, which an
    `is not None` test lets through to `cosine_similarity`.
    """
    def _is_missing(value) -> bool:
        # Only scalars can be "missing"; real embeddings are array-likes.
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    def _pair_similarity(first, second):
        if _is_missing(first) or _is_missing(second):
            return None
        return cosine_similarity(
            np.asarray(first).reshape(1, -1),
            np.asarray(second).reshape(1, -1)
        )[0][0]

    for col1, col2 in embedding_pairs:
        similarities = [_pair_similarity(a, b) for a, b in zip(df[col1], df[col2])]
        similarity_column_name = f"{col1}_{col2}_cosine_similarity"
        # Build on df.index so the values stay aligned with their rows.
        df[similarity_column_name] = pd.Series(similarities, index=df.index, dtype="float64")
    return df

# BEGIN WORD2VEC

#### Steps: Load in models → Aggregate conversations → Build vocabulary → Pair and lag columns → Compute embeddings → Compute cosine similarities.

### SETUP for W2V

In [3]:
# Use the current working directory (where the notebook kernel is running) as the base location.
script_dir = os.getcwd()

# Create a "gensim-data" directory under it; exist_ok=True makes this a no-op if it already exists.
local_cache_dir = os.path.join(script_dir, "gensim-data")
os.makedirs(local_cache_dir, exist_ok=True)
print(f"Local cache directory: {local_cache_dir}")

# Point the gensim downloader's BASE_DIR at local_cache_dir so models are downloaded to and stored in it.
api.BASE_DIR = local_cache_dir
print(f"Gensim BASE_DIR set to: {api.BASE_DIR}")

# checks if specified models are already downloaded to cache directory, if not, download them
def download_and_cache_models(models, cache_dir):
    """
    Make sure each named gensim-data model is present under `cache_dir`.

    Parameters
    ----------
    models : iterable of gensim-data model names.
    cache_dir : directory used as the gensim downloader's `BASE_DIR`; a model
        is treated as cached when `<cache_dir>/<model_name>` exists.

    Returns
    -------
    None. Progress and failures are reported with `print`; a failed download
    does not stop the remaining models from being attempted.
    """
    # The gensim downloader takes its storage location from this module attribute.
    api.BASE_DIR = cache_dir
    for model_name in models:
        model_path = os.path.join(cache_dir, model_name)
        if os.path.exists(model_path):
            print(f"Model {model_name} already exists at: {model_path}")
            continue
        try:
            print(f"Downloading model: {model_name}")
            # return_path=True fetches the files without also loading the
            # (multi-gigabyte) vectors into memory only to discard them.
            api.load(model_name, return_path=True)
            print(f"Downloaded and cached model: {model_name}")
        except Exception as e:
            # Deliberately best-effort: report and move on to the next model.
            print(f"Error downloading {model_name}: {e}")

# Names of the models to keep in the local cache; download_and_cache_models
# skips any whose directory already exists under local_cache_dir.
models_to_cache = ['word2vec-google-news-300', 'glove-twitter-200']
download_and_cache_models(models_to_cache, local_cache_dir)

# attempts to load a model from specified file path
def load_model_if_not_exists(model_path, binary=True):
    """
    Load word vectors stored in word2vec format at `model_path`.

    Returns the object produced by
    `gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=binary)`.
    If the path does not exist, or anything else goes wrong while loading,
    the error is printed and `None` is returned instead of raising.
    """
    loaded_vectors = None
    try:
        if os.path.exists(model_path):
            loaded_vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=binary)
        else:
            raise FileNotFoundError(f"Model file {model_path} does not exist.")
    except Exception as err:
        print(f"Error loading model from {model_path}: {err}")
    return loaded_vectors

# Load the Google News vectors from the local cache unless a usable model is
# already bound in the notebook's global namespace. Testing for `None` (rather
# than only for the name's presence) lets the cell retry after a failed load,
# since a failed load leaves the name bound to None.
if globals().get('w2v_google_model') is None:
    w2v_google_model_path = os.path.join(local_cache_dir, 'word2vec-google-news-300', 'word2vec-google-news-300.gz')
    w2v_google_model = load_model_if_not_exists(w2v_google_model_path, binary=True)
    if w2v_google_model is not None:
        print("Word2Vec Google News model loaded from local cache successfully.")
    else:
        print("Failed to load Word2Vec Google News model.")

# note: possible todo: is it more efficient to use gensim.downloader.load(model_name)?
# note, downloading model, it downloads properly, but also throws the exception warning for some reason. 
# TODO: instead of just loading google news model into global workspace, load in all within "models_to_cache"


Local cache directory: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Gensim BASE_DIR set to: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Model word2vec-google-news-300 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/word2vec-google-news-300
Model glove-twitter-200 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/glove-twitter-200
Word2Vec Google News model loaded from local cache successfully.


### Aggregate conversations → Build vocabulary

In [4]:
def aggregate_conversations(folder_path: str) -> pd.DataFrame:
    """
    Read every `.txt` file directly inside `folder_path` as a tab-separated,
    UTF-8 table and stack them into one DataFrame.

    Files are read in sorted filename order so the row order is reproducible
    (`os.listdir` itself returns names in arbitrary order). The result has a
    fresh 0..n-1 index. If the folder contains no `.txt` files, an empty
    DataFrame is returned.
    """
    text_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.txt') and os.path.isfile(os.path.join(folder_path, f))
    )

    # Read everything first and concatenate once; growing a frame with
    # pd.concat inside the loop re-copies all earlier rows on every file.
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name), sep='\t', encoding='utf-8')
        for file_name in text_files
    ]
    if not frames:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

def build_filtered_vocab(data: pd.DataFrame, output_file_directory: str, high_sd_cutoff: float = 3, low_n_cutoff: int = 1):
    """
    Build a word-frequency vocabulary from the `lemma` column of `data` and
    apply frequency-based filtering.

    Each `lemma` cell is converted to a string, stripped of every character
    that is neither a word character nor whitespace, and split on whitespace.
    Missing cells (None / NaN) are skipped so they are not counted as the
    literal words "None" or "nan".

    Filtering keeps a word when:
      * it is longer than one character,
      * it occurs more than `low_n_cutoff` times, and
      * (if `high_sd_cutoff` is not None) its count is below
        mean + `high_sd_cutoff` * standard deviation of the counts that
        survived the first two rules.

    Parameters
    ----------
    data : DataFrame with a `lemma` column.
    output_file_directory : directory for the two frequency tables; created
        if it does not exist.
    high_sd_cutoff : number of standard deviations above the mean at which a
        word is considered too frequent; None disables this rule.
    low_n_cutoff : words must occur more often than this to be kept.

    Returns
    -------
    (all_words, filtered_words) : two lists of words.

    Side effects
    ------------
    Writes `vocab_unfilt_freqs.txt` and `vocab_filt_freqs.txt` (tab-separated
    `word`/`count` tables sorted by descending count) into
    `output_file_directory`.
    """
    def _is_missing(value) -> bool:
        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

    all_sentences = [
        re.sub(r'[^\w\s]+', '', str(row)).split()
        for row in data['lemma']
        if not _is_missing(row)
    ]
    frequency = Counter(word for sentence in all_sentences for word in sentence)

    frequency_filt = {word: freq for word, freq in frequency.items() if len(word) > 1 and freq > low_n_cutoff}

    # With nothing left after the low-frequency rule there is no mean/std to
    # compute (np.mean of an empty list warns and yields NaN).
    if high_sd_cutoff is not None and frequency_filt:
        counts = list(frequency_filt.values())
        cutoff_freq = np.mean(counts) + (np.std(counts) * high_sd_cutoff)
        filteredWords = {word: freq for word, freq in frequency_filt.items() if freq < cutoff_freq}
    else:
        filteredWords = frequency_filt

    vocabfreq_all = pd.DataFrame(list(frequency.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)
    vocabfreq_filt = pd.DataFrame(list(filteredWords.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)

    # An empty string means "current directory", which needs no creation.
    if output_file_directory:
        os.makedirs(output_file_directory, exist_ok=True)
    vocabfreq_all.to_csv(os.path.join(output_file_directory, 'vocab_unfilt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    vocabfreq_filt.to_csv(os.path.join(output_file_directory, 'vocab_filt_freqs.txt'), encoding='utf-8', index=False, sep='\t')

    return list(frequency.keys()), list(filteredWords.keys())

def is_list_like_column(series):
    """
    Report whether every value in a pandas Series looks like a stringified list.

    A value qualifies when, after stripping surrounding whitespace, it starts
    with "[". If any value has no `strip` method (for example a number or a
    NaN), the resulting AttributeError is caught and False is returned for
    the whole Series.
    """
    def _starts_with_bracket(value):
        return value.strip().startswith("[")

    try:
        bracket_flags = series.apply(_starts_with_bracket)
        return bracket_flags.all()
    except AttributeError:
        return False

def convert_columns_to_lists(df: pd.DataFrame) -> tuple:
    """
    Parse every column of stringified lists into real Python objects.

    A column is converted when `is_list_like_column` accepts it; each of its
    values is then parsed with `ast.literal_eval`. The conversion is written
    back into `df` itself.

    Returns
    -------
    (df, columns_converted) : the same DataFrame object and the list of column
        names that were converted, in column order.
    """

    columns_converted = []
    for col in df.columns:
        if is_list_like_column(df[col]):
            df[col] = df[col].apply(ast.literal_eval)
            columns_converted.append(col)
    return df, columns_converted


### Pair and lag columns → Compute embeddings → Compute cosine similarities.

In [6]:
def get_sum_embeddings(token_list, model):
    """
    Sum the word vectors of the tokens that `model` knows.

    Parameters
    ----------
    token_list : iterable of words, or a missing value (None / scalar NaN).
        Lagged columns built with `shift(-1)` end in NaN, so a NaN is
        treated the same as None instead of being iterated over.
    model : object exposing a `key_to_index` mapping and word lookup via
        `model[word]` (the notebook passes gensim KeyedVectors).

    Returns
    -------
    The element-wise sum of the vectors for every token present in
    `model.key_to_index`, or None when `token_list` is missing or none of
    its tokens are in the model.
    """
    if pd.api.types.is_scalar(token_list) and pd.isna(token_list):
        return None

    embeddings = [model[word] for word in token_list if word in model.key_to_index]
    if not embeddings:
        return None
    return np.sum(embeddings, axis=0)
    
def process_file_for_W2V(file_path, vocab_list: list, model=None):
    """
    Run the Word2Vec alignment pipeline on one conversation file.

    Steps:
      1. Read the tab-separated, UTF-8 file at `file_path`.
      2. Convert stringified-list columns to lists (`convert_columns_to_lists`).
      3. Keep only words found in `vocab_list` in the `lemma` and `token` columns.
      4. Pair each row with the next one for `content`, `token` and `lemma`
         (`pair_and_lag_columns`).
      5. Sum word vectors for `lemma1`/`lemma2`/`token1`/`token2`
         (`get_sum_embeddings`).
      6. Add cosine similarities between the `1` and `2` embeddings
         (`calculate_cosine_similarity`).

    Parameters
    ----------
    file_path : path of the file to process.
    vocab_list : words allowed to remain in the token lists.
    model : word-vector model handed to `get_sum_embeddings`; when None
        (the default) the notebook-global `w2v_google_model` is used.

    Returns
    -------
    The processed DataFrame.
    """
    if model is None:
        model = w2v_google_model

    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    df, _ = convert_columns_to_lists(df)

    # A set makes each membership test constant-time; testing against the
    # list rescans the whole vocabulary for every token.
    vocab = set(vocab_list)
    columns_to_filter = ['lemma', 'token']
    for col in columns_to_filter:
        df[col] = df[col].apply(lambda token_list: [word for word in token_list if word in vocab])

    # Pair and lag the columns
    columns_to_lag = ['content', 'token', 'lemma']
    df = pair_and_lag_columns(df, columns_to_lag)

    # Compute embeddings
    for column in ["lemma", "token"]:
        df[f"{column}1_sum_embedding"] = df[f"{column}1"].apply(lambda tokens: get_sum_embeddings(tokens, model))
        df[f"{column}2_sum_embedding"] = df[f"{column}2"].apply(lambda tokens: get_sum_embeddings(tokens, model))

    # Calculate cosine similarities
    embedding_columns = [
        ("lemma1_sum_embedding", "lemma2_sum_embedding"),
        ("token1_sum_embedding", "token2_sum_embedding")
    ]
    df = calculate_cosine_similarity(df, embedding_columns)

    return df

### Main Run

In [8]:
# Define folder paths
folder_path = "./data/prepped_stan_small"
output_file_directory = "outputW2V"

# Check if the output directory exists, create it if not
os.makedirs(output_file_directory, exist_ok=True)

# List all text files in the folder
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Aggregate the conversations into a single DataFrame
concatenated_text_files = aggregate_conversations(folder_path)

# Build the filtered vocabulary and save it to the output directory
vocab_all, vocab_filtered = build_filtered_vocab(concatenated_text_files, output_file_directory)

# Process each file, collecting the per-file frames and concatenating once at
# the end (pd.concat inside the loop re-copies all earlier rows on every file).
processed_frames = []
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file_for_W2V(file_path, vocab_filtered)
    processed_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
concatenated_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()

Processing files: 100%|██████████| 2/2 [00:00<00:00, 34.06it/s]


In [None]:
concatenated_df.head()

# END WORD2VEC

# BEGIN BERT

#### Steps: Pair and lag columns → Compute embeddings → Compute cosine similarities.

In [13]:
def get_embedding_with_cache(text):
    """
    Return a BERT embedding for `text`, reusing the notebook-global
    `embedding_cache` dict to avoid recomputing known texts.

    Behaviour:
      * A missing `text` (None or scalar NaN) yields None. Lagged columns
        built with `shift(-1)` end in NaN, which must not reach the tokenizer.
      * If `text` is already a key of `embedding_cache`, the cached value is
        returned.
      * Otherwise the text is tokenized with the global `tokenizer`, wrapped
        in the CLS/SEP token ids, run through the global `model` without
        gradient tracking, and the last hidden states are averaged over
        dim=1. The resulting NumPy array is stored in the cache and returned.

    NOTE(review): inputs are not truncated here; confirm that no utterance
    exceeds the model's maximum sequence length.
    """
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return None

    if text in embedding_cache:
        return embedding_cache[text]

    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_ids = [tokenizer.cls_token_id] + token_ids + [tokenizer.sep_token_id]
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    embedding = torch.mean(last_hidden_states, dim=1).numpy()
    embedding_cache[text] = embedding

    return embedding

def process_file(file_path, embedding_cache):
    """
    Compute BERT-based similarity between successive utterances in one file.

    The tab-separated, UTF-8 file at `file_path` is read into a DataFrame,
    its `content` column is paired with the following row's content via
    `pair_and_lag_columns`, both `content1` and `content2` are embedded with
    `get_embedding_with_cache`, and `calculate_cosine_similarity` adds the
    similarity between the two embedding columns.

    Note: the `embedding_cache` argument is accepted but not referenced in
    this function body; `get_embedding_with_cache` is called with the text only.

    Returns the processed DataFrame.
    """
    utterances = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Adds content1 (own utterance) and content2 (next utterance).
    utterances = pair_and_lag_columns(utterances, columns_to_lag=['content'])

    utterances["content1_embedding"] = utterances["content1"].apply(get_embedding_with_cache)
    utterances["content2_embedding"] = utterances["content2"].apply(get_embedding_with_cache)

    embedding_pair = ("content1_embedding", "content2_embedding")
    return calculate_cosine_similarity(utterances, [embedding_pair])


### Main Run

In [14]:
# Attempts to load a pre-existing embedding cache from a file (bert_embedding_cache.pkl).
# NOTE: pickle.load can execute arbitrary code; only load cache files this notebook wrote itself.
embedding_cache_path = "data/bert_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

# Path to the folder containing the text files
folder_path = "data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Process each text file one by one, collecting the per-file frames and
# concatenating once at the end (pd.concat inside the loop re-copies all
# earlier rows on every file).
processed_frames = []
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, embedding_cache)
    processed_frames.append(df)

    # Save the cache after every file so finished work survives an interruption.
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# pd.concat raises on an empty list, so fall back to an empty frame.
concatenated_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()


Processing files: 100%|██████████| 2/2 [00:00<00:00, 92.22it/s]


In [15]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,content2,utter_order,content1_embedding,content2_embedding,content1_embedding_content2_embedding_cosine_similarity
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:,"[[0.16115269, -0.18967551, -0.091776446, 0.072...","[[0.16965881, -0.09342009, 0.31895435, 0.05972...",0.631972
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:,"[[0.16965881, -0.09342009, 0.31895435, 0.05972...","[[0.16538174, 0.040479627, 0.08581757, 0.03785...",0.730899
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:,"[[0.16538174, 0.040479627, 0.08581757, 0.03785...","[[0.42204928, -0.110650204, 0.27723312, 0.3426...",0.633439
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:,"[[0.42204928, -0.110650204, 0.27723312, 0.3426...","[[0.29328585, -0.4999022, 0.16143966, 0.002446...",0.640285
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:,"[[0.29328585, -0.4999022, 0.16143966, 0.002446...","[[0.29594588, 0.2687162, -0.15591031, 0.149712...",0.505967


# END BERT

# BEGIN GPT

#### Steps: Pair and lag columns → Compute embeddings → Compute cosine similarities.

In [16]:
default_embedding_engine = "text-embedding-ada-002"  
def get_embedding_with_cache(text: str, engine: str = default_embedding_engine) -> list:
    """
    Return an OpenAI embedding for `text`, reusing the notebook-global
    `embedding_cache` dict to avoid repeated API calls.

    Parameters
    ----------
    text : the text to embed, or a missing value (None / scalar NaN).
    engine : embedding model name passed to `openai.embeddings.create`.

    Returns
    -------
    The embedding stored under the key `(text, engine)`, requested from the
    API and cached first if it is not there yet. Returns None for missing
    text: lagged columns built with `shift(-1)` end in NaN, which must not
    be sent to the API or cached.

    Note: this notebook also defines a BERT helper with the same name in an
    earlier cell; running this cell rebinds the name to this function.
    """
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return None
    cache_key = (text, engine)
    if cache_key not in embedding_cache:
        embedding_cache[cache_key] = openai.embeddings.create(input=[text], model=engine).data[0].embedding
    return embedding_cache[cache_key]

def process_file_for_GPT(file_path, embedding_cache, engine=default_embedding_engine):
    """
    Compute GPT-embedding similarity between successive utterances in one file.

    The tab-separated, UTF-8 file at `file_path` is read into a DataFrame,
    its `content` column is paired with the following row's content via
    `pair_and_lag_columns`, both `content1` and `content2` are embedded with
    `get_embedding_with_cache(text, engine)`, and
    `calculate_cosine_similarity` adds the similarity between the two
    embedding columns.

    Note: the `embedding_cache` argument is accepted but not referenced in
    this function body.

    Returns the processed DataFrame.
    """
    def _embed(text):
        return get_embedding_with_cache(text, engine)

    utterances = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Adds content1 (own utterance) and content2 (next utterance).
    utterances = pair_and_lag_columns(utterances, columns_to_lag=['content'])

    utterances["content1_embedding"] = utterances["content1"].apply(_embed)
    utterances["content2_embedding"] = utterances["content2"].apply(_embed)

    embedding_pair = ("content1_embedding", "content2_embedding")
    return calculate_cosine_similarity(utterances, [embedding_pair])

### Main Run

In [17]:
# Attempts to load a pre-existing embedding cache from a file (gpt_embedding_cache.pkl).
# NOTE: pickle.load can execute arbitrary code; only load cache files this notebook wrote itself.
embedding_cache_path = "data/gpt_embedding_cache.pkl"
try:
    with open(embedding_cache_path, "rb") as f:
        embedding_cache = pickle.load(f)
except FileNotFoundError:
    embedding_cache = {}

# Path to the folder containing the text files
folder_path = "./data/prepped_stan_small"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Process each text file one by one, collecting the per-file frames and
# concatenating once at the end (pd.concat inside the loop re-copies all
# earlier rows on every file).
processed_frames = []
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file_for_GPT(file_path, embedding_cache, default_embedding_engine)
    processed_frames.append(df)

    # Save the cache after every file so finished API calls survive an interruption.
    with open(embedding_cache_path, "wb") as embedding_cache_file:
        pickle.dump(embedding_cache, embedding_cache_file)

# pd.concat raises on an empty list, so fall back to an empty frame.
concatenated_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()

Processing files: 100%|██████████| 2/2 [00:00<00:00, 76.25it/s]


In [19]:
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,content2,utter_order,content1_embedding,content2_embedding,content1_embedding_content2_embedding_cosine_similarity
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:,"[-0.032679855823516846, 0.00039228188688866794...","[0.0034637455828487873, 0.003785194829106331, ...",0.807814
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:,"[0.0034637455828487873, 0.003785194829106331, ...","[-0.0030778965447098017, -0.018122322857379913...",0.781031
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:,"[-0.0030778965447098017, -0.018122322857379913...","[-0.003490022150799632, -0.007978125475347042,...",0.862541
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:,"[-0.003490022150799632, -0.007978125475347042,...","[0.006083404179662466, -0.016767257824540138, ...",0.766547
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:,"[0.006083404179662466, -0.016767257824540138, ...","[0.005405202973634005, -0.007813462056219578, ...",0.750888
