In [82]:
import os
from os import listdir
from os.path import isfile, join

import pickle

import pandas as pd
import numpy as np

from collections import Counter
import re
import requests

import ast

from tqdm import tqdm  # for progress bars

from sklearn.metrics.pairwise import cosine_similarity

import gensim 
import gensim.downloader as api 
from gensim.models import KeyedVectors 

### STEP 0: Download and Cache the Models Locally

In [78]:
# Determine the current working directory
script_dir = os.getcwd()

# Define the local cache directory relative to the current working directory
local_cache_dir = os.path.join(script_dir, "gensim-data")
os.makedirs(local_cache_dir, exist_ok=True)
print(f"Local cache directory: {local_cache_dir}")

# Set the BASE_DIR for gensim data
api.BASE_DIR = local_cache_dir
print(f"Gensim BASE_DIR set to: {api.BASE_DIR}")

# Function to download and cache models
def download_and_cache_models(models, cache_dir):
    api.BASE_DIR = cache_dir
    for model_name in models:
        model_path = os.path.join(cache_dir, model_name)
        if not os.path.exists(model_path):
            try:
                print(f"Downloading model: {model_name}")
                model = api.load(model_name)
                print(f"Downloaded and cached model: {model_name}")
            except Exception as e:
                print(f"Error downloading {model_name}: {e}")
        else:
            print(f"Model {model_name} already exists at: {model_path}")

# List of models to download and cache
models_to_cache = ['word2vec-google-news-300', 'glove-twitter-200']
download_and_cache_models(models_to_cache, local_cache_dir)

# note, is it more efficient to use gensim.downloader.load(model_name)?

Local cache directory: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Gensim BASE_DIR set to: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Model word2vec-google-news-300 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/word2vec-google-news-300
Model glove-twitter-200 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/glove-twitter-200


### STEP 1: 

In [79]:
def aggregate_conversations(folder_path: str) -> pd.DataFrame:
    text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]
    
    concatenated_df = pd.DataFrame()

    for file_name in text_files:
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)
    
    return concatenated_df

### STEP 2:

In [80]:
def build_filtered_vocab(data: pd.DataFrame,
                         output_file_directory: str,
                         high_sd_cutoff: float = 3,
                         low_n_cutoff: int = 1):
    
    # Tokenize the lemmas
    all_sentences = [re.sub(r'[^\w\s]+', '', str(row)).split() for row in data['lemma']]
    all_words = [word for sentence in all_sentences for word in sentence]
    
    # Frequency count using Counter
    frequency = Counter(all_words)
    
    # Filter out one-letter words and those below low_n_cutoff
    frequency_filt = {word: freq for word, freq in frequency.items() if len(word) > 1 and freq > low_n_cutoff}
    
    # Remove high-frequency words if high_sd_cutoff is specified
    if high_sd_cutoff is not None:
        mean_freq = np.mean(list(frequency_filt.values()))
        std_freq = np.std(list(frequency_filt.values()))
        cutoff_freq = mean_freq + (std_freq * high_sd_cutoff)
        filteredWords = {word: freq for word, freq in frequency_filt.items() if freq < cutoff_freq}
    else:
        filteredWords = frequency_filt
    
    # Convert to DataFrames for exporting
    vocabfreq_all = pd.DataFrame(frequency.items(), columns=["word", "count"]).sort_values(by='count', ascending=False)
    vocabfreq_filt = pd.DataFrame(filteredWords.items(), columns=["word", "count"]).sort_values(by='count', ascending=False)
    
    # Save to files
    vocabfreq_all.to_csv(os.path.join(output_file_directory, 'vocab_unfilt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    vocabfreq_filt.to_csv(os.path.join(output_file_directory, 'vocab_filt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    
    return list(frequency.keys()), list(filteredWords.keys())

### STEP 3:

In [81]:
# Function to get lagged conversational turns, restructure dataframe
def process_input_data(df: pd.DataFrame) -> pd.DataFrame:
    df['utter1'] = df['content']
    df['utter2'] = df['content'].shift(-1)
    df['utter_order'] = df['participant'] + ' ' + df['participant'].shift(-1)
    return df

### turn the following into functions:

# Load the models from the local cache when needed
try:
    w2v_google_model_path = os.path.join(local_cache_dir, 'word2vec-google-news-300', 'word2vec-google-news-300.gz')
    w2v_google_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_google_model_path, binary=True)
    print("Word2Vec Google News model loaded from local cache successfully.")
except Exception as e:
    print(f"Error loading word2vec-google-news-300: {e}")

# Load the models from the local cache when needed
try:
    w2v_twitter_model_path = os.path.join(local_cache_dir, 'glove-twitter-200', 'glove-twitter-200.gz')
    w2v_twitter_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_twitter_model_path, binary=False)
    print("Word2Vec Twitter model loaded from local cache successfully.")
except Exception as e:
    print(f"Error loading glove-twitter-200: {e}")


### STEP 4: TODO NEXT!!! 

In [108]:
## Might need to set up a function here to do checks if word exists in each model?

# Function to process and get embeddings/cosines for a single file
# def process_file(file_path, embedding_cache, default_embedding_engine):       
def process_file(file_path, large_list: pd.DataFrame):       
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
    df = process_input_data(df)
    ######### ABOVE IS FROM MAIN CODE

    filter_model = [word for word in large_list if w2v_google_model.has_index_for(word)]


    df["token"] = df["token"].apply(ast.literal_eval)    

    filter_df = [word for token_list in df["token"] for word in token_list if word in large_list]
    print(filter_df)


    # all_words = [word for sentence in all_sentences for word in sentence]

    # Convert string representations of lists to actual lists
    # concatenated_df["token"] = concatenated_df["token"].apply(ast.literal_eval)

    # Print the first few entries in the "token" column to verify conversion
    # print("First few entries in 'lemma' column after conversion:")
    # print(filtered_words["lemma"].head())

    # # Print the type of the first few entries to ensure they are now lists
    # print("\nTypes of the first few entries in 'token' column after conversion:")
    # print(words["lemma"].apply(type).head())

    # # Flatten the lists of tokens in the "token" column and filter the tokens
    # filter_vocablist = [word for token_list in words["token"] for word in token_list if word in large_list]

    # print("Filtered vocabulary")



    # return df ## <<< this needs to be updated, this assumes a dataframe to be processed over
    # Create column of embeddings

In [None]:
filter_model = [word for word in filter_vocablist if w2v_google_model.has_index_for(word)]

word_vectors = [w2v_google_model[word] for word in filter_model if w2v_google_model.has_index_for(word)]
vector_avg = np.mean(word_vectors, axis=0)
vector_sum = np.sum(word_vectors, axis=0)
vector1_norm = vector_sum / np.linalg.norm(vector_sum)
similarity = cosine_similarity([vector1_norm], [vector1_norm])
similarity[0][0]

## RUN EVERYTHING BELOW

In [109]:

# Path to the folder containing the text files
folder_path = "./data/prepped_stan_small"
output_file_directory = "output"
text_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt')]

# Aggregate individual conversation files
concatenated_text_files = aggregate_conversations(folder_path)
    
# Build filtered vocabulary from aggregated data
vocab_all, vocab_filtered = build_filtered_vocab(concatenated_text_files, output_file_directory)

# Process each file and update the cache
concatenated_df = pd.DataFrame()
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, vocab_filtered)
    # concatenated_df = pd.concat([concatenated_df, df], ignore_index=True)
    


Processing files: 100%|██████████| 2/2 [00:00<00:00, 417.88it/s]

['that', 'maybe', 'that', 'would', 'do', 'something', 'think', 'you', 'have', 'maybe', 'close', 'and', 'will', 'you', 'not', 'close', 'the', 'all', 'the', 'start', 'this', 'like', 'down', 'there', 'yeah', 'would', 'just', 'start', 'okay', 'now', 'what', 'you', 'not', 'close', 'yeah', 'but', 'what', 'if', 'close', 'we', 'could', 'make', 'another', 'and', 'could', 'swing', 'up', 'okay', 'okay', 'you', 'on', 'na', 'have', 'swing', 'up', 'well', 'do', 'not', 'really', 'know', 'how', 'we', 'on', 'na', 'get', 'swing', 'though', 'because', 'on', 'na', 'heavy', 'maybe', 'pin', 'like', 'that', 'blue', 'wave', 'this', 'one', 'no', 'the', 'blue', 'wave', 'the', 'balloon', 'well', 'there', 'blue', 'wave', 'or', 'mean', 'there', 'pin', 'on', 'the', 'blue', 'wave', 'not', 'though', 'yeah', 'look', 'this', 'one', 'on', 'not', 'the', 'blow', 'okay', 'think', 'you', 'need', 'what', 'you', 'say', 'pin', 'the', 'blow', 'the', 'blue', 'wave', 'this', 'do', 'not', 'think', 'can', 'attach', 'pin', 'like', '




In [118]:
df = pd.read_csv(file_path, sep='\t', encoding='utf-8')
df = process_input_data(df)

# Convert 'token' column to list of words
df['lemma'] = df['lemma'].apply(ast.literal_eval)

# Large list of words to check against 
vocab_filtered # Use a set for faster look-up

def filter_words(token_list):
    return [word for word in token_list if word in vocab_filtered]

# Apply the filtering function to the 'token' column
df['lemma'] = df['lemma'].apply(filter_words)

print(df)


    
# Retrieve the Word2Vec vectors for each word in the sentence; ignores any words not in the pre-trained model vocabulary
word_vectors = [w2v_google_model[word] for word in df['lemma'] if w2v_google_model.has_index_for(word)]



# word_vectors = [w2v_google_model[word] for word in filter_model if w2v_google_model.has_index_for(word)]




   participant                                            content  \
0          PC:                            yeah let's try that one   
1          PB:                                           this one   
2          PA:                 yeah that one doesn't look too bad   
3          PB:                                  dropped something   
4          PA:    i feel like that's the solution to all of these   
5          PB:   boy yeah just make a thing like a weight like...   
6          PC:      yeah how is this any different from last time   
7          PA:                                         don't know   
8          PC:  k wait now try and delete the red thing and se...   
9          PA:  i think it needs to swing out to the right sid...   
10         PC:    yeah but i think it might have gotten stuck now   
11         PA:    if you hit the space bar i think it restarts it   
12         PC:                                  we got a gold one   
13         PB:                    

TypeError: unhashable type: 'list'

In [120]:
# Load the pre-trained Word2Vec model
# w2v_google_model_path = 'path/to/GoogleNews-vectors-negative300.bin.gz'
# w2v_google_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_google_model_path, binary=True)

# Sample DataFrame
# data = {
#     'token': ['["this", "is", "a", "sample", "sentence"]', '["word2vec", "is", "a", "powerful", "tool"]', '["gensim", "makes", "it", "easy", "to", "use"]']
# }
# df = pd.DataFrame(data)
# df['lemma']

# Function to get embeddings for each word in the list

# def get_embeddings(token_list, model):
#     embeddings = []
#     for word in token_list:
#         if word in model.key_to_index:  # Check if word is in the model vocabulary
#             embeddings.append(model[word])
#         else:
#             embeddings.append(None)  # Or handle unknown words as you see fit
#     return embeddings

# # Apply the function to the 'token' column and store in a new column
# df['embeddings'] = df['lemma'].apply(lambda tokens: get_embeddings(tokens, w2v_google_model))

# Function to get the mean and sum embeddings for each list of tokens
def get_mean_and_sum_embeddings(token_list, model):
    embeddings = []
    for word in token_list:
        if word in model.key_to_index:  # Check if word is in the model vocabulary
            embeddings.append(model[word])
    if embeddings:
        mean_embedding = np.mean(embeddings, axis=0)
        sum_embedding = np.sum(embeddings, axis=0)
        return mean_embedding, sum_embedding
    else:
        return None, None  # Or handle empty embeddings as you see fit

# Apply the function to the 'token' column and store the results in new columns
df['mean_embedding'], df['sum_embedding'] = zip(*df['lemma'].apply(lambda tokens: get_mean_and_sum_embeddings(tokens, w2v_google_model)))


# Create column of embeddings
for column in ["utter1", "utter2"]:
    df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)



# Print the DataFrame with embeddings
print(df)

   participant                                            content  \
0          PC:                            yeah let's try that one   
1          PB:                                           this one   
2          PA:                 yeah that one doesn't look too bad   
3          PB:                                  dropped something   
4          PA:    i feel like that's the solution to all of these   
5          PB:   boy yeah just make a thing like a weight like...   
6          PC:      yeah how is this any different from last time   
7          PA:                                         don't know   
8          PC:  k wait now try and delete the red thing and se...   
9          PA:  i think it needs to swing out to the right sid...   
10         PC:    yeah but i think it might have gotten stuck now   
11         PA:    if you hit the space bar i think it restarts it   
12         PC:                                  we got a gold one   
13         PB:                    

NOT SURE IF BELOW IS NECESSARY, JUST FOR PLAYING WITH THINGS

In [6]:
# Verify if models are downloaded
for model_name in models_to_cache:
    model_file = os.path.join(local_cache_dir, model_name, f"{model_name}.gz")
    if os.path.exists(model_file):
        print(f"Model {model_name} exists at: {model_file}")
    else:
        print(f"Model {model_name} not found at: {model_file}")

Model word2vec-google-news-300 exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
Model glove-twitter-200 exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/glove-twitter-200/glove-twitter-200.gz


In [75]:
vector1_norm

array([ 0.05844001,  0.01495306,  0.04695388,  0.13069172, -0.06963488,
       -0.01979959,  0.05705287, -0.06206347,  0.0486094 ,  0.05643959,
       -0.04690355, -0.11485673, -0.04162246, -0.00579954, -0.12551017,
        0.07829428,  0.05789276,  0.07433286,  0.03147802, -0.08316714,
       -0.04800898,  0.04535313,  0.06810266, -0.03640107,  0.05034975,
        0.00141228, -0.06145631,  0.01032026,  0.03499172,  0.01205086,
       -0.04704227,  0.07149567, -0.01036917, -0.05731743, -0.004206  ,
       -0.01163774,  0.05081906,  0.0198385 ,  0.03804623,  0.08060738,
        0.09270204, -0.05585926,  0.16217482, -0.03939044, -0.01891305,
        0.00020894, -0.0025041 , -0.02201688,  0.05255189,  0.00180374,
       -0.00704225,  0.07698126, -0.01503362, -0.02992011,  0.01114059,
        0.03326823, -0.01235591, -0.02948997,  0.07253172, -0.0603702 ,
       -0.02808144,  0.06454868, -0.07118034, -0.06671734, -0.01440898,
       -0.04135864, -0.04096557,  0.09711972, -0.06773841,  0.06

In [55]:
# Flatten the lists of tokens in the "token" column and filter the tokens
filter_vocablist = [word for token_list in concatenated_df["token"] for word in token_list if word in vocab_filtered]

print(filter_vocablist)

[]


In [21]:
# w2v_google_model

def get_embedding_with_cache(text):
    # Only consider the words that are in the vocablist after filtering for various criteria (e.g., only occur once, high frequency)
    filter_vocablist = [word for word in tok_seq if word in vocablist]
    
df = concatenated_df

# Create column of embeddings
for column in ["utter1", "utter2"]:
    df[f"{column}_embedding"] = df[column].apply(get_embedding_with_cache)
    








Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:
...,...,...,...,...,...,...,...,...,...,...,...,...
60,PC:,yeah but i think it might have gotten stuck now,"['yeah', 'but', 'i', 'think', 'it', 'might', '...","['yeah', 'but', 'i', 'think', 'it', 'might', '...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah but i think it might have gotten stuck now,if you hit the space bar i think it restarts it,PC: PA:
61,PA:,if you hit the space bar i think it restarts it,"['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...",ASU-T10_ExpBlock1-Oneatatime.txt,if you hit the space bar i think it restarts it,we got a gold one,PA: PC:
62,PC:,we got a gold one,"['we', 'got', 'a', 'gold', 'one']","['we', 'get', 'a', 'gold', 'one']","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,we got a gold one,sweet come on,PC: PB:
63,PB:,sweet come on,"['sweet', 'come', 'on']","['sweet', 'come', 'on']","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]",ASU-T10_ExpBlock1-Oneatatime.txt,sweet come on,i don't know what that means but gold is good,PB: PA:


In [16]:
def load_w2v_trained(pretrained_input_file):

    model = api.load(pretrained_input_file)

    return model

In [19]:
# w2v_Google
w2v_model_goog = load_w2v_trained(
    pretrained_input_file=MODEL_w2v_google
    )        
    
# w2v_Twitter
w2v_model_twit = load_w2v_trained(
    pretrained_input_file=MODEL_w2v_google
    )


ValueError: unable to read local cache '/Users/nduran4/gensim-data/information.json' during fallback, connect to the Internet and retry

In [20]:
model = api.load(MODEL_w2v_google)

ValueError: unable to read local cache '/Users/nduran4/gensim-data/information.json' during fallback, connect to the Internet and retry

In [15]:
concatenated_df

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,utter1,utter2,utter_order
0,PC:,i thought that maybe that would do something,"['i', 'thought', 'that', 'maybe', 'that', 'wou...","['i', 'think', 'that', 'maybe', 'that', 'would...","[('i', 'NN'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'NN'), ('think', 'VBP'), ('that', 'IN')...","[('i', 'LS'), ('thought', 'VBD'), ('that', 'IN...","[('i', 'FW'), ('think', 'VBP'), ('that', 'IN')...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,i think you have to c maybe close it and it wi...,PC: PA:
1,PA:,i think you have to c maybe close it and it wi...,"['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","['i', 'think', 'you', 'have', 'to', 'c', 'mayb...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'NN'), ('think', 'VBP'), ('you', 'PRP')...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...","[('i', 'LS'), ('think', 'VB'), ('you', 'PRP'),...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,should i restart this it's like down there,PA: PC:
2,PC:,should i restart this it's like down there,"['should', 'i', 'start', 'this', 'it', 'is', '...","['should', 'i', 'start', 'this', 'it', 'be', '...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'VB'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...","[('should', 'MD'), ('i', 'FW'), ('start', 'VB'...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,yeah i would just restart it,PC: PA:
3,PA:,yeah i would just restart it,"['yeah', 'i', 'would', 'just', 'start', 'it']","['yeah', 'i', 'would', 'just', 'start', 'it']","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'NN'), ('i', 'NN'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...","[('yeah', 'JJ'), ('i', 'FW'), ('would', 'MD'),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,okay now what,PA: PC:
4,PC:,okay now what,"['okay', 'now', 'what']","['okay', 'now', 'what']","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'RB'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]","[('okay', 'JJ'), ('now', 'RB'), ('what', 'WP')]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,you didn't close it,PC: PA:
...,...,...,...,...,...,...,...,...,...,...,...,...
60,PC:,yeah but i think it might have gotten stuck now,"['yeah', 'but', 'i', 'think', 'it', 'might', '...","['yeah', 'but', 'i', 'think', 'it', 'might', '...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'JJ'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...","[('yeah', 'NN'), ('but', 'CC'), ('i', 'FW'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,yeah but i think it might have gotten stuck now,if you hit the space bar i think it restarts it,PC: PA:
61,PA:,if you hit the space bar i think it restarts it,"['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","['if', 'you', 'hit', 'the', 'space', 'bar', 'i...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...","[('if', 'IN'), ('you', 'PRP'), ('hit', 'VBP'),...",ASU-T10_ExpBlock1-Oneatatime.txt,if you hit the space bar i think it restarts it,we got a gold one,PA: PC:
62,PC:,we got a gold one,"['we', 'got', 'a', 'gold', 'one']","['we', 'get', 'a', 'gold', 'one']","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...","[('we', 'PRP'), ('got', 'VBD'), ('a', 'DT'), (...","[('we', 'PRP'), ('get', 'VBP'), ('a', 'DT'), (...",ASU-T10_ExpBlock1-Oneatatime.txt,we got a gold one,sweet come on,PC: PB:
63,PB:,sweet come on,"['sweet', 'come', 'on']","['sweet', 'come', 'on']","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'NN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]","[('sweet', 'JJ'), ('come', 'VBN'), ('on', 'IN')]",ASU-T10_ExpBlock1-Oneatatime.txt,sweet come on,i don't know what that means but gold is good,PB: PA:
