In [15]:
import os  # Provides a way to use operating system dependent functionality like reading or writing to the file system
from os import listdir, path  # listdir lists the files in a directory, path provides functions to manipulate file paths
# import pickle  # Implements binary protocols for serializing and de-serializing Python object structures
import pandas as pd  # Powerful data structures for data analysis, time series, and statistics
import numpy as np  # Support for large, multi-dimensional arrays and matrices, along with a collection of mathematical functions to operate on these arrays
from collections import Counter  # Provides a way to count the frequency of elements in a collection
import re  # Provides regular expression matching operations
# import requests  # Allows sending HTTP requests to interact with web services
import ast  # Abstract Syntax Trees, used for parsing and analyzing Python source code
from tqdm import tqdm  # For progress bars
from sklearn.metrics.pairwise import cosine_similarity  # Computes the cosine similarity between samples in a matrix
import gensim  # Library for unsupervised topic modeling and natural language processing
import gensim.downloader as api  # Downloads and loads pre-trained models and datasets
# from gensim.models import KeyedVectors  # Provides efficient word vector representation and storage

### SETUP for W2V

In [16]:
# Determine the current working directory.
# NOTE: this is the directory the kernel was started in, so the cache location
# below changes if the notebook is launched from somewhere else.
script_dir = os.getcwd()

# Define the local cache directory relative to the current working directory
# and create it up front (exist_ok=True makes re-running this cell harmless).
local_cache_dir = os.path.join(script_dir, "gensim-data")
os.makedirs(local_cache_dir, exist_ok=True)
print(f"Local cache directory: {local_cache_dir}")

# Set the BASE_DIR for gensim data so downloads land in the local cache.
# NOTE(review): presumably gensim.downloader reads BASE_DIR at call time, so this
# assignment has to run before any api.load call - confirm against gensim docs.
api.BASE_DIR = local_cache_dir
print(f"Gensim BASE_DIR set to: {api.BASE_DIR}")

# Function to download and cache models
def download_and_cache_models(models, cache_dir):
    """Make sure each named gensim-data model has been downloaded into ``cache_dir``.

    A model counts as cached when ``<cache_dir>/<model_name>`` already exists;
    only missing models are fetched. A failure for one model is printed and the
    loop moves on to the next one (best-effort).

    Args:
        models: Iterable of gensim-data model names.
        cache_dir: Directory used as the download location. As a side effect,
            ``api.BASE_DIR`` is set to this directory.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    api.BASE_DIR = cache_dir
    for model_name in models:
        model_path = os.path.join(cache_dir, model_name)
        if os.path.exists(model_path):
            print(f"Model {model_name} already exists at: {model_path}")
            continue
        try:
            print(f"Downloading model: {model_name}")
            # return_path=True fetches the files without also loading the
            # vectors into memory. The loaded model was never used here, and
            # skipping the load step avoids a multi-GB allocation.
            # NOTE(review): the load step is presumably what raised the error
            # seen after a successful download - confirm on a fresh cache.
            api.load(model_name, return_path=True)
            print(f"Downloaded and cached model: {model_name}")
        except Exception as e:
            print(f"Error downloading {model_name}: {e}")

# List of models to download and cache.
# Only word2vec-google-news-300 is loaded by the cells visible in this notebook;
# glove-twitter-200 is cached here but not loaded in the visible code.
models_to_cache = ['word2vec-google-news-300', 'glove-twitter-200']
download_and_cache_models(models_to_cache, local_cache_dir)

# Function to load models if they are not already loaded
def load_model_if_not_exists(model_path, binary=True):
    """Load word2vec-format vectors from ``model_path``, or report why that failed.

    Args:
        model_path: Path of the word2vec-format file to read.
        binary: Forwarded to ``KeyedVectors.load_word2vec_format``.

    Returns:
        The loaded ``KeyedVectors`` object, or ``None`` when the file is missing
        or loading raised. The reason is printed rather than raised.
    """
    try:
        if os.path.exists(model_path):
            return gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=binary)
        # A missing file goes through the same reporting path as any load error.
        raise FileNotFoundError(f"Model file {model_path} does not exist.")
    except Exception as load_error:
        print(f"Error loading model from {model_path}: {load_error}")
    return None

# Load the Google News model unless a usable one is already in memory.
# A failed load stores None in w2v_google_model, so the check looks at the value
# and not just the name; otherwise re-running this cell after a failure would
# never retry the load.
if globals().get('w2v_google_model') is None:
    w2v_google_model_path = os.path.join(local_cache_dir, 'word2vec-google-news-300', 'word2vec-google-news-300.gz')
    w2v_google_model = load_model_if_not_exists(w2v_google_model_path, binary=True)
    if w2v_google_model is not None:
        print("Word2Vec Google News model loaded from local cache successfully.")
    else:
        print("Failed to load Word2Vec Google News model.")

# note: possible todo: is it more efficient to use gensim.downloader.load(model_name)?
# note: when downloading a model, it downloads properly but also throws an exception warning for some reason.

Local cache directory: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Gensim BASE_DIR set to: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data
Model word2vec-google-news-300 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/word2vec-google-news-300
Model glove-twitter-200 already exists at: /Users/nduran4/Desktop/GitProjects/llm-linguistic-alignment/gensim-data/glove-twitter-200


## FUNCTION DEFINITIONS

In [17]:
# Function to aggregate conversations
def aggregate_conversations(folder_path: str) -> pd.DataFrame:
    """Read every tab-separated ``.txt`` file in ``folder_path`` into one DataFrame.

    Args:
        folder_path: Directory to scan. Only regular files whose names end in
            ``.txt`` are read; sub-directories are ignored.

    Returns:
        The rows of all files stacked with a fresh 0..n-1 index, or an empty
        DataFrame when the folder holds no matching files.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the result reproducible across machines and runs.
    text_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.txt') and os.path.isfile(os.path.join(folder_path, f))
    )

    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously read rows on every iteration.
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name), sep='\t', encoding='utf-8')
        for file_name in text_files
    ]

    if not frames:
        # pd.concat raises on an empty list; keep the original empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Function to build filtered vocabulary from aggregated conversations
def build_filtered_vocab(data: pd.DataFrame, output_file_directory: str, high_sd_cutoff: float = 3, low_n_cutoff: int = 1) -> tuple:
    """Count lemma frequencies, filter the vocabulary, and write both tables to disk.

    Args:
        data: DataFrame with a ``lemma`` column. Each cell is stringified and
            stripped of punctuation, then split on whitespace.
        output_file_directory: Folder that receives ``vocab_unfilt_freqs.txt``
            and ``vocab_filt_freqs.txt`` (tab-separated); created if missing.
        high_sd_cutoff: Words whose count is at or above
            ``mean + high_sd_cutoff * std`` of the filtered counts are dropped.
            Pass ``None`` to skip this step.
        low_n_cutoff: Words are kept only when their count is strictly greater
            than this value.

    Returns:
        ``(all_words, filtered_words)``: two lists of words, the full
        vocabulary and the filtered vocabulary.
    """
    # Skip missing cells: str(NaN) would otherwise add the bogus token "nan".
    lemma_cells = data['lemma'].dropna()

    # Tokenize the lemmas
    all_sentences = [re.sub(r'[^\w\s]+', '', str(row)).split() for row in lemma_cells]
    all_words = [word for sentence in all_sentences for word in sentence]

    # Frequency count using Counter
    frequency = Counter(all_words)

    # Drop one-letter words and words occurring low_n_cutoff times or fewer
    frequency_filt = {word: freq for word, freq in frequency.items() if len(word) > 1 and freq > low_n_cutoff}

    # Remove high-frequency words if high_sd_cutoff is specified. An empty
    # vocabulary is passed through as-is (mean/std of nothing is undefined).
    if high_sd_cutoff is not None and frequency_filt:
        counts = list(frequency_filt.values())
        cutoff_freq = np.mean(counts) + (np.std(counts) * high_sd_cutoff)
        filteredWords = {word: freq for word, freq in frequency_filt.items() if freq < cutoff_freq}
    else:
        filteredWords = frequency_filt

    # Convert to DataFrames for exporting
    vocabfreq_all = pd.DataFrame(list(frequency.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)
    vocabfreq_filt = pd.DataFrame(list(filteredWords.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)

    # Save to files; to_csv does not create missing folders itself.
    if output_file_directory:
        os.makedirs(output_file_directory, exist_ok=True)
    vocabfreq_all.to_csv(os.path.join(output_file_directory, 'vocab_unfilt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    vocabfreq_filt.to_csv(os.path.join(output_file_directory, 'vocab_filt_freqs.txt'), encoding='utf-8', index=False, sep='\t')

    return list(frequency.keys()), list(filteredWords.keys())

# Function to check if a column contains list-like strings
def is_list_like_column(series):
    """Report whether every value in ``series`` is a string starting with ``[``.

    Leading whitespace is ignored. If any value lacks a ``strip`` method (for
    example numbers or missing values), the whole column is reported as not
    list-like.
    """
    try:
        opens_with_bracket = series.apply(lambda value: value.strip().startswith("["))
    except AttributeError:
        # At least one value is not a string.
        return False
    return opens_with_bracket.all()

# Function to convert columns with list-like strings to actual lists
def convert_columns_to_lists(df: pd.DataFrame) -> tuple:
    """Parse every list-like string column of ``df`` into real Python objects.

    A column is converted when ``is_list_like_column`` accepts it; each cell is
    then parsed with ``ast.literal_eval``.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        ``(df, columns_converted)``: the same DataFrame object and the list of
        column names that were parsed, in column order.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a
            cell starts with ``[`` but is not a valid Python literal.
    """
    columns_converted = []
    for col in df.columns:
        if not is_list_like_column(df[col]):
            continue
        # literal_eval only evaluates literals, so cell contents are never executed.
        df[col] = df[col].apply(ast.literal_eval)
        columns_converted.append(col)
    return df, columns_converted

# Function to get lagged conversational turns and restructure dataframe
def process_input_data(df: pd.DataFrame, include_stan: bool = True) -> pd.DataFrame:
    """Pair each row with the row that follows it.

    For every lagged column ``X`` present in ``df``, two columns are added:
    ``X1`` (a copy of ``X``) and ``X2`` (``X`` shifted up by one row, so the last
    row holds a missing value). ``utter_order`` joins the ``participant`` value
    of the row and of the next row with a space.

    Args:
        df: DataFrame with a ``participant`` column; it is modified in place.
        include_stan: When true, every column whose name contains ``stan`` is
            lagged in addition to the base columns.

    Returns:
        The same DataFrame object with the new columns appended.
    """
    lag_targets = ['content', 'token', 'lemma', 'tagged_token', 'tagged_lemma']

    # Collect the "stan" columns before any new columns are added.
    if include_stan:
        lag_targets += [name for name in df.columns if 'stan' in name]

    for name in lag_targets:
        if name not in df.columns:
            continue  # base column absent from this file
        current_turn = df[name]
        df[f'{name}1'] = current_turn
        df[f'{name}2'] = current_turn.shift(-1)

    speakers = df['participant']
    df['utter_order'] = speakers + ' ' + speakers.shift(-1)

    return df

# Main function to process the file
def process_file(file_path, large_list: list):
    """Load one conversation file, keep only vocabulary words, and add lagged turns.

    Args:
        file_path: Path of a tab-separated, UTF-8 conversation file. It must
            provide ``lemma`` and ``token`` columns.
        large_list: Words to keep; every other word is removed from the
            ``lemma`` and ``token`` lists.

    Returns:
        The DataFrame returned by ``process_input_data`` for this file.
    """
    df = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Convert columns with list-like strings to actual lists
    df, _ = convert_columns_to_lists(df)

    # Build the lookup once: testing membership against a list rescans the whole
    # vocabulary for every word of every turn.
    allowed_words = set(large_list)

    # Keep only the words present in the supplied vocabulary
    for col in ['lemma', 'token']:
        df[col] = df[col].apply(lambda token_list: [word for word in token_list if word in allowed_words])

    # Do the lagging
    df = process_input_data(df)

    return df

# Function to compute cosine similarities between embeddings
def compute_cosine_similarities(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Add one cosine-similarity column per pair of embedding columns.

    Args:
        df: DataFrame holding the embedding columns; it is modified in place.
        columns: Iterable of ``(col1, col2)`` column-name pairs. Each cell is an
            embedding vector or ``None``.

    Returns:
        The same DataFrame. The result column is ``token_cosine_similarity``
        when ``col1`` contains "token", ``lemma_cosine_similarity`` when it
        contains "lemma", and ``<col1>_vs_<col2>_cosine_similarity`` otherwise.
        Rows where either vector is ``None`` get ``None``.
    """
    for col1, col2 in columns:
        # Walk the two columns directly; df.iloc[i][col] builds a full row
        # Series for every single lookup.
        similarities = [
            cosine_similarity([vec1], [vec2])[0][0]
            if vec1 is not None and vec2 is not None
            else None
            for vec1, vec2 in zip(df[col1], df[col2])
        ]

        # Determine whether this is for "token" or "lemma" based on the column name
        if 'token' in col1:
            similarity_column_name = 'token_cosine_similarity'
        elif 'lemma' in col1:
            similarity_column_name = 'lemma_cosine_similarity'
        else:
            # Without this branch the name was unbound for the first pair, or
            # silently reused (overwriting the previous result) for later pairs.
            similarity_column_name = f'{col1}_vs_{col2}_cosine_similarity'

        df[similarity_column_name] = similarities
    return df

# Function to sum the embeddings for each list of tokens
def get_sum_embeddings(token_list, model):
    """Sum the embedding vectors of the words in ``token_list`` that the model knows.

    Args:
        token_list: Iterable of words, or a missing value (``None`` or a float
            such as NaN).
        model: Object exposing ``key_to_index`` for vocabulary membership and
            ``model[word]`` for vector lookup.

    Returns:
        The element-wise sum of the found vectors, or ``None`` when the input
        is missing or none of its words are in the model vocabulary.
    """
    # Lagged columns built with Series.shift(-1) end in a float NaN rather than
    # None; a float is never a valid token list and cannot be iterated.
    if token_list is None or isinstance(token_list, float):
        return None

    # Words outside the model vocabulary are skipped.
    embeddings = [model[word] for word in token_list if word in model.key_to_index]
    if not embeddings:
        return None  # nothing to sum
    return np.sum(embeddings, axis=0)


## Main Run

In [18]:
# Path to the folder containing the text files
folder_path = "./data/prepped_stan_small"
output_file_directory = "output"
# The vocabulary tables are written into this folder, so make sure it exists.
os.makedirs(output_file_directory, exist_ok=True)
# Sorted so the row order of the combined table does not depend on the
# arbitrary order in which os.listdir returns names.
text_files = sorted(f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f)) and f.endswith('.txt'))

# Aggregate individual conversation files
concatenated_text_files = aggregate_conversations(folder_path)

# Build filtered vocabulary from aggregated data
vocab_all, vocab_filtered = build_filtered_vocab(concatenated_text_files, output_file_directory)

# A failed model load leaves w2v_google_model as None; stop here with a clear
# message instead of failing inside the embedding lookup.
if globals().get('w2v_google_model') is None:
    raise RuntimeError("Word2Vec Google News model is not loaded; run the W2V setup cell first.")

# Columns to compute similarities (identical for every file)
embedding_columns = [
    ("lemma1_sum_embedding", "lemma2_sum_embedding"),
    ("token1_sum_embedding", "token2_sum_embedding")
]

# Process each file, collecting the per-file frames and concatenating once at
# the end; concatenating inside the loop re-copies all earlier rows every time.
processed_frames = []
for file_name in tqdm(text_files, desc="Processing files"):
    file_path = os.path.join(folder_path, file_name)
    df = process_file(file_path, vocab_filtered)

    # Create columns of embeddings for the current turn (1) and the next turn (2)
    for column in ["lemma", "token"]:
        df[f"{column}1_sum_embedding"] = df[f"{column}1"].apply(lambda tokens: get_sum_embeddings(tokens, w2v_google_model))
        df[f"{column}2_sum_embedding"] = df[f"{column}2"].apply(lambda tokens: get_sum_embeddings(tokens, w2v_google_model))

    # Compute cosine similarities
    df = compute_cosine_similarities(df, embedding_columns)
    processed_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
concatenated_df = pd.concat(processed_frames, ignore_index=True) if processed_frames else pd.DataFrame()

Processing files: 100%|██████████| 2/2 [00:00<00:00, 33.29it/s]


In [20]:
# Preview the first rows of the combined table built by the processing loop.
concatenated_df.head()

Unnamed: 0,participant,content,token,lemma,tagged_token,tagged_lemma,tagged_stan_token,tagged_stan_lemma,file,content1,...,tagged_stan_token2,tagged_stan_lemma1,tagged_stan_lemma2,utter_order,lemma1_sum_embedding,lemma2_sum_embedding,token1_sum_embedding,token2_sum_embedding,lemma_cosine_similarity,token_cosine_similarity
0,PC:,i thought that maybe that would do something,"[that, maybe, that, would, do, something]","[think, that, maybe, that, would, do, something]","[(i, NN), (thought, VBD), (that, IN), (maybe, ...","[(i, NN), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (thought, VBD), (that, IN), (maybe, ...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...",ASU-T10_ExpBlock2-DolphinShow.txt,i thought that maybe that would do something,...,"[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, FW), (think, VBP), (that, IN), (maybe, RB...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",PC: PA:,"[0.33276367, 0.18133545, 0.54364014, 1.277832,...","[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.37963867, 0.11444092, 0.53430176, 1.0141602...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...",0.861987,0.826057
1,PA:,i think you have to c maybe close it and it wi...,"[think, you, have, maybe, close, and, will, yo...","[think, you, have, maybe, close, and, will, yo...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, NN), (think, VBP), (you, PRP), (have, VBP...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...",ASU-T10_ExpBlock2-DolphinShow.txt,i think you have to c maybe close it and it wi...,...,"[(should, MD), (i, FW), (start, VB), (this, DT...","[(i, LS), (think, VB), (you, PRP), (have, VBP)...","[(should, MD), (i, FW), (start, VB), (this, DT...",PA: PC:,"[0.71087646, 0.19366455, 0.7921753, 2.4785156,...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.5878296, 0.18084717, 0.7727661, 2.2480469, ...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...",0.749735,0.751777
2,PC:,should i restart this it's like down there,"[start, this, like, down, there]","[start, this, like, down, there]","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, VB), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(should, MD), (i, FW), (start, VB), (this, DT...",ASU-T10_ExpBlock2-DolphinShow.txt,should i restart this it's like down there,...,"[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(should, MD), (i, FW), (start, VB), (this, DT...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",PC: PA:,"[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.2972412, 0.25402832, 0.30085754, 0.6894531,...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...",0.777175,0.777175
3,PA:,yeah i would just restart it,"[yeah, would, just, start]","[yeah, would, just, start]","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, NN), (i, NN), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...",ASU-T10_ExpBlock2-DolphinShow.txt,yeah i would just restart it,...,"[(okay, JJ), (now, RB), (what, WP)]","[(yeah, JJ), (i, FW), (would, MD), (just, RB),...","[(okay, JJ), (now, RB), (what, WP)]",PA: PC:,"[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40307617, 0.18392944, 0.17602539, 0.7675781...","[0.24768066, -0.050964355, 0.26208496, 0.27709...",0.720251,0.720251
4,PC:,okay now what,"[okay, now, what]","[okay, now, what]","[(okay, RB), (now, RB), (what, WP)]","[(okay, RB), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]","[(okay, JJ), (now, RB), (what, WP)]",ASU-T10_ExpBlock2-DolphinShow.txt,okay now what,...,"[(you, PRP), (did, VBD), (not, RB), (close, VB...","[(okay, JJ), (now, RB), (what, WP)]","[(you, PRP), (do, VBP), (not, RB), (close, VB)...",PC: PA:,"[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.40890503, -0.14001465, 0.30407715, 0.839355...","[0.24768066, -0.050964355, 0.26208496, 0.27709...","[0.28585815, -0.15283203, 0.28466797, 0.608886...",0.661706,0.616577
