In [1]:
import os
import pickle
import re
from collections import Counter
from os import listdir
from os.path import isfile, join

import gensim
import gensim.downloader as api
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm  # for progress bars

In [None]:
# Model identifiers used when loading pretrained embeddings through gensim.
# NOTE: despite the `w2v` in its variable name, the Twitter identifier names a
# GloVe model, as the string itself shows.
MODEL_w2v_twitter = "glove-twitter-200"
MODEL_w2v_google = "word2vec-google-news-300"

How necessary is the code below? It covers every single conversation, utterance by utterance, with tokens and lemmas... Is this not equivalent to...

...taking the folder with all the files, concatenating them, and that is that... and, from there, simply running the `build_filtered_vocab` function?

In [None]:
# Standards for which words will be evaluated in the word2vec analysis.

# Location of the concatenated-transcripts file. The default is a
# machine-specific absolute path; set the ALIGN_TRANSCRIPTS_CONCAT_VOCAB_FILE
# environment variable to point elsewhere without editing the notebook.
TRANSCRIPTS_CONCAT_VOCAB_FILE = os.environ.get(
    "ALIGN_TRANSCRIPTS_CONCAT_VOCAB_FILE",
    "/Users/nduran4/Dropbox (ASU)/Mac/Desktop/GitProjects/align-linguistic-alignment/sandbox/transformers/align_concatenated_dataframe.txt",
)

# Vocabulary cutoffs. Presumably meant for the `high_sd_cutoff` and
# `low_n_cutoff` parameters of `build_filtered_vocab` (where None disables the
# high-frequency filter) -- TODO confirm they are passed at the call site.
HIGH_SD_CUTOFF = None
LOW_N_CUTOFF = 1


Consulting the processed list of filtered words above, the code below gets the embeddings stored in word2vec and then aggregates them...

In [None]:
def build_filtered_vocab(data: pd.DataFrame,
                         output_file_directory: str,
                         high_sd_cutoff: float = 3,
                         low_n_cutoff: int = 1):
    """Count word frequencies in ``data['lemma']`` and derive a filtered vocabulary.

    Each non-missing cell of the ``lemma`` column is converted to a string,
    stripped of every character that is neither a word character nor
    whitespace, and split on whitespace. The resulting words are counted.

    Two tab-separated files are written to ``output_file_directory`` (created
    if it does not exist): ``vocab_unfilt_freqs.txt`` with every word and
    ``vocab_filt_freqs.txt`` with the filtered words, both sorted by
    descending count.

    Args:
        data: DataFrame containing a ``lemma`` column.
        output_file_directory: Directory receiving the two frequency files.
        high_sd_cutoff: Words whose count is at or above
            ``mean + high_sd_cutoff * std`` (computed over the words that
            survive the low-frequency filter) are dropped. ``None`` disables
            this filter.
        low_n_cutoff: Words whose count is less than or equal to this value
            are dropped.

    Returns:
        A tuple ``(all_words, filtered_words)`` of lists, each in order of
        first appearance in the data.

    Raises:
        KeyError: If ``data`` has no ``lemma`` column.
    """
    # Skip missing cells: str(NaN) would otherwise be counted as the word "nan".
    lemma_cells = data['lemma'].dropna()

    # Raw string avoids the invalid-escape warning for \w and \s.
    non_word_chars = re.compile(r'[^\w\s]+')
    frequency = Counter(
        word
        for cell in lemma_cells
        for word in non_word_chars.sub('', str(cell)).split()
    )

    # Drop one-letter words and words occurring `low_n_cutoff` times or fewer.
    frequency_filt = {word: freq for word, freq in frequency.items()
                      if len(word) > 1 and freq > low_n_cutoff}

    # Optionally drop very frequent words. The emptiness guard avoids taking
    # the mean/std of an empty list (NaN plus a RuntimeWarning).
    if high_sd_cutoff is not None and frequency_filt:
        counts = list(frequency_filt.values())
        cutoff_freq = np.mean(counts) + (np.std(counts) * high_sd_cutoff)
        filteredWords = {word: freq for word, freq in frequency_filt.items() if freq < cutoff_freq}
    else:
        filteredWords = frequency_filt

    # Convert to DataFrames for exporting, most frequent words first.
    vocabfreq_all = pd.DataFrame(list(frequency.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)
    vocabfreq_filt = pd.DataFrame(list(filteredWords.items()), columns=["word", "count"]).sort_values(by='count', ascending=False)

    # Make sure the target directory exists before writing. An empty string
    # means the current working directory, which needs no creation.
    if output_file_directory:
        os.makedirs(output_file_directory, exist_ok=True)

    vocabfreq_all.to_csv(os.path.join(output_file_directory, 'vocab_unfilt_freqs.txt'), encoding='utf-8', index=False, sep='\t')
    vocabfreq_filt.to_csv(os.path.join(output_file_directory, 'vocab_filt_freqs.txt'), encoding='utf-8', index=False, sep='\t')

    return list(frequency.keys()), list(filteredWords.keys())

def aggregate_conversations(folder_path: str) -> pd.DataFrame:
    """Read every tab-separated ``.txt`` file in a folder into one DataFrame.

    Only regular files directly inside ``folder_path`` whose names end in
    ``.txt`` are read. Files are processed in sorted name order so the row
    order of the result is reproducible, rather than following the arbitrary
    order returned by ``os.listdir``.

    Args:
        folder_path: Directory containing the conversation files.

    Returns:
        The row-wise concatenation of all files with a fresh integer index,
        or an empty DataFrame when the folder holds no ``.txt`` files.
    """
    text_files = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.txt') and os.path.isfile(os.path.join(folder_path, f))
    )

    # Read everything first and concatenate once. Concatenating inside the
    # loop re-copies the accumulated rows on every iteration.
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name), sep='\t', encoding='utf-8')
        for file_name in text_files
    ]

    # pd.concat raises on an empty list, so keep the empty-frame result.
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

# Main execution: aggregate the per-conversation files, then build the vocabulary.
if __name__ == "__main__":
    folder_path = "data/prepped_stan_small"
    output_file_directory = "output"

    # The vocabulary files are written here; create the directory up front so
    # a fresh checkout does not fail on the first write.
    os.makedirs(output_file_directory, exist_ok=True)

    # Aggregate individual conversation files
    concatenated_df = aggregate_conversations(folder_path)

    # Build filtered vocabulary from aggregated data. The cutoffs come from the
    # notebook's configuration constants; without them the call would silently
    # fall back to the function defaults (3 SD) and ignore the configured values.
    vocab_all, vocab_filtered = build_filtered_vocab(
        concatenated_df,
        output_file_directory,
        high_sd_cutoff=HIGH_SD_CUTOFF,
        low_n_cutoff=LOW_N_CUTOFF,
    )