## A. Kevin's NLTK NLP Text Analytics Pipeline V1

### 1.00 Installing Libraries and Dependencies


libraries needed:
gensim
nltk
spacy
pandas
numpy

In [None]:
# Install/upgrade the third-party packages this notebook imports.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --upgrade nltk
# spaCy is imported by the preprocessing cell below; its 'en_core_web_sm' model
# must additionally be fetched with: python -m spacy download en_core_web_sm
%pip install --upgrade spacy
%pip install --upgrade pandas
%pip install --upgrade numpy
%pip install --upgrade plotly
%pip install --upgrade matplotlib
%pip install --upgrade jupyter ipywidgets

### 1.01 NLTK Preprocessing and tokenization for Peeking under the Hood Text Analytics

Running the code cell below will result in us pulling the following token types for each of our seven corpora.
1. word tokens - formed from decomposing sentences into their constituent pieces.
2. NLTK text objects - The NLTK library has a unique tokenizer that adds additional metadata to the word token which allows for unique analysis as compared to normal word tokens.
3. original sentence tokens - based on numerous features, but normally bounded by punctuation marks. This is why we normally tokenize this first before other text processing.
4. normalized sentence tokens - original sentence tokens that have been lowercased and have had stop words, punctuation, and special characters removed.

You can always figure out what type of data corpus you are dealing with by running these print checks. It is also extremely important to keep your documents categorized, lest they get out of control: the more processing and feature extraction you do, the more separate buckets of data you will have to keep track of.
* We will look at the utility of each token type in these modules

CSV to corpus
folder_path = 'data/outputFiles/csvOutputs/diarizedTranscripts'

## New code

1. needs 3.10 or 3.11 for spacy 
2. conda install ipykernel pandas numpy nltk spacy sentence-transformers scikit-learn
3. python -m spacy download en_core_web_sm

## Load CSV files from folder

In [1]:
import os
import glob
import pandas as pd

def read_csv_files(directory):
    """Load every CSV file found under ``directory`` into its own DataFrame.

    The search is recursive, so CSV files in sub-folders are included.

    Args:
        directory: Root folder to search.

    Returns:
        A list with one DataFrame per CSV file, ordered by file path. Each
        frame gets an extra ``source_file`` column holding the path it was
        read from. The list is empty when no CSV file is found.
    """
    pattern = os.path.join(directory, '**', '*.csv')
    # glob returns matches in arbitrary order; sorting keeps positions such as
    # transcript_dfs[0] stable between runs and machines.
    csv_files = sorted(glob.glob(pattern, recursive=True))
    data_frames = []
    for file in csv_files:
        df = pd.read_csv(file)
        df['source_file'] = file  # Keep track of the source file
        data_frames.append(df)
    return data_frames

# Example usage:
# Root folder that read_csv_files() searches (recursively) for transcript CSVs.
directory = 'data/rawTranscriptFiles'
# One DataFrame per CSV file found; each carries a 'source_file' column.
transcript_dfs = read_csv_files(directory)


## Processing each transcription separately

In [3]:
import re
import nltk
import spacy

# Tokenizer data and stop-word list used by tokenize_text() below.
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of 'punkt'; without it nltk.word_tokenize() raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
# spaCy pipeline used by spacy_analysis(); the model has to be installed
# beforehand (python -m spacy download en_core_web_sm).
nlp = spacy.load('en_core_web_sm')

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(nltk.corpus.stopwords.words('english'))

def preprocess_text(text):
    """Normalize one piece of transcript text for tokenization.

    The text is lowercased, every character that is not a letter or
    whitespace is deleted (so "don't" becomes "dont"), and runs of whitespace
    are collapsed to single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Lowercase conversion
    text = str(text).lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text):
    """Split ``text`` into NLTK word tokens and drop the stop words.

    Args:
        text: Text to tokenize.

    Returns:
        List of tokens, in original order, that are not in ``stop_words``.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        # Skip anything listed in the module-level stop-word set.
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def spacy_analysis(text):
    """Run the spaCy pipeline ``nlp`` on ``text``.

    Args:
        text: Text to analyse.

    Returns:
        A 2-tuple ``(pos_tags, entities)``: ``pos_tags`` is a list of
        ``(token text, token.pos_)`` pairs for every token, ``entities`` is a
        list of ``(entity text, entity.label_)`` pairs for every item in
        ``doc.ents``.
    """
    parsed = nlp(text)
    pos_tags = []
    for token in parsed:
        pos_tags.append((token.text, token.pos_))
    entities = [(span.text, span.label_) for span in parsed.ents]
    return pos_tags, entities


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mrhal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mrhal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Show the first rows of the first loaded transcript to check its columns.
# Raises IndexError if read_csv_files() found no CSV files.
print(transcript_dfs[0].head())

   sentence_number   start     end  \
0                1   1.733   2.453   
1                2   6.116  10.678   
2                3  12.339  17.542   
3                4  18.823  20.023   
4                5  22.885  24.066   

                                                text  \
0                                           the job.   
1  Glad to see things are going well and business...   
2  Andrea told me about your outstanding numbers ...   
3                             Keep up the good work.   
4                             Now to other business.   

                                               words     speaker  \
0  [{'word': 'the', 'start': 1.733, 'end': 1.853,...  SPEAKER_00   
1  [{'word': 'Glad', 'start': 6.116, 'end': 6.496...  SPEAKER_00   
2  [{'word': 'Andrea', 'start': 12.339, 'end': 12...  SPEAKER_00   
3  [{'word': 'Keep', 'start': 18.823, 'end': 19.0...  SPEAKER_00   
4  [{'word': 'Now', 'start': 22.885, 'end': 23.10...  SPEAKER_00   

                         

In [11]:
# Add cleaned text, tokens, POS tags and named entities to every transcript
# frame (the frames in transcript_dfs are modified in place).
for df in transcript_dfs:
    df['cleaned_transcript'] = df['text'].apply(preprocess_text)
    df['tokens'] = df['cleaned_transcript'].apply(tokenize_text)
    # Run spaCy once per row and split the (pos_tags, entities) pairs into two
    # columns directly; wrapping each pair in its own pd.Series is far slower.
    analyses = [spacy_analysis(text) for text in df['cleaned_transcript']]
    # dtype=object keeps each cell as the original Python list of tuples.
    df['pos_tags'] = pd.Series([pos for pos, _ in analyses], index=df.index, dtype=object)
    df['entities'] = pd.Series([ents for _, ents in analyses], index=df.index, dtype=object)


# Cursory Topic modeling

# Using Latent Dirichlet Allocation (LDA)
# First, you need to prepare the data for LDA:

## Now, apply LDA to each transcription:

In [None]:
# Fit one topic model per transcript and keep what is needed to inspect it.
# NOTE(review): `perform_lda` is not defined in any cell visible in this part
# of the notebook; on a fresh kernel this cell raises NameError unless it is
# defined elsewhere. It is expected to return (model, corpus, dictionary).
lda_models = []

for df in transcript_dfs:
    # One token list per transcript row, as produced by tokenize_text().
    tokens_list = df['tokens'].tolist()
    lda_model, corpus, dictionary = perform_lda(tokens_list)
    lda_models.append({
        'model': lda_model,
        'corpus': corpus,
        'dictionary': dictionary,
        # read_csv_files() writes the same path into every row, so the first
        # row identifies the file. Raises IndexError for a frame with no rows.
        'source_file': df['source_file'].iloc[0]
    })


## Old need to fix

In [None]:
import os
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

# Ensure you have the necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt'; without it sent_tokenize()/word_tokenize() raise LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Folder containing CSV files
folder_path = 'data/rawTranscriptFiles'

# Column name to extract text from
column_name = 'text'

# Function to clean text
def clean_text(text):
    """Return ``text`` lowercased with everything except a-z, 0-9 and whitespace removed."""
    lowered = text.lower()
    # After lowercasing, only a-z, digits and whitespace need to be kept.
    stripped = re.sub(r'[^a-z0-9\s]', '', lowered)
    return stripped

# List to hold the variable names and their lengths
variable_names_and_lengths = []

# Counter to create corpus variable names like corpus1, corpus2, etc.
counter = 1

# Stop words
stop_words = set(stopwords.words('english'))

# Iterate over each file in the folder.
# - sorted(): os.listdir() returns names in arbitrary order; sorting keeps the
#   corpusN numbering stable between runs.
# - 'processed_' check: this loop saves its result as processed_<name>.csv in
#   the SAME folder, so without the check a re-run would ingest its own output.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv') and not filename.startswith('processed_'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)

        # Create a variable name based on the counter
        base_var_name = f'corpus{counter}'

        # Step 1: Convert CSV file into a dataframe variable
        globals()[base_var_name] = df

        # Step 2: Extract and store raw text (all rows joined into one string)
        raw_text_var_name = f'{base_var_name}_rawText'
        raw_text = ' '.join(df[column_name].astype(str).tolist())
        globals()[raw_text_var_name] = raw_text

        # Step 3: Tokenize raw text, corpus-wide and per row
        raw_sentence_tokens = nltk.sent_tokenize(raw_text)
        raw_word_tokens = nltk.word_tokenize(raw_text)
        raw_text_obj = nltk.Text(raw_word_tokens)

        df['rawSentenceTokens'] = df[column_name].apply(nltk.sent_tokenize)
        df['rawWordTokens'] = df[column_name].apply(nltk.word_tokenize)
        df['rawTextObjects'] = df['rawWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_rawSentenceTokens'] = raw_sentence_tokens
        globals()[f'{base_var_name}_rawWordTokens'] = raw_word_tokens
        globals()[f'{base_var_name}_rawTextObjects'] = raw_text_obj

        # Step 4: Clean text and create processed text
        df['processedText'] = df[column_name].apply(clean_text)

        processed_text_var_name = f'{base_var_name}_processedText'
        processed_text = ' '.join(df['processedText'].astype(str).tolist())
        globals()[processed_text_var_name] = processed_text

        # Step 5: Tokenize processed text. Each raw sentence is cleaned on its
        # own so that the sentence boundaries found in Step 3 are preserved.
        processed_sentence_tokens = [nltk.sent_tokenize(clean_text(sent)) for sent in raw_sentence_tokens]
        processed_sentence_tokens_flat = [sent for sublist in processed_sentence_tokens for sent in sublist]
        processed_word_tokens = [nltk.word_tokenize(sent) for sent in processed_sentence_tokens_flat]
        processed_word_tokens_flat = [word for sublist in processed_word_tokens for word in sublist]
        processed_text_obj = nltk.Text(processed_word_tokens_flat)

        df['processedSentenceTokens'] = df['rawSentenceTokens'].apply(lambda x: [nltk.sent_tokenize(clean_text(sent)) for sent in x])
        df['processedSentenceTokens'] = df['processedSentenceTokens'].apply(lambda x: [item for sublist in x for item in sublist])
        df['processedWordTokens'] = df['processedSentenceTokens'].apply(lambda x: [nltk.word_tokenize(sent) for sent in x])
        df['processedWordTokens'] = df['processedWordTokens'].apply(lambda x: [item for sublist in x for item in sublist])
        df['processedTextObjects'] = df['processedWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_processedSentenceTokens'] = processed_sentence_tokens_flat
        globals()[f'{base_var_name}_processedWordTokens'] = processed_word_tokens_flat
        globals()[f'{base_var_name}_processedTextObjects'] = processed_text_obj

        # Step 6: Remove stop words and create fully processed text
        df['fullyProcessedText'] = df['processedText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

        fully_processed_text_var_name = f'{base_var_name}_fullyProcessedText'
        fully_processed_text = ' '.join(df['fullyProcessedText'].astype(str).tolist())
        globals()[fully_processed_text_var_name] = fully_processed_text

        # Step 7: Tokenize fully processed text.
        # The stop words are removed from each processed sentence first (same
        # filter as Step 6); previously the processed sentences were tokenized
        # unchanged, so the "fully processed" tokens still contained stop words.
        stopword_free_sentences = [
            ' '.join(word for word in sent.split() if word not in stop_words)
            for sent in processed_sentence_tokens_flat
        ]
        fully_processed_sentence_tokens = [nltk.sent_tokenize(sent) for sent in stopword_free_sentences]
        fully_processed_sentence_tokens_flat = [sent for sublist in fully_processed_sentence_tokens for sent in sublist]
        fully_processed_word_tokens = [nltk.word_tokenize(sent) for sent in fully_processed_sentence_tokens_flat]
        fully_processed_word_tokens_flat = [word for sublist in fully_processed_word_tokens for word in sublist]
        fully_processed_text_obj = nltk.Text(fully_processed_word_tokens_flat)

        df['fullyProcessedSentenceTokens'] = df['fullyProcessedText'].apply(nltk.sent_tokenize)
        df['fullyProcessedWordTokens'] = df['fullyProcessedSentenceTokens'].apply(lambda x: [nltk.word_tokenize(sent) for sent in x])
        df['fullyProcessedWordTokens'] = df['fullyProcessedWordTokens'].apply(lambda x: [item for sublist in x for item in sublist])
        df['fullyProcessedTextObjects'] = df['fullyProcessedWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_fullyProcessedSentenceTokens'] = fully_processed_sentence_tokens_flat
        globals()[f'{base_var_name}_fullyProcessedWordTokens'] = fully_processed_word_tokens_flat
        globals()[f'{base_var_name}_fullyProcessedTextObjects'] = fully_processed_text_obj

        # Save the updated dataframe to a new CSV file (next to the input file)
        new_file_path = os.path.join(folder_path, f'processed_{filename}')
        df.to_csv(new_file_path, index=False)

        # Store the lengths of each created variable
        # (character count for the text strings, item count for token lists)
        variable_names_and_lengths.append((raw_text_var_name, len(raw_text)))
        variable_names_and_lengths.append((f'{base_var_name}_rawSentenceTokens', len(raw_sentence_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_rawWordTokens', len(raw_word_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_processedText', len(processed_text)))
        variable_names_and_lengths.append((f'{base_var_name}_processedSentenceTokens', len(processed_sentence_tokens_flat)))
        variable_names_and_lengths.append((f'{base_var_name}_processedWordTokens', len(processed_word_tokens_flat)))
        variable_names_and_lengths.append((fully_processed_text_var_name, len(fully_processed_text)))
        variable_names_and_lengths.append((f'{base_var_name}_fullyProcessedSentenceTokens', len(fully_processed_sentence_tokens_flat)))
        variable_names_and_lengths.append((f'{base_var_name}_fullyProcessedWordTokens', len(fully_processed_word_tokens_flat)))

        # Increment the counter
        counter += 1

# Print the lengths of all created variables
for var_name, var_length in variable_names_and_lengths:
    print(f'{var_name}: {var_length}')


In [None]:
import os
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords

# Ensure you have the necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt'; without it sent_tokenize()/word_tokenize() raise LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Folder containing CSV files
folder_path = 'data/outputFiles/csvOutputs/diarizedTranscripts'

# Column name to extract text from
column_name = 'text'

# Function to clean text
def clean_text(text):
    """Lowercase ``text`` and delete every character other than a-z, 0-9 and whitespace."""
    # Lowercasing inside the call means the pattern only has to list a-z.
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# List to hold the variable names and their lengths
variable_names_and_lengths = []

# Counter to create corpus variable names like corpus1, corpus2, etc.
counter = 1

# Stop words
stop_words = set(stopwords.words('english'))

# Iterate over each file in the folder.
# - sorted(): os.listdir() returns names in arbitrary order; sorting keeps the
#   corpusN numbering stable between runs.
# - 'processed_' check: this loop saves its result as processed_<name>.csv in
#   the SAME folder, so without the check a re-run would ingest its own output.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv') and not filename.startswith('processed_'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)

        # Create a variable name based on the counter
        base_var_name = f'corpus{counter}'

        # Step 1: Convert CSV file into a dataframe variable
        globals()[base_var_name] = df

        # Step 2: Extract and store raw text (all rows joined into one string)
        raw_text_var_name = f'{base_var_name}_rawText'
        raw_text = ' '.join(df[column_name].astype(str).tolist())
        globals()[raw_text_var_name] = raw_text

        # Step 3: Tokenize raw text, corpus-wide and per row
        raw_sentence_tokens = nltk.sent_tokenize(raw_text)
        raw_word_tokens = nltk.word_tokenize(raw_text)
        raw_text_obj = nltk.Text(raw_word_tokens)

        df['rawSentenceTokens'] = df[column_name].apply(nltk.sent_tokenize)
        df['rawWordTokens'] = df[column_name].apply(nltk.word_tokenize)
        df['rawTextObjects'] = df['rawWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_rawSentenceTokens'] = raw_sentence_tokens
        globals()[f'{base_var_name}_rawWordTokens'] = raw_word_tokens
        globals()[f'{base_var_name}_rawTextObjects'] = raw_text_obj

        # Step 4: Clean text and create processed text
        df['processedText'] = df[column_name].apply(clean_text)

        processed_text_var_name = f'{base_var_name}_processedText'
        processed_text = ' '.join(df['processedText'].astype(str).tolist())
        globals()[processed_text_var_name] = processed_text

        # Step 5: Tokenize processed text.
        # NOTE(review): clean_text() has already removed all punctuation, so
        # sent_tokenize() has no sentence-ending marks to split on here and
        # presumably returns the text as a single sentence - confirm intent.
        processed_sentence_tokens = nltk.sent_tokenize(processed_text)
        processed_word_tokens = nltk.word_tokenize(processed_text)
        processed_text_obj = nltk.Text(processed_word_tokens)

        df['processedSentenceTokens'] = df['processedText'].apply(nltk.sent_tokenize)
        df['processedWordTokens'] = df['processedText'].apply(nltk.word_tokenize)
        df['processedTextObjects'] = df['processedWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_processedSentenceTokens'] = processed_sentence_tokens
        globals()[f'{base_var_name}_processedWordTokens'] = processed_word_tokens
        globals()[f'{base_var_name}_processedTextObjects'] = processed_text_obj

        # Step 6: Remove stop words and create fully processed text
        df['fullyProcessedText'] = df['processedText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

        fully_processed_text_var_name = f'{base_var_name}_fullyProcessedText'
        fully_processed_text = ' '.join(df['fullyProcessedText'].astype(str).tolist())
        globals()[fully_processed_text_var_name] = fully_processed_text

        # Step 7: Tokenize fully processed text
        fully_processed_sentence_tokens = nltk.sent_tokenize(fully_processed_text)
        fully_processed_word_tokens = nltk.word_tokenize(fully_processed_text)
        fully_processed_text_obj = nltk.Text(fully_processed_word_tokens)

        df['fullyProcessedSentenceTokens'] = df['fullyProcessedText'].apply(nltk.sent_tokenize)
        df['fullyProcessedWordTokens'] = df['fullyProcessedText'].apply(nltk.word_tokenize)
        df['fullyProcessedTextObjects'] = df['fullyProcessedWordTokens'].apply(nltk.Text)

        globals()[f'{base_var_name}_fullyProcessedSentenceTokens'] = fully_processed_sentence_tokens
        globals()[f'{base_var_name}_fullyProcessedWordTokens'] = fully_processed_word_tokens
        globals()[f'{base_var_name}_fullyProcessedTextObjects'] = fully_processed_text_obj

        # Save the updated dataframe to a new CSV file (next to the input file)
        new_file_path = os.path.join(folder_path, f'processed_{filename}')
        df.to_csv(new_file_path, index=False)

        # Store the lengths of each created variable
        # (character count for the text strings, item count for token lists)
        variable_names_and_lengths.append((raw_text_var_name, len(raw_text)))
        variable_names_and_lengths.append((f'{base_var_name}_rawSentenceTokens', len(raw_sentence_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_rawWordTokens', len(raw_word_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_processedText', len(processed_text)))
        variable_names_and_lengths.append((f'{base_var_name}_processedSentenceTokens', len(processed_sentence_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_processedWordTokens', len(processed_word_tokens)))
        variable_names_and_lengths.append((fully_processed_text_var_name, len(fully_processed_text)))
        variable_names_and_lengths.append((f'{base_var_name}_fullyProcessedSentenceTokens', len(fully_processed_sentence_tokens)))
        variable_names_and_lengths.append((f'{base_var_name}_fullyProcessedWordTokens', len(fully_processed_word_tokens)))

        # Increment the counter
        counter += 1

# Print the lengths of all created variables
for var_name, var_length in variable_names_and_lengths:
    print(f'{var_name}: {var_length}')



In [None]:
import os
import pandas as pd
import nltk

# Ensure you have the necessary NLTK data
nltk.download('punkt')

# Folder containing CSV files
folder_path = 'data/outputFiles/csvOutputs/diarizedTranscripts'

# Column name to extract text from
column_name = 'text'

# List to hold the variable names
variable_names = []

# Counter to create corpus variable names like corpus1, corpus2, etc.
counter = 1

# Iterate over each file in the folder.
# - sorted(): os.listdir() returns names in arbitrary order; sorting keeps the
#   corpusN numbering stable between runs.
# - 'processed_' check: another cell of this notebook writes processed_<name>.csv
#   files into this same folder; they are derived output, not source transcripts.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv') and not filename.startswith('processed_'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)

        # Convert the column data to a single string
        text = ' '.join(df[column_name].astype(str).tolist())

        # Create a variable name based on the counter
        variable_name = f'corpus{counter}'

        # Store the text in the global namespace.
        # NOTE: this rebinds corpusN to a plain string, replacing any DataFrame
        # that an earlier cell stored under the same name.
        globals()[variable_name] = text

        # Append the variable name to the list
        variable_names.append(variable_name)

        # Increment the counter
        counter += 1

# Print the names of all variables
for variable_name in variable_names:
    print(variable_name)

# Example of how to access the variables
# print(corpus1)
# print(corpus2)
# ...



For CSV folder instead of text

In [None]:
import os
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Download required NLTK resources
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt'; without it sent_tokenize()/word_tokenize() raise LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize stop words
stop_words_set = set(stopwords.words('english'))

# Set the path to your CSV files directory
input_directory = 'data/outputFiles/csvOutputs/diarizedTranscripts'
output_directory = 'data/outputFiles/processedFiles/newpipline'

# Automatically list all CSV files in the input directory as
# (input path, output path) pairs.
# - sorted(): os.listdir() order is arbitrary; sorting keeps the corpusN keys
#   assigned below stable between runs.
# - 'processed_' files are skipped: another cell of this notebook writes them
#   into this same input folder as derived output.
files_to_process = [(os.path.join(input_directory, f), os.path.join(output_directory, f'{os.path.splitext(f)[0]}_processed.csv'))
                    for f in sorted(os.listdir(input_directory))
                    if f.endswith('.csv') and not f.startswith('processed_')]

# Function to process a file
def process_file(input_path, output_path):
    """Clean the ``text`` column of one transcript CSV and collect its tokens.

    Every row's text is split into sentences; each sentence is lowercased,
    word-tokenized and reduced to its alphabetic non-stop-word tokens. The
    cleaned text of each row is stored in a new ``processed_text`` column and
    the frame is written to ``output_path``.

    Args:
        input_path: CSV file to read; must contain a ``text`` column.
        output_path: Where the CSV with the added column is written. Missing
            parent folders are created.

    Returns:
        dict with the keys
        ``sentence_tokens`` (raw sentences of ALL rows, in row order),
        ``cleaned_sentences`` (their cleaned counterparts),
        ``word_tokens`` (word tokens of all cleaned sentences),
        ``text_objects`` (``nltk.Text`` over ``word_tokens``) and
        ``removed_elements`` (dropped tokens by reason: ``punctuation``,
        ``non_alpha``, ``stop_words``).
        Errors are reported with ``print`` and not raised; the dict is then
        returned with whatever had been collected up to that point.
    """
    results = {'sentence_tokens': [], 'cleaned_sentences': [], 'word_tokens': [], 'text_objects': None, 'removed_elements': {'punctuation': [], 'non_alpha': [], 'stop_words': []}}
    try:
        df = pd.read_csv(input_path)

        # Accumulated over every row. Previously only the values of the LAST
        # row reached the returned dict, so per-corpus statistics were wrong.
        all_sent_tokens = []
        all_cleaned_sentences = []
        processed_rows = []

        for raw_text in df['text']:
            # Empty CSV cells are read as NaN; treat them as empty text
            # instead of letting the tokenizer fail and abort the whole file.
            raw_text = '' if pd.isna(raw_text) else str(raw_text)
            sent_tokens = sent_tokenize(raw_text)
            cleaned_sent_tokens = []

            for sentence in sent_tokens:
                words = word_tokenize(sentence.lower())
                cleaned_words = []
                for word in words:
                    if word.isalpha():
                        if word not in stop_words_set:
                            cleaned_words.append(word)
                        else:
                            results['removed_elements']['stop_words'].append(word)
                    else:
                        results['removed_elements']['non_alpha'].append(word)
                        # Punctuation is tracked as a subset of non_alpha.
                        if any(char in string.punctuation for char in word):
                            results['removed_elements']['punctuation'].append(word)

                cleaned_sentence = ' '.join(cleaned_words)
                cleaned_sent_tokens.append(cleaned_sentence)

            all_sent_tokens.extend(sent_tokens)
            all_cleaned_sentences.extend(cleaned_sent_tokens)
            processed_rows.append(' '.join(cleaned_sent_tokens))

        df['processed_text'] = processed_rows

        # dirname is '' for a bare filename, and os.makedirs('') would raise.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_path, index=False)

        word_tokens = word_tokenize(' '.join(all_cleaned_sentences))
        text_objects = nltk.Text(word_tokens)

        results.update({
            'sentence_tokens': all_sent_tokens,
            'cleaned_sentences': all_cleaned_sentences,
            'word_tokens': word_tokens,
            'text_objects': text_objects
        })

    except FileNotFoundError:
        print(f"Error: The file '{input_path}' was not found. Check the file path.")
    except Exception as e:
        # Deliberate best effort: one bad file must not stop the batch.
        print(f"An error occurred: {e}")
    return results

# Dictionary to hold all results, keyed "corpus1", "corpus2", ... in the
# order of files_to_process
all_results = {}
corpus_number = 1  # Initialize a counter for the corpus number

# Process each file
for input_path, output_path in files_to_process:
    # process_file() prints errors instead of raising and still returns its
    # results dict, so every file gets an entry even when it failed.
    results = process_file(input_path, output_path)
    corpus_key = f'corpus{corpus_number}'  # Generate a corpus key like "corpus1", "corpus2", etc.
    all_results[corpus_key] = results
    corpus_number += 1  # Increment the corpus number for the next iteration

# Example of accessing the results
for key, value in all_results.items():
    # An empty word_tokens list means the file failed or had no usable words.
    if 'word_tokens' in value and value['word_tokens']:
        print(f"Results for {key}:")
        print(value['word_tokens'][:5])  # Display first 5 word tokens
    else:
        print(f"No word tokens available for {key}")


In [None]:
import os
import nltk
import string 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Download required NLTK resources
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' instead of
# 'punkt'; without it sent_tokenize()/word_tokenize() raise LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize stop words
stop_words_set = set(stopwords.words('english'))

# Set the path to your text files directory
input_directory = 'data/rawTextCorpora/summerPD2023/Transcriptions/NSF_Transcripts/mergedtranscript'
output_directory = 'data/outputFiles/processedFiles/newpipline'

# Automatically list all text files in the input directory as
# (input path, output path) pairs. os.listdir() order is arbitrary, so the
# names are sorted to keep the corpusN keys assigned below stable between runs.
files_to_process = [(os.path.join(input_directory, f), os.path.join(output_directory, f'{os.path.splitext(f)[0]}_processed.txt'))
                    for f in sorted(os.listdir(input_directory)) if f.endswith('.txt')]

# Function to process a file
def process_file(input_path, output_path):
    """Clean one plain-text transcript and collect its tokens.

    The file is read as UTF-8 (undecodable bytes are replaced), newlines
    become spaces and the filler words "yeah", "Yeah" and "like" are removed.
    The text is split into sentences; each sentence is lowercased,
    word-tokenized and reduced to its alphabetic non-stop-word tokens. The
    cleaned sentences are written to ``output_path``, one per line.

    Args:
        input_path: Text file to read.
        output_path: File the cleaned sentences are written to. Missing
            parent folders are created.

    Returns:
        dict with the keys ``sentence_tokens`` (raw sentences),
        ``cleaned_sentences``, ``word_tokens`` (tokens of the cleaned text),
        ``text_objects`` (``nltk.Text`` over ``word_tokens``) and
        ``removed_elements`` (dropped tokens by reason: ``punctuation``,
        ``non_alpha``, ``stop_words``).
        Errors are reported with ``print`` and not raised; the dict is then
        returned with whatever had been collected up to that point.
    """
    import re  # local import: this cell does not import re itself

    results = {'sentence_tokens': [], 'cleaned_sentences': [], 'word_tokens': [], 'text_objects': None, 'removed_elements': {'punctuation': [], 'non_alpha': [], 'stop_words': []}}
    try:
        with open(input_path, 'rt', encoding='utf-8', errors='replace') as file:
            raw_text = file.read().replace("\n", " ")
        # Remove the fillers as whole words only. Plain str.replace() also cut
        # them out of longer words ("likely" -> "ly", "dislike" -> "dis").
        raw_text = re.sub(r'\b(?:yeah|Yeah|like)\b', '', raw_text)

        sent_tokens = sent_tokenize(raw_text)
        cleaned_sent_tokens = []

        for sentence in sent_tokens:
            words = word_tokenize(sentence.lower())
            cleaned_words = []
            for word in words:
                if word.isalpha():
                    if word not in stop_words_set:
                        cleaned_words.append(word)
                    else:
                        results['removed_elements']['stop_words'].append(word)
                else:
                    results['removed_elements']['non_alpha'].append(word)
                    # Punctuation is tracked as a subset of non_alpha.
                    if any(char in string.punctuation for char in word):
                        results['removed_elements']['punctuation'].append(word)

            cleaned_sentence = ' '.join(cleaned_words)
            cleaned_sent_tokens.append(cleaned_sentence)

        # dirname is '' for a bare filename, and os.makedirs('') would raise.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as file:
            for sentence in cleaned_sent_tokens:
                file.write(sentence + '\n')

        cleaned_text = ' '.join(cleaned_sent_tokens)
        word_tokens = word_tokenize(cleaned_text)
        text_objects = nltk.Text(word_tokens)

        results.update({
            'sentence_tokens': sent_tokens,
            'cleaned_sentences': cleaned_sent_tokens,
            'word_tokens': word_tokens,
            'text_objects': text_objects
        })

    except FileNotFoundError:
        print(f"Error: The file '{input_path}' was not found. Check the file path.")
    except Exception as e:
        # Deliberate best effort: one bad file must not stop the batch.
        print(f"An error occurred: {e}")
    return results

# Dictionary to hold all results, keyed "corpus1", "corpus2", ... in the
# order of files_to_process
all_results = {}
corpus_number = 1  # Initialize a counter for the corpus number

# Process each file
for input_path, output_path in files_to_process:
    # process_file() prints errors instead of raising and still returns its
    # results dict, so every file gets an entry even when it failed.
    results = process_file(input_path, output_path)
    corpus_key = f'corpus{corpus_number}'  # Generate a corpus key like "corpus1", "corpus2", etc.
    all_results[corpus_key] = results
    corpus_number += 1  # Increment the corpus number for the next iteration

# Example of accessing the results
for key, value in all_results.items():
    if 'word_tokens' in value and value['word_tokens']:
        print(f"Results for {key}:")
        print(value['word_tokens'][:5])  # Display first 5 word tokens
    else:
        print(f"No word tokens available for {key}")


# Assuming 'all_results' is a dictionary structured like {'corpus1': {...}, 'corpus2': {...}, ...}
corpora_keys = list(all_results.keys())  # Get all the keys which are corpus identifiers

# For each corpus key, create variables for different elements.
# enumerate(..., start=1) yields the 1-based position directly (to match
# 'corpus1' with index 1, etc.) instead of searching the list with .index()
# on every iteration.
for index, corpus_key in enumerate(corpora_keys, start=1):
    # Dynamically create variable names and assign data
    globals()[f'word_tokens_corpus{index}'] = all_results[corpus_key]['word_tokens']
    globals()[f'text_objects_corpus{index}'] = all_results[corpus_key]['text_objects']
    globals()[f'sentence_tokens_corpus{index}'] = all_results[corpus_key]['sentence_tokens']
    globals()[f'normalized_sentences_corpus{index}'] = all_results[corpus_key]['cleaned_sentences']
    globals()[f'removed_elements_corpus{index}'] = all_results[corpus_key]['removed_elements']

# Example of how to access these variables dynamically
for i in range(1, len(corpora_keys) + 1):
    print(f"word_tokens_corpus{i} is a:", type(globals()[f'word_tokens_corpus{i}']), "containing", len(globals()[f'word_tokens_corpus{i}']), "tokens")
    print(f"sentence_tokens_corpus{i} is a:", type(globals()[f'sentence_tokens_corpus{i}']), "containing", len(globals()[f'sentence_tokens_corpus{i}']), "tokens")

### 1.02 Splitting by Sections 

In [None]:
import re

# Small inline sample used to demonstrate splitting a transcript into sections.
transcript_text = "Section 1: Introduction to AI. AI is a broad field. Section 2: Applications of AI. AI is used in many industries."

# Split the transcript by "Section" followed by any digit and a colon
# (the matched "Section N: " headers themselves are dropped from the result)
docs = re.split(r"Section \d+: ", transcript_text)
# Remove any empty strings that might have occurred during splitting
# (the text starts with a header, so the first piece is empty)
docs = [doc.strip() for doc in docs if doc.strip()]

print(docs)


## 2.00 Peeking Under The Hood Text Analytics 

### 2.01 Average Sentence Length
Average Sentence Length uses the total number of words and total number of sentences
in a corpus to calculate exactly what it says: the average sentence length.
While the equation is very basic and straightforward it provides information that can
be used to infer, for example, how complex sentences are on average throughout a
given text corpus.

In [None]:
# Assuming `all_results` is populated with multiple corpus entries
for corpus_key, data in all_results.items():
    # Pull the sentence and word tokens stored for this corpus; a missing key
    # is treated as an empty list.
    sentence_tokens = data.get('sentence_tokens', [])
    word_tokens = data.get('word_tokens', [])

    # Words per sentence; 0 when there are no sentences, so the division can
    # never hit a zero denominator.
    # Note: process_file() builds word_tokens from the cleaned text (stop words
    # and non-alphabetic tokens removed) but keeps sentence_tokens raw.
    average_sentence_length = len(word_tokens) / len(sentence_tokens) if sentence_tokens else 0

    # Report for this corpus
    print(f"\n{corpus_key}:")
    print("Number of sentences:", len(sentence_tokens))
    print("Number of word tokens:", len(word_tokens))
    print("Average sentence length:", average_sentence_length)


### 2.02 Average Word Length Distribution
Another fairly straightforward measure that can provide insight into how long, on average, words are in a given corpus.

In [None]:
# Assuming `all_results` is populated with multiple corpus entries
for corpus_key, data in all_results.items():
    word_tokens = data.get('word_tokens', [])

    # Mean number of characters per token; 0 for a corpus without tokens so
    # the division can never hit a zero denominator.
    avg_word_length = sum(map(len, word_tokens)) / len(word_tokens) if word_tokens else 0

    # Report for this corpus
    print(f"\n{corpus_key}:")
    print("Number of word tokens:", len(word_tokens))
    print("Average word length:", avg_word_length)



### 2.03 Lexical Diversity
Lexical diversity quantifies the variety of unique words found in a document. It produces a numerical measure that indicates how diverse the vocabulary is that is used in a text. Broadly speaking, scores of (0.8 - 1) are considered extremely high and difficult to maintain in typical communicative texts. Scores of 0.4-0.79 are considered moderate to high; most high-quality texts fall in this range. Scores of (0 - 0.39) are considered low lexical diversity and tend to suggest highly specialized or technical language usage (e.g., instruction manuals) or language aimed at young readers. This measure is sensitive to corpus length (longer corpora have more opportunities to repeat words), but comparing lexical diversity scores can allow for quantitative comparison that might suggest potential changes in how the usage of language may differ between groups.

In [None]:
# Assuming `all_results` is populated with multiple corpus entries
for corpus_key, data in all_results.items():
    word_tokens = data.get('word_tokens', [])

    # Type/token ratio: distinct tokens divided by all tokens; 0 for a corpus
    # without tokens so the division can never hit a zero denominator.
    lexical_diversity = len(set(word_tokens)) / len(word_tokens) if word_tokens else 0

    # Report for this corpus
    print(f"\n{corpus_key}:")
    print("Number of word tokens:", len(word_tokens))
    print("Lexical diversity:", lexical_diversity)



### 2.04 Unique Words Over Time
Unique words can be used to identify the frequency of words that appear only once in a given corpus. We can also print a list of these word tokens. Looking at unique words between or across text corpora can allow us to look for the appearances and disappearances of specialized educational terminology over time. To find the frequency (number) of unique words, use the following code:

In [None]:
# Assuming `all_results` contains data for multiple groups
for corpus_key, data in all_results.items():
    word_tokens = data.get('word_tokens', [])

    # Distinct tokens of this corpus: each word counts once, however often it
    # occurs (this is not restricted to words that occur exactly once).
    unique_words = {token for token in word_tokens}
    unique_word_count = len(unique_words)

    # Report for this corpus
    print(f"\n{corpus_key}:")
    print("Number of unique words:", unique_word_count)



### 2.05 Twenty-Five Most Frequent Words

In [None]:
import nltk

# Print the 25 most frequent tokens of every corpus that has an NLTK text object
for corpus_key, data in all_results.items():
    text_objects = data.get('text_objects', None)

    # Nothing to count for this corpus: say so and move on
    if not text_objects:
        print(f"No text objects available for {corpus_key}")
        continue

    # Frequency distribution over the tokens of the text object
    freq_dist = nltk.FreqDist(text_objects)
    most_common_words = freq_dist.most_common(25)

    print(f"\nMost common words in {corpus_key}:")
    print(most_common_words)


### 2.06 Display all unique words found in a corpus

In [None]:
# Vocabulary (set of word tokens) of every corpus in all_results that has word tokens
corpora_tokens = dict(
    (key, set(data['word_tokens']))
    for key, data in all_results.items()
    if 'word_tokens' in data
)

def find_unique_words(corpora_tokens):
    """Return, for each corpus, the words that occur in no other corpus.

    Args:
        corpora_tokens: dict mapping a corpus name to the set of its word tokens.

    Returns:
        dict mapping each corpus name to the set of tokens that are absent
        from the union of all the other corpora.
    """
    exclusive = {}
    for name, tokens in corpora_tokens.items():
        # Union of every vocabulary except the one currently examined
        others = set().union(
            *(other_tokens for other_name, other_tokens in corpora_tokens.items()
              if other_name != name)
        )
        exclusive[name] = tokens - others
    return exclusive

# Words that appear in exactly one corpus, keyed by that corpus
unique_words_by_corpus = find_unique_words(corpora_tokens)

# List them alphabetically, one line per corpus
for corpus_name, unique_words in unique_words_by_corpus.items():
    alphabetical = sorted(unique_words)
    print(f"Words exclusive to {corpus_name}:", alphabetical)


### 2.07 Most frequently used words across all corpora:

In [None]:
from collections import Counter
import nltk

# Word-token list of every corpus in all_results that has word tokens
corpora_tokens = dict(
    (key, data['word_tokens'])
    for key, data in all_results.items()
    if 'word_tokens' in data
)

# Pool the tokens of all corpora into one flat list
all_tokens = []
for tokens in corpora_tokens.values():
    all_tokens += tokens

# Count every token once over the pooled list
token_freq_dist = Counter(all_tokens)

# The 100 highest counts (change the argument to list more or fewer words)
most_common_words = token_freq_dist.most_common(100)

def print_in_columns(data, columns=3):
    """Print (word, frequency) pairs, `columns` pairs per output line.

    Args:
        data: sequence of (word, frequency) pairs that supports slicing.
        columns: number of pairs placed on each printed line.
    """
    row_starts = range(0, len(data), columns)
    for start in row_starts:
        row = data[start:start + columns]
        # Each pair is rendered as "word: freq"; pairs are separated by two spaces
        cells = [f"{word}: {freq}" for word, freq in row]
        print("  ".join(cells))

# Show the ranked words, one "word: frequency" pair per line
print("Most frequently used words across all corpora:")
print_in_columns(most_common_words, 1)


### 2.08 N-grams and collocations
N-grams point out recurring word combinations found throughout the text corpus. For example, "spring break" is a bigram (two words), while "New York City" is a trigram (three words). Bigrams and repeated collocations of words convey a lot of information about the contents of the text corpus.
To generate an ordered list of the most common bigrams, use the following code:

In [None]:
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
import pandas as pd
import os

# Export the complete bigram and trigram frequency tables of every corpus in
# all_results to CSV files, sorted from most to least frequent.
output_directory = 'data/outputFiles/ngramFrequencies'
os.makedirs(output_directory, exist_ok=True)  # Ensure the output directory exists

for key, data in all_results.items():
    # Corpora without an NLTK text object have nothing to export
    if 'text_objects' not in data:
        continue
    text_object = data['text_objects']

    # Bigrams: every (pair, count) entry, highest count first
    bigram_finder = BigramCollocationFinder.from_words(text_object)
    bigrams = bigram_finder.ngram_fd.items()
    bigrams_sorted = sorted(bigrams, key=lambda item: item[1], reverse=True)
    bigram_df = pd.DataFrame(bigrams_sorted, columns=['Bigram', 'Frequency'])

    bigram_filename = os.path.join(output_directory, f"{key}_bigrams.csv")
    bigram_df.to_csv(bigram_filename, index=False)
    # The whole table is written, so report the real row count rather than "Top 50"
    print(f"All {len(bigram_df)} bigrams for {key} saved to {bigram_filename}")

    # Trigrams: same procedure with three-word sequences
    trigram_finder = TrigramCollocationFinder.from_words(text_object)
    trigrams = trigram_finder.ngram_fd.items()
    trigrams_sorted = sorted(trigrams, key=lambda item: item[1], reverse=True)
    trigram_df = pd.DataFrame(trigrams_sorted, columns=['Trigram', 'Frequency'])

    trigram_filename = os.path.join(output_directory, f"{key}_trigrams.csv")
    trigram_df.to_csv(trigram_filename, index=False)
    print(f"All {len(trigram_df)} trigrams for {key} saved to {trigram_filename}")

    print("\n")


In [None]:
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder

# Tabulate the most frequent bigrams and trigrams of every corpus in all_results.
# The headers are built from the same constants that are passed to tabulate(),
# so the printed "Top N" always matches the number of columns actually shown.
NUM_BIGRAMS_SHOWN = 25
NUM_TRIGRAMS_SHOWN = 5

for key, data in all_results.items():
    if 'text_objects' in data:
        text_object = data['text_objects']

        # Most frequent bigrams
        print(f"Top {NUM_BIGRAMS_SHOWN} bigrams for {key}:")
        bigram_finder = BigramCollocationFinder.from_words(text_object)
        bigram_finder.ngram_fd.tabulate(NUM_BIGRAMS_SHOWN)

        # Most frequent trigrams
        print(f"Top {NUM_TRIGRAMS_SHOWN} trigrams for {key}:")
        trigram_finder = TrigramCollocationFinder.from_words(text_object)
        trigram_finder.ngram_fd.tabulate(NUM_TRIGRAMS_SHOWN)
        print("\n")


### 2.09 Concordance
Concordance is an NLTK Text object method that also looks at word distribution, but specifically shows the words found before and after a specific word of choice. Concordance allows us to find out how words are used contextually throughout a corpus. This can be particularly powerful when looking at trends over time or between groups. For example, in the sample below we search for all the contextual occurrences of a chosen word in each of our corpora.

In [None]:
# Show every occurrence of one word, with its surrounding context, in each corpus.
# all_results maps corpus names to dicts that may hold an NLTK text object
# under 'text_objects'.

# Word to look up. The header and the search both read this variable, so the
# label can no longer name a different word than the one actually searched.
search_word = "email"

for key, data in all_results.items():
    if 'text_objects' in data:
        text_object = data['text_objects']
        print(f"Concordance for '{search_word}' in {key}:")
        text_object.concordance(search_word, width=150)
        print("\n")  # Adding a newline for better readability between results


## 3.00 Word Frequency Distribution

### 3.01 Bag of Words Frequency distribution

This will search for each word in the bag of words to find its frequency in each text corpus.

In [None]:
import nltk
from nltk.probability import FreqDist
import pandas as pd


# Word tokens of every corpus, keyed by corpus (file) name:
# {'corpus1': ['word1', 'word2', ...], 'corpus2': [...], ...}
all_word_tokens = {}
for file_key, result in all_results.items():
    all_word_tokens[file_key] = result['word_tokens']

# Keywords to track across corpora, in alphabetical order
keywords = sorted([
    'students', "science", 'engagement', 'ability', 'community', 'talk',
    'culture', 'ownership', 'regular', 'successful', 'participation', 'expectations',
    'ap', 'assessment', 'cultural', 'phenomena', 'important', 'know', 'standards', 'relationship',
    "engaging", 'literacy', 'relationship', 'kids', 'connect', 'student', 'classroom',
    'teacher', 'teaching', 'school', 'class', 'curriculum', 'learn', 'approach', 'talking',
    'discussion', 'love', 'proud', 'like', 'difficult', 'actually', 'know', 'questions', 'know',
    'think', 'want', 'kind', 'time', 'grade', 'thinking', 'different', 'talk', 'conversation',
    'discussion', 'hard', 'saying',
])

# Per corpus: how often each keyword occurs among its word tokens
freq_distributions = {}
for corpus_name, tokens in all_word_tokens.items():
    freq_dist = FreqDist(tokens)
    freq_distributions[corpus_name] = dict((word, freq_dist[word]) for word in keywords)

# Keywords become rows and corpora become columns
freq_df = pd.DataFrame(freq_distributions)

# Only the first 35 keyword rows are printed
print(freq_df.head(35))


### 3.02 Bag of Words Frequency Distribution with Plots

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import re  # local import: used for the whole-word keyword matching below

# Keywords to track. The raw list repeats several words ('know', 'talk',
# 'discussion', 'relationship'); set() removes the repeats so that every keyword
# gets exactly one consecutive row on the y-axis and is scattered only once.
keywords = sorted(set(['students', "science", 'engagement', 'ability', 'community', 'talk',
            'culture', 'ownership', 'regular', 'successful', 'participation', 'expectations', 
            'ap', 'assessment', 'cultural', 'phenomena', 'important', 'know', 'standards', 'relationship',
            "engaging", 'literacy', 'relationship', 'kids', 'connect', 'student', 'classroom', 
            'teacher', 'teaching', 'school', 'class', 'curriculum', 'learn', 'approach', 'talking', 'discussion', 'love', 'proud', 'like', 'difficult', 'actually', 'know', 'questions', 'know',
            'think', 'want', 'kind','time', 'grade', 'thinking', 'different', 'talk', 'conversation', 'discussion', 'hard', 'saying']), reverse=True)

# Not referenced by the plotting code in this cell
fifteen_minute_marks = {
    'mathTalk_file1': [15, 23, 43, 45, 67, 85, 190],
    'mathTalk_file2': [205, 443, 520, 723, 986, 1222, 1517],

}

# Assuming all_results is defined and populated by the earlier cells

# Number of corpora
num_corpora = len(all_results)

# One subplot row per corpus. squeeze=False always returns a 2-D array of axes,
# which is flattened so the single-corpus case needs no special handling.
fig, axes = plt.subplots(num_corpora, 1, figsize=(25, num_corpora*9), sharex=True, squeeze=False)
axes = axes.ravel()

# Mapping keywords to numeric values (y positions) for plotting
keyword_mapping = {keyword: i for i, keyword in enumerate(keywords)}

# Whole-word patterns: a plain substring test would count 'ap' inside "happy"
# or 'like' inside "likely" as keyword hits.
keyword_patterns = {keyword: re.compile(r"\b" + re.escape(keyword) + r"\b") for keyword in keywords}

for ax, (file_key, results) in zip(axes, all_results.items()):
    ax.set_title(f"Corpus: {file_key}")
    # Lowercase each sentence once instead of once per keyword
    sentences = [sentence.lower() for sentence in results['sentence_tokens']]
    for keyword in keywords:
        pattern = keyword_patterns[keyword]
        # 1-based numbers of the sentences that contain the keyword
        sentence_nums = [i + 1 for i, sentence in enumerate(sentences) if pattern.search(sentence)]

        y_values = np.full(len(sentence_nums), keyword_mapping[keyword], dtype=float)
        ax.scatter(sentence_nums, y_values, label=keyword, alpha=0.6, edgecolors='none')

    ax.set_yticks(list(keyword_mapping.values()))
    ax.set_yticklabels(list(keyword_mapping.keys()))
    # Label every subplot, not only the last one created
    ax.set_ylabel('Keywords')

# The x-axis is shared, so only the bottom subplot carries the label
axes[-1].set_xlabel('Sentence Number')

# Add common legend and labels
# fig.legend(keywords, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=len(keywords))

plt.tight_layout()

# Save the figure to an image file before displaying
plt.savefig('my_plots.png', bbox_inches='tight')

plt.show()

### 3.03 Combined

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

import re  # local import: used for the whole-word keyword matching below

# Assuming all_results is defined and populated by the earlier cells

# Keywords to track. The raw list repeats several words ('know', 'talk',
# 'discussion', 'relationship'); dict.fromkeys drops the repeats while keeping
# the first-seen order, so every keyword gets one consecutive y position and
# one trace per corpus.
keywords = list(dict.fromkeys(['students', "science", 'engagement', 'ability', 'community', 'talk',
            'culture', 'ownership', 'regular', 'successful', 'participation', 'expectations', 
            'ap', 'assessment', 'cultural', 'phenomena', 'important', 'know', 'standards', 'relationship',
            "engaging", 'literacy', 'relationship', 'kids', 'connect', 'student', 'classroom', 
            'teacher', 'teaching', 'school', 'class', 'curriculum', 'learn', 'approach', 'talking', 'discussion', 'love', 'proud', 'like', 'difficult', 'actually', 'know', 'questions', 'know',
            'think', 'want', 'kind','time', 'grade', 'thinking', 'different', 'talk', 'conversation', 'discussion', 'hard', 'saying']))

# Not referenced by the plotting code in this cell
fifteen_minute_marks = {
    'mathTalk_file1': [15, 23, 43, 45, 67, 85, 190],
    'mathTalk_file2': [205, 443, 520, 723, 986, 1222, 1517],
    'mathTalk_file3': [174, 430, 521, 731, 986, 1198, 1557],
    'mathTalk_file4': [52, 273, 300, 352, 406, 486, 534],
    'mathTalk_file5': [66, 169, 250, 355, 482, 649, 760],
    'mathTalk_file6': [316, 654, 800, 1159, 1575, 1884, 2200],
    'mathTalk_file7': [114, 312, 381, 723, 1027, 1255, 1519],
}


# Mapping keywords to numeric values (y positions) for plotting
keyword_mapping = {keyword: i for i, keyword in enumerate(keywords)}

# Whole-word patterns: a plain substring test would count 'ap' inside "happy"
# or 'like' inside "likely" as keyword hits.
keyword_patterns = {keyword: re.compile(r"\b" + re.escape(keyword) + r"\b") for keyword in keywords}

# All corpora share one set of axes
fig = go.Figure()

# Loop through each corpus
for corpus_index, (file_key, results) in enumerate(all_results.items()):
    # Lowercase each sentence once instead of once per keyword
    sentences = [sentence.lower() for sentence in results['sentence_tokens']]
    for keyword in keywords:
        pattern = keyword_patterns[keyword]
        # 1-based numbers of the sentences that contain the keyword
        sentence_nums = [i + 1 for i, sentence in enumerate(sentences) if pattern.search(sentence)]
        y_values = np.full(len(sentence_nums), keyword_mapping[keyword], dtype=float)

        # One trace per (corpus, keyword). Traces of the same keyword share a
        # legend group and only the first corpus contributes the legend entry,
        # so the legend lists each keyword once and toggles it in all corpora.
        # The hover text names the corpus because the points overlap on one plot.
        fig.add_trace(go.Scatter(x=sentence_nums, y=y_values, mode='markers', name=keyword,
                                 legendgroup=keyword, showlegend=(corpus_index == 0),
                                 text=['{} - Sentence: {}'.format(file_key, num) for num in sentence_nums],
                                 marker=dict(size=8, opacity=0.6)))


# Update layout with an appropriate height
fig.update_layout(title='Keyword Occurrence Across Sentences in Multiple Corpora',
                  xaxis_title='Sentence Number',
                  yaxis=dict(tickmode='array', tickvals=list(keyword_mapping.values()), ticktext=list(keyword_mapping.keys())),
                  legend_title='Keywords',
                  height=1200)  # Set a larger height depending on the number of keywords

# Save the figure to an HTML file for interactive viewing
fig.write_html('my_interactive_plots.html')

# Show the figure
fig.show()


### 3.04 Side to side

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

import re  # local import: used for the whole-word keyword matching below

# Assuming all_results is defined and populated by the earlier cells
num_corpora = len(all_results)  # Count of corpora to be displayed

# One column per corpus, titled with the corpus key so the panels can be told
# apart. shared_yaxes keeps the keyword rows aligned across the columns.
fig = make_subplots(rows=1, cols=num_corpora, shared_yaxes=True,
                    subplot_titles=[str(file_key) for file_key in all_results])

# Keywords to track, sorted to keep the row order consistent across the plot.
# set() removes the words the raw list repeats ('know', 'talk', 'discussion',
# 'relationship') so every keyword gets one consecutive y position.
keywords = sorted(set(['students', "science", 'engagement', 'ability', 'community', 'talk',
            'culture', 'ownership', 'regular', 'successful', 'participation', 'expectations', 
            'ap', 'assessment', 'cultural', 'phenomena', 'important', 'know', 'standards', 'relationship',
            "engaging", 'literacy', 'relationship', 'kids', 'connect', 'student', 'classroom', 
            'teacher', 'teaching', 'school', 'class', 'curriculum', 'learn', 'approach', 'talking', 
            'discussion', 'love', 'proud', 'like', 'difficult', 'actually', 'know', 'questions', 
            'think', 'want', 'kind', 'time', 'grade', 'thinking', 'different', 'talk', 'conversation', 
            'discussion', 'hard', 'saying']), reverse=True)

# Mapping keywords to numeric values (y positions) for plotting
keyword_mapping = {keyword: i for i, keyword in enumerate(keywords)}

# Whole-word patterns: a plain substring test would count 'ap' inside "happy"
# or 'like' inside "likely" as keyword hits.
keyword_patterns = {keyword: re.compile(r"\b" + re.escape(keyword) + r"\b") for keyword in keywords}

# Loop through each corpus; col_index is the 1-based subplot column it is drawn in
for col_index, (file_key, results) in enumerate(all_results.items(), start=1):
    # Lowercase each sentence once instead of once per keyword
    sentences = [sentence.lower() for sentence in results['sentence_tokens']]
    for keyword in keywords:
        pattern = keyword_patterns[keyword]
        # 1-based numbers of the sentences that contain the keyword
        sentence_nums = [i + 1 for i, sentence in enumerate(sentences) if pattern.search(sentence)]
        y_values = np.full(len(sentence_nums), keyword_mapping[keyword], dtype=float)

        # Traces of the same keyword share a legend group and only the first
        # column contributes the legend entry, so each keyword is listed once
        # and toggles in every subplot together.
        fig.add_trace(go.Scatter(x=sentence_nums, y=y_values, mode='markers', name=keyword,
                                 legendgroup=keyword, showlegend=(col_index == 1),
                                 text=['Sentence: {}'.format(num) for num in sentence_nums],
                                 marker=dict(size=10, opacity=0.5)),  # Adjusted marker size for visibility
                      row=1, col=col_index)

# Figure-level settings
fig.update_layout(
    title='Keyword Occurrence Across Sentences in Multiple Corpora',
    legend_title='Keywords',
    height=1200,  # Increased height to accommodate all keywords
    showlegend=True
)

# Axis settings must go through update_xaxes/update_yaxes: the layout keys
# `xaxis_title` and `yaxis` only configure the first subplot.
fig.update_xaxes(title_text='Sentence Number')
fig.update_yaxes(
    tickmode='array',
    tickvals=list(keyword_mapping.values()),
    ticktext=list(keyword_mapping.keys())
)

# Save the figure to an HTML file for interactive viewing. The name is specific
# to this view so it does not overwrite the combined plot's
# 'my_interactive_plots.html'.
fig.write_html('my_interactive_plots_side_by_side.html')

# Show the figure
fig.show()



### 3.05 Needs fixing

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

import re  # local import: used for the whole-word keyword matching below

# Assuming all_results is defined and populated by the earlier cells
num_corpora = len(all_results)  # Count of corpora to be displayed

# One column per corpus. shared_yaxes keeps the keyword rows aligned across columns.
fig = make_subplots(rows=1, cols=num_corpora, shared_yaxes=True,
                    subplot_titles=[f"Corpus {i+1}" for i in range(num_corpora)])

# Keywords to track, sorted to keep the row order consistent across the plot.
# set() removes the words the raw list repeats ('know', 'talk', 'discussion',
# 'relationship') so every keyword gets one consecutive y position.
keywords = sorted(set(['students', "science", 'engagement', 'ability', 'community', 'talk',
            'culture', 'ownership', 'regular', 'successful', 'participation', 'expectations', 
            'ap', 'assessment', 'cultural', 'phenomena', 'important', 'know', 'standards', 'relationship',
            "engaging", 'literacy', 'relationship', 'kids', 'connect', 'student', 'classroom', 
            'teacher', 'teaching', 'school', 'class', 'curriculum', 'learn', 'approach', 'talking', 
            'discussion', 'love', 'proud', 'like', 'difficult', 'actually', 'know', 'questions', 
            'think', 'want', 'kind', 'time', 'grade', 'thinking', 'different', 'talk', 'conversation', 
            'discussion', 'hard', 'saying']), reverse=True)

# Mapping keywords to numeric values (y positions) for plotting
keyword_mapping = {keyword: i for i, keyword in enumerate(keywords)}

# Whole-word patterns: a plain substring test would count 'ap' inside "happy"
# or 'like' inside "likely" as keyword hits.
keyword_patterns = {keyword: re.compile(r"\b" + re.escape(keyword) + r"\b") for keyword in keywords}

# Loop through each corpus; col_index is the 1-based subplot column it is drawn in.
# (Previously every trace was sent to column 1, leaving the other subplots empty.)
for col_index, (file_key, results) in enumerate(all_results.items(), start=1):
    # Lowercase each sentence once instead of once per keyword
    sentences = [sentence.lower() for sentence in results['sentence_tokens']]
    for keyword in keywords:
        pattern = keyword_patterns[keyword]
        # 1-based numbers of the sentences that contain the keyword
        sentence_nums = [i + 1 for i, sentence in enumerate(sentences) if pattern.search(sentence)]
        y_values = np.full(len(sentence_nums), keyword_mapping[keyword], dtype=float)

        # The keyword is carried in the hover text instead of the legend
        fig.add_trace(go.Scatter(x=sentence_nums, y=y_values, mode='markers',
                                 text=[f'{keyword} (Sentence: {num})' for num in sentence_nums],
                                 marker=dict(size=10, opacity=0.5),
                                 showlegend=False),  # This prevents adding to the legend
                      row=1, col=col_index)

# Figure-level settings
fig.update_layout(
    title='Keyword Occurrence Across Sentences in Multiple Corpora',
    height=1200,  # Increased height to accommodate all keywords
)

# Axis settings must go through update_xaxes/update_yaxes: the layout keys
# `xaxis_title` and `yaxis` only configure the first subplot.
fig.update_xaxes(title_text='Sentence Number')
fig.update_yaxes(
    tickmode='array',
    tickvals=list(keyword_mapping.values()),
    ticktext=list(keyword_mapping.keys())
)

# Save the figure to an HTML file for interactive viewing. The name is specific
# to this view so it does not overwrite the combined plot's
# 'my_interactive_plots.html'.
fig.write_html('my_interactive_plots_no_legend.html')

# Show the figure
fig.show()


## 4.00 Additional Resources

### 4.01 Importing data from csv file in Google Docs

In [None]:
import nltk
import requests
import pandas as pd
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Downloads a shared Google Drive spreadsheet, joins its 'transcript' column
# into one string, and saves that string as a plain-text file.

# This is the full shared Drive link, the file ID starts at "1i" and ends at "8S"
# https://docs.google.com/spreadsheets/d/1iJ4SG-QXfY4zw5K9B7Ununv3rb3iBj8S/edit?usp=drive_link&ouid=106477043869312333876&rtpof=true&sd=true
# https://drive.google.com/file/d/1hLRRRvawjxrdI141_bT5QXELb0jk9Jhg/view?usp=sharing

# the file ID from the shareable link is pasted below.
file_id = "1hLRRRvawjxrdI141_bT5QXELb0jk9Jhg"

# construct the download URL, you would not need to change anything here.
download_url = f"https://docs.google.com/uc?export=download&id={file_id}"

# send a GET request to the download URL. The timeout keeps the cell from
# hanging forever, and raise_for_status() stops here on an HTTP error instead
# of saving an error page under an .xlsx name.
response = requests.get(download_url, timeout=60)
response.raise_for_status()

# Path of the downloaded Excel file. It is relative to the working directory
# (which is /content on Colab) and the same variable is used for writing and
# for reading, so the two can no longer point at different locations.
# The file name can be changed to suit your own data.
excel_file_path = 'uncertaintyText.xlsx'
with open(excel_file_path, "wb") as f:
    f.write(response.content)

# Specify the column name you want to pull the data corpus from
column_name = 'transcript'

# Read the Excel file and extract the specified column
data = pd.read_excel(excel_file_path, engine='openpyxl')
text_column = data[column_name]

# Convert each item in the column to a string and join them into one corpus.
# Empty cells are dropped first; otherwise str() would insert the literal
# word "nan" into the text.
raw_uncertaintyText = ' '.join(map(str, text_column.dropna()))

# Save the string to a text file next to the downloaded spreadsheet.
# The explicit encoding avoids platform-dependent defaults on non-ASCII text.
text_file_path = 'raw_uncertaintyText.txt'
with open(text_file_path, 'w', encoding='utf-8') as file:
    file.write(raw_uncertaintyText)

print("Text saved to raw_uncertaintyText.txt")
print("Raw text file is a: ",type(raw_uncertaintyText), "It contains: ",len(raw_uncertaintyText), "characters")
# The slice [0:250] yields 250 characters, so the message says 250
print("Here are the first 250 characters in the raw text file: ", raw_uncertaintyText[0:250])

## Step 3: Topic Modeling and Semantic Similarities Across Transcriptions
Generating Sentence Embeddings for Each Transcription
We'll generate embeddings for each transcription and store them.

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to turn each transcript into one vector
model = SentenceTransformer('all-MiniLM-L6-v2')

# Parallel lists: embeddings_list[i] is the embedding of the transcript
# labelled file_names[i]
embeddings_list = []
file_names = []

for df in transcript_dfs:
    # A transcript without rows has neither text nor a label
    if df.empty:
        continue

    # Join all utterances into one string; missing values are dropped and the
    # rest coerced to str because str.join() raises on non-string items.
    # NOTE(review): the model may truncate inputs longer than its maximum
    # sequence length, so a long transcript may be represented by its beginning
    # only - confirm, and consider averaging per-utterance embeddings instead.
    text = ' '.join(df['cleaned_transcript'].dropna().astype(str))
    embedding = model.encode(text)
    embeddings_list.append(embedding)

    # Store one label per transcript. Appending the whole 'source_file' column
    # (a Series) would make file_names unusable as index/column labels later.
    # Assumes every row of a transcript carries the same source_file - TODO confirm.
    file_names.append(df['source_file'].iloc[0])


## Comparing Transcriptions Using Semantic Similarity
We'll compute the cosine similarity between each pair of transcriptions.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Stack the per-transcript embeddings into one array, one row per transcript
embeddings_array = np.vstack(embeddings_list)

# Pairwise cosine similarity between all rows
similarity_matrix = cosine_similarity(embeddings_array)

# Label both axes with the transcript names so the table is readable
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    columns=file_names,
    index=file_names,
)

print(similarity_df)


## Visualizing the Similarity Matrix

# Topic Modeling on Individual Transcriptions
Using Latent Dirichlet Allocation (LDA)
First, you need to prepare the data for LDA:

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def perform_lda(tokens_list, num_topics=5):
    """Fit an LDA topic model on tokenized documents.

    Args:
        tokens_list: iterable of documents, each given as a list of tokens.
        num_topics: number of topics to fit.

    Returns:
        Tuple (lda, corpus, dictionary): the trained LdaModel, the
        bag-of-words corpus it was trained on, and the gensim Dictionary
        that maps token ids to tokens.
    """
    # Vocabulary over all documents
    vocabulary = Dictionary(tokens_list)

    # Drop tokens that occur in more than half of the documents; no_below=1
    # keeps every token that occurs in at least one document
    vocabulary.filter_extremes(no_below=1, no_above=0.5)

    # Each document becomes a list of (token_id, count) pairs
    bow_corpus = [vocabulary.doc2bow(document) for document in tokens_list]

    # Fixed random_state keeps the fitted topics reproducible between runs
    lda_model = LdaModel(bow_corpus, num_topics=num_topics, id2word=vocabulary, random_state=42)
    return lda_model, bow_corpus, vocabulary