In [None]:
import pickle
from sentence_transformers import SentenceTransformer
import tqdm as notebook_tqdm
from sklearn.datasets import fetch_20newsgroups
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import sys
sys.path.insert(0, 'Path')
from phd.letter import Letter
import pandas as pd
import re
import openai
import matplotlib.pyplot as plt

In [None]:
# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

# Data Input (add recipients and time)

In [None]:
def load_data_from_pickle(file_path):
    """Unpickle and return the object stored in the file at ``file_path``.

    The file is opened in binary mode and closed again before returning.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def extract_paragraphs(letter_list):
    """Flatten letters into paragraph-level records.

    Each paragraph of a letter becomes one entry. A letter whose
    ``paragraphs`` attribute is empty or otherwise falsy contributes a single
    entry holding its full ``text`` instead.

    Returns a tuple of five parallel lists
    ``(paragraphs, ids, recipients, years, months)``; position ``i`` of every
    metadata list describes the letter that ``paragraphs[i]`` came from.
    """
    paragraphs, ids, recipients, years, months = [], [], [], [], []
    for letter in letter_list:
        # Fall back to the whole letter text when there is no paragraph split.
        segments = letter.paragraphs or [letter.text]
        for segment in segments:
            paragraphs.append(segment)
            ids.append(letter.id)
            recipients.append(letter.recipient)
            years.append(letter.year)
            months.append(letter.month)
    return paragraphs, ids, recipients, years, months

def extract_text(letter_list):
    """Collect letter-level records, one entry per letter.

    Returns a tuple of five parallel lists
    ``(texts, ids, recipients, years, months)`` in the order the letters are
    iterated.
    """
    texts, ids, recipients, years, months = [], [], [], [], []
    columns = (texts, ids, recipients, years, months)
    for letter in letter_list:
        record = (letter.text, letter.id, letter.recipient, letter.year, letter.month)
        for column, value in zip(columns, record):
            column.append(value)
    return texts, ids, recipients, years, months

In [None]:
# Location of the pickled letter collection (relative to the working directory).
pickle_file_path = "letters"

# Unpickle the collection; the extraction helpers below iterate over `data`
# and read text/id/recipient/year/month (and paragraphs) from each item.
data = load_data_from_pickle(pickle_file_path)

# Extract Paragraphs

In [None]:
# Paragraph-level corpus: five parallel lists, one entry per paragraph.
paragraphs, ids, recipients, years, months = extract_paragraphs(data)

In [None]:
# Number of paragraph-level documents
len(paragraphs)

# Extract Whole Texts

In [None]:
# Letter-level corpus (one entry per letter).
# The metadata is bound to `text_*` names on purpose: `ids`, `recipients`,
# `years` and `months` hold the paragraph-level metadata and are later passed
# together with `paragraphs` (document plot, topics over time, topics per
# recipient). Reusing those names here would replace them with shorter,
# letter-level lists and misalign them with `paragraphs`.
texts, text_ids, text_recipients, text_years, text_months = extract_text(data)

In [None]:
# Number of letter-level documents
len(texts)

# Define Topic Models, Load Embeddings

In [None]:
# Built-in English stop words
#english_stop_words = CountVectorizer(stop_words='english').get_stop_words()

# Custom stop words
#custom_stop_words = ['dickens', 'forster', 'coutts', 'couttss', 'mr', 'mrs', "dickenss", 'charles', 'kate', 'georgina', 'macready', 'arthur', 'smith', 'letter', 'write', 'forsters']

# Combine both lists
#all_stop_words = list(english_stop_words) + custom_stop_words

In [None]:
# Representation models used to describe/label the topics.
import os  # needed only here, to read the API key from the environment

# KeyBERT
keybert_model = KeyBERTInspired()

# GPT-3.5
# Take the key from the OPENAI_API_KEY environment variable so that no secret
# is ever saved inside the notebook; the placeholder is kept as a fallback so
# the cell still runs when the OpenAI representation is not used.
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "drop_key_here"))
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models (only KeyBERT is active by default)
representation_model = {
    "KeyBERT": keybert_model
    #,"OpenAI": openai_model,  # Uncomment if you will use OpenAI
}

In [None]:
# Sentence embeddings for every paragraph, plus the UMAP reducer handed to BERTopic.
# Disabled alternatives, kept for reference:
#ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
#vectorizer_model = CountVectorizer(stop_words=all_stop_words)
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(paragraphs, show_progress_bar=True)

# Fixed random_state so the 5-dimensional reduction is repeatable between runs.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

In [None]:
# Reduce the embeddings to 2D once, so the document plot can reuse them and
# we can iterate on the visualisation quickly later on.
# random_state is pinned (same seed as `umap_model`) so the 2D layout is
# reproducible across kernel restarts instead of changing on every run.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2,
                          min_dist=0.0, metric='cosine',
                          random_state=42).fit_transform(embeddings)

# Train Topic Models

In [None]:
# Train our topic model.
# `vectorizer_model` is only assigned in a commented-out line earlier in the
# notebook, so on a fresh kernel this cell failed with a NameError. It is built
# here instead. English stop words are removed from the topic representations;
# to also drop corpus-specific names, pass a combined stop-word list to
# `stop_words` (see the commented-out custom list earlier in the notebook).
vectorizer_model = CountVectorizer(stop_words="english")

topic_model = BERTopic(embedding_model=sentence_model, umap_model=umap_model, representation_model=representation_model,
                       vectorizer_model=vectorizer_model, calculate_probabilities=True)

# Pre-computed embeddings are passed in so the paragraphs are not encoded a second time.
topics, probs = topic_model.fit_transform(paragraphs, embeddings)

# Check Topics

In [None]:
# Overview table of the fitted topics, as returned by BERTopic
df = topic_model.get_topic_info()

In [None]:
# Display the topic overview (row limit is governed by pandas' display.max_rows)
df

# Excel export

In [None]:
# Write the topic overview to an Excel file in the working directory, without the index column.
# NOTE(review): the file name contains a space ("topics_with labels") — confirm this is intended.
df.to_excel('topics_with labels.xlsx', index=False)

# Visualize Topics

In [None]:
# Topic overview plot.
# NOTE(review): custom_labels=True is requested, but no cell that sets custom
# topic labels is visible in this notebook — confirm they are set elsewhere.
topic_model.visualize_topics(custom_labels=True)

In [None]:
# Hierarchy plot of the topics (same custom_labels caveat as for any plot
# requesting custom labels: they must have been set on the model beforehand).
topic_model.visualize_hierarchy(custom_labels=True)

In [None]:
# Heatmap plot of the topics
topic_model.visualize_heatmap()

# Visualize Documents

In [None]:
# Plot the paragraphs using the pre-computed 2D embeddings.
# NOTE(review): `ids` (letter ids) is passed in the documents position —
# presumably so each point is labelled with its letter id rather than the
# paragraph text. This only lines up if `ids` is still the paragraph-level
# list, i.e. one entry per row of `reduced_embeddings` — confirm.
topic_model.visualize_documents(ids, reduced_embeddings=reduced_embeddings, custom_labels=True)

# Topics per Category

In [None]:
# Topics per category derived from the handbook

# Hard-coded number of topics assigned to each handbook category
topic_counts = {
    'Dreams': 2,
    'Social Reform': 5,
    'Affect and Emotions': 13,
    'Animals and Nature': 4,
    'Religion and Christmas': 6,
    'Alcohol and Drinking': 3,
    'Theatre and Shakespeare': 14
}

# Sort the categories by topic count, largest first.
# The bar heights get their own name (`topic_totals`): storing them in `topics`
# would overwrite the per-paragraph topic assignments returned by
# `topic_model.fit_transform`.
sorted_categories = sorted(topic_counts, key=topic_counts.get, reverse=True)
topic_totals = [topic_counts[category] for category in sorted_categories]

# Plotting the number of topics with a refined color palette
plt.figure(figsize=(8, 6))
plt.bar(sorted_categories, topic_totals, color='#4a90e2', edgecolor='black')
plt.xlabel('Category derived from the Oxford Handbook', fontsize=12)
plt.ylabel('Number of assigned topics', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
# Letter paragraphs per category derived from the handbook

# Hard-coded number of letter paragraphs assigned to each handbook category
passage_counts = {
    'Dreams': 65,
    'Social Reform': 169,
    'Affect and Emotions': 500,
    'Animals and Nature': 162,
    'Religion and Christmas': 190,
    'Alcohol and Drinking': 122,
    'Theatre and Shakespeare': 563
}

# Categories ranked by paragraph count, largest first, with the matching bar heights
sorted_categories = [
    category
    for category, _ in sorted(passage_counts.items(), key=lambda item: item[1], reverse=True)
]
passages = list(map(passage_counts.__getitem__, sorted_categories))

# Bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(sorted_categories, passages, color='#e27d60', edgecolor='black')
ax.set_xlabel('Category derived from the Oxford Handbook', fontsize=12)
ax.set_ylabel('Number of assigned letter paragraphs', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
fig.tight_layout()
plt.show()


In [None]:
# Topics per category (other thematic trends)

import matplotlib.pyplot as plt
import numpy as np

# Hard-coded number of topics assigned to each additional thematic trend
topic_counts = {
    'Traveling and Places': 25,
    'Supernatural, Mysticism, and Ghosts': 7,
    'Clothing and Appearance': 3,
    'Portraits and Photographs': 1,
    'Health and Well-being': 6,
    'Copyright and Piracy': 3,
    'Poems, Poets, and Verses': 6
}

# Sort the categories by topic count, largest first.
# The bar heights get their own name (`topic_totals`): storing them in `topics`
# would overwrite the per-paragraph topic assignments returned by
# `topic_model.fit_transform`.
sorted_categories = sorted(topic_counts, key=topic_counts.get, reverse=True)
topic_totals = [topic_counts[category] for category in sorted_categories]

# Plotting the number of topics with a refined color palette
plt.figure(figsize=(8, 6))
plt.bar(sorted_categories, topic_totals, color='#4a90e2', edgecolor='black')
plt.xlabel('Other thematic trends', fontsize=12)
plt.ylabel('Number of assigned topics', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
# Letter paragraphs per category (other thematic trends)

# Hard-coded number of letter paragraphs assigned to each additional thematic trend
passage_counts = {
    'Traveling and Places': 1788,
    'Supernatural, Mysticism, and Ghosts': 122,
    'Clothing and Appearance': 163,
    'Portraits and Photographs': 266,
    'Health and Well-being': 291,
    'Copyright and Piracy': 107,
    'Poems, Poets, and Verses': 328,
}

# Categories ranked by paragraph count, largest first, with the matching bar heights
sorted_categories = [
    category
    for category, _ in sorted(passage_counts.items(), key=lambda item: item[1], reverse=True)
]
passages = list(map(passage_counts.__getitem__, sorted_categories))

# Bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(sorted_categories, passages, color='#e27d60', edgecolor='black')
ax.set_xlabel('Other thematic trends', fontsize=12)
ax.set_ylabel('Number of assigned letter paragraphs', fontsize=12)
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot
fig.tight_layout()
plt.show()


# Topics over Time

In [None]:
# Topic development over time: each paragraph is paired with its letter's year,
# and the timestamps are grouped into 50 bins.
# `years` has to be the paragraph-level list (one entry per element of `paragraphs`).
topics_over_time = topic_model.topics_over_time(paragraphs, years, nr_bins=50)

In [None]:
# Over-time plot restricted to six hand-picked topic ids (the same ids are
# stored as `poem_topics` further down, where they are called poem-related).
# NOTE(review): topic ids presumably belong to one specific model fit — re-check them after retraining.
time_poems = topic_model.visualize_topics_over_time(topics_over_time, topics=[270,264,29,207,24,69])

In [None]:
# Render the over-time figure for the selected topics
time_poems

In [None]:
# Over-time plot without a hand-picked topic selection.
# NOTE(review): len(topics_over_time) is the length of the over-time result,
# not the number of topics — presumably it just serves as an upper bound so
# that every topic is included; confirm against the BERTopic documentation.
time_all = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=len(topics_over_time))

In [None]:
# Render the over-time figure for all topics
time_all

# Topics per Recipient

In [None]:
# Pretty-print the distinct recipient values found in the corpus metadata
from pprint import pprint
pprint(set(recipients))

In [None]:
# Topic representations per recipient: the recipient of each paragraph is used as its class.
# `recipients` has to be the paragraph-level list (one entry per element of `paragraphs`).
topics_per_class = topic_model.topics_per_class(paragraphs, classes=recipients)

In [None]:
# Topic ids treated as poem-related in the cells that follow.
# NOTE(review): presumably tied to one specific model fit — re-check after retraining.
poem_topics = [270, 264, 29, 207, 24, 69]

In [None]:
# Rows of the per-recipient table that belong to the poem-related topics.
# `.copy()` makes this an independent frame: the next cell rewrites its
# 'Class' column, and doing that on a filtered slice of `topics_per_class`
# triggers pandas' SettingWithCopyWarning.
poem_df = topics_per_class[topics_per_class['Topic'].isin(poem_topics)].copy()

In [None]:
# Stacked bar chart: per poem-related topic, how the summed 'Frequency' is
# distributed over the recipients ('Class'), with rare recipients grouped as "Misc.".

# Aggregate the classes across all topics and sum the frequencies
class_frequencies = poem_df.groupby('Class')['Frequency'].sum().reset_index()

# Determine the classes with a frequency of more than 3
major_classes = class_frequencies[class_frequencies['Frequency'] > 3]['Class'].tolist()

# Replace all other classes with "Misc.".
# `assign` builds a new frame instead of writing a column into `poem_df`
# in place: `poem_df` was created by boolean filtering, and column assignment
# on such a slice raises SettingWithCopyWarning. `isin`/`where` is the
# vectorised equivalent of the per-row membership test.
poem_df = poem_df.assign(
    Class=poem_df['Class'].where(poem_df['Class'].isin(major_classes), 'Misc.')
)

# Aggregate the data by "Topic" and "Class" (rows: topics, columns: classes)
agg_df_with_misc = poem_df.groupby(['Topic', 'Class'])['Frequency'].sum().unstack(fill_value=0)

# Sort the classes by their total frequency
sorted_classes = agg_df_with_misc.sum().sort_values(ascending=False).index
agg_df_with_misc = agg_df_with_misc[sorted_classes]

# Add "Misc." as the last category if it's not already there
if 'Misc.' not in agg_df_with_misc.columns:
    agg_df_with_misc['Misc.'] = 0

# Create a stacked bar chart
ax = agg_df_with_misc.plot(kind='bar', stacked=True, figsize=(12, 8), colormap='tab20')

# Sort the legend by the total frequency of the classes
handles, labels = ax.get_legend_handles_labels()
total_freq = agg_df_with_misc.sum().sort_values(ascending=False)
print(total_freq)
sorted_handles_labels = sorted(zip(handles, labels), key=lambda x: total_freq[x[1]], reverse=True)
sorted_handles, sorted_labels = zip(*sorted_handles_labels)

# Update the legend (placed outside the axes, to the right)
ax.legend(sorted_handles, sorted_labels, title='Class', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.xlabel('Topic')
plt.ylabel('Frequency')
plt.title('Frequency of Classes by Topic')
plt.tight_layout()

plt.show()


# Find similar Topics per Term

In [None]:
# Look up the 5 topics that best match the search term "contract";
# the call returns the topic ids together with their similarity scores.
similar_topics, similarity = topic_model.find_topics("contract", top_n=5)
# Show the matching topic ids
similar_topics