## Setup

In [None]:
!pip install BERTAgent

In [1]:
!pip install emoji

In [None]:
import spacy
import pandas as pd
import numpy as np
from bertagent import BERTAgent
import re
import html
import emoji
import json
from copy import deepcopy
from tqdm.notebook import tqdm

In [None]:
# PARAMETERS
MAX_SENT_LENGTH = 50  # characters or tokens

## Import Data

In [None]:
# Load from JSON file
with open("bertagent_output.json", "r", encoding="utf-8-sig") as f:
    json_data = json.load(f)

# Convert back to dictionary of DataFrames
bertagent_input = {
    topic: pd.DataFrame(records)
    for topic, records in json_data.items()
}

## Process Text

In [None]:
def clean_text(text):
    text = html.unescape(text) # Unescape HTML
    text = re.sub(r'http\S+|www\S+|https\S+', '', text) # Remove URLs
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) # Remove markdown links
    text = emoji.replace_emoji(text, replace='') # Remove emojis
    text = re.sub(r'<[^>]+>', '', text) # Remove HTML tags
    text = re.sub(r'\s+', ' ', text).strip() # Normalize whitespace
    text = re.sub(r'^>+', '', text, flags=re.MULTILINE)  # Remove leading ">" used in blockquotes
    text = re.sub(r'>+', '', text)  # Remove any remaining ">" characters

    return text

def truncate_sentences_0(sents, max_len=MAX_SENT_LENGTH):
    short_sents = []
    for sent in sents:
        if len(sent) > max_len:
            chunks = re.split(r'[;,:-]', sent)  # Split on soft punctuation
            chunks = [chunk.strip() for chunk in chunks if chunk.strip()]
            short_sents.extend(chunks)
        else:
            short_sents.append(sent)
    return short_sents

def truncate_sentences(sents, max_len=MAX_SENT_LENGTH):
    short_sents = []
    split_pattern = r'(?<=[.!?;,:-])\s+'  # split at punctuation followed by whitespace

    for sent in sents:
        if len(sent) > max_len:
            # Split on punctuation followed by whitespace
            chunks = re.split(split_pattern, sent)
            chunks = [chunk.strip() for chunk in chunks if chunk.strip()]

            # Further split chunks if still too long
            for chunk in chunks:
                if len(chunk) > max_len:
                    # fallback: split roughly by max_len
                    subchunks = [chunk[i:i+max_len].strip() for i in range(0, len(chunk), max_len)]
                    short_sents.extend(subchunks)
                else:
                    short_sents.append(chunk)
        else:
            short_sents.append(sent)
    return short_sents

In [None]:
# Import data
df = pd.read_csv('repr_docs.csv', sep=',')
dfs_by_topic = {topic: group for topic, group in df.groupby("leiden_topic")}
topics = list(dfs_by_topic.keys())

In [None]:
# Process text
nlp = spacy.load("en_core_web_sm")

sentence_dfs_by_topic = {}
for topic in topics:
    df = dfs_by_topic[topic].copy()
    raw_texts = df['text'].fillna("").tolist()
    cleaned_texts = [clean_text(text) for text in raw_texts]

    sents_list = []
    for doc in nlp.pipe(cleaned_texts, batch_size=64):
        sents = [sent.text.strip() for sent in doc.sents]
        sents = truncate_sentences(sents)
        sents_list.append(sents)

    df['sents'] = sents_list
    sentence_dfs_by_topic[topic] = df

In [None]:
# Save to JSON
json_ready = {
    topic: df.to_dict(orient='records')
    for topic, df in sentence_dfs_by_topic.items()
}

# Save to JSON file
with open("sentence_data_by_topic.json", "w", encoding="utf-8-sig") as f:
    json.dump(json_ready, f, ensure_ascii=False, indent=2)

## BERTagent

In [None]:
# Prepare DF's for analysis
bertagent_input = {}
for topic in topics:
    df = sentence_dfs_by_topic[topic]
    df_sentences = df.explode('sents').reset_index(drop=True)
    bertagent_input[topic] = df_sentences

In [None]:
# Set model names
model_names_dict = {}
for i in range(len(topics)):
  model_names_dict[i] = f"ba{i}"

In [None]:
# Run BERTAgent
tqdm.pandas()
for topic, model_name in model_names_dict.items():
    model = BERTAgent()
    model_id = model_name
    df = bertagent_input[topic]
    df[model_id] = df['sents'].progress_apply(model.predict)
    df['BATot'] = df[model_id].apply(model.tot)
    df['BAPos'] = df[model_id].apply(model.pos)
    df['BANeg'] = df[model_id].apply(model.neg)
    bertagent_input[topic] = df

  0%|          | 0/1102 [00:00<?, ?it/s]

  0%|          | 0/2244 [00:00<?, ?it/s]

  0%|          | 0/1590 [00:00<?, ?it/s]

  0%|          | 0/966 [00:00<?, ?it/s]

  0%|          | 0/447 [00:00<?, ?it/s]

  0%|          | 0/923 [00:00<?, ?it/s]

  0%|          | 0/348 [00:00<?, ?it/s]

  0%|          | 0/286 [00:00<?, ?it/s]

  0%|          | 0/315 [00:00<?, ?it/s]

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

  0%|          | 0/367 [00:00<?, ?it/s]

  0%|          | 0/161 [00:00<?, ?it/s]

In [None]:
for topic in bertagent_input:
  print(f"Topic {topic}")
  print(f"Min: {bertagent_input[topic]['BATot'].min()}")
  print(f"Max: {bertagent_input[topic]['BATot'].max()}")
  print(f"Mean: {np.mean(bertagent_input[topic]['BATot'])}")
  print(f"Median: {np.median(bertagent_input[topic]['BATot'])}\n")

Topic 0
Min: -0.06934662908315659
Max: 0.0899746734648943
Mean: 0.03757372313016832
Median: 0.03826851965433928

Topic 1
Min: -0.018092921003699303
Max: 0.11292121186852455
Mean: 0.038358919286045956
Median: 0.03873917737044394

Topic 2
Min: -0.06934650987386703
Max: 0.0971817597746849
Mean: 0.038204166303930254
Median: 0.037988311605783365

Topic 3
Min: -0.004414821043610573
Max: 0.10330456495285034
Mean: 0.03845092275400215
Median: 0.03845676822536108

Topic 4
Min: -0.004414821043610573
Max: 0.08035795949399471
Mean: 0.03768188848330612
Median: 0.03823113767895848

Topic 5
Min: -0.0053747628116980195
Max: 0.19595731794834137
Mean: 0.04003256027144584
Median: 0.040100179513559694

Topic 6
Min: -0.017852515292664368
Max: 0.0842223472893238
Mean: 0.03810696135498633
Median: 0.03845404778157586

Topic 7
Min: -0.009719941454629103
Max: 0.06711390018463134
Mean: 0.0384217725735807
Median: 0.03893806237345958

Topic 8
Min: -0.0006665989756584167
Max: 0.0764682576732917
Mean: 0.0390554480379

In [None]:
# Save to JSON
json_ready_results = {
    topic: df.to_dict(orient='records')
    for topic, df in bertagent_input.items()
}

# Save to JSON file
with open("bertagent_output.json", "w", encoding="utf-8-sig") as f:
    json.dump(json_ready_results, f, ensure_ascii=False, indent=2)

In [None]:
# Save median scores as topic scores
bertagent_scores = {}
for topic in bertagent_input:
  bertagent_scores[topic] = np.median(bertagent_input[topic]['BATot'])

In [None]:
# Save to JSON file
with open("bertagent_scores.json", "w", encoding="utf-8-sig") as f:
    json.dump(bertagent_scores, f, ensure_ascii=False, indent=2)