# Topic Modeling with BERTopic

## Import Python packages

In [None]:
import os
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, OpenAI
import openai
import tiktoken

## Load the preprocessed Wikipedia protest dataset

In [None]:
# Read the preprocessed Wikipedia content from the local data folder and
# show the first rows as a quick sanity check.
csv_path = os.path.join('data', 'wikipedia_content_preprocessed.csv')
df = pd.read_csv(csv_path)
df.head()

## Zero-shot topic modeling

In [None]:
# Candidate topic labels handed to BERTopic as its zero-shot topic list.
key_topics = [
    "race",
    "gender",
    "nationality",
    "ethnicity",
    "language",
    "religion",
    "disability",
]

# Run settings; they are also embedded in the model file name below so that
# differently configured runs are stored under distinct paths.
zeroshot_min_similarity = 0.85
min_topic_size = 50
representation = 'KeyBERTInspired'  # KeyBERTInspired | gpt-3.5-turbo
embedding_model = "thenlper/gte-small"  # "thenlper/gte-small" | "all-MiniLM-L6-v2" (default)

# Location of the pickled model for this configuration.
model_path = os.path.join(
    'models',
    f'zeroshot_{representation}_minsize{min_topic_size}_minsimilarity{zeroshot_min_similarity}.pickle',
)

In [None]:
# Build the zero-shot BERTopic model using the representation backend named by
# `representation`, then fit it on the preprocessed documents.
#
# Both backends share the same BERTopic settings and differ only in the
# representation model, so the model is constructed once and always bound to
# `topic_model` -- the name that the fit call and all later cells use.
if representation == 'KeyBERTInspired':
    representation_model = KeyBERTInspired()
elif representation == 'gpt-3.5-turbo':
    # The API key is taken from the environment; a missing key raises KeyError.
    client = openai.OpenAI(api_key=os.environ['OPENAI_API_KEY'])
    tokenizer = tiktoken.encoding_for_model(representation)
    representation_model = OpenAI(
        client,
        model=representation,
        doc_length=100,
        delay_in_seconds=2,
        tokenizer=tokenizer,
        chat=True,
    )
else:
    # Fail early with a clear message instead of a NameError (or a silently
    # reused model from an earlier run) further down.
    raise ValueError(
        f"Unsupported representation {representation!r}; "
        "expected 'KeyBERTInspired' or 'gpt-3.5-turbo'."
    )

topic_model = BERTopic(
    embedding_model=embedding_model,
    min_topic_size=min_topic_size,
    zeroshot_topic_list=key_topics,
    zeroshot_min_similarity=zeroshot_min_similarity,
    representation_model=representation_model,
)

# `topics` holds the first value returned by fit_transform; the second
# returned value is not used in this notebook.
topics, _ = topic_model.fit_transform(df['content_preprocessed_short'])

In [None]:
# Persist the fitted model with pickle serialization at the configured path.
# The target folder is created first so that saving does not fail on a fresh
# checkout where it does not exist yet.
os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
topic_model.save(model_path, serialization="pickle")

In [None]:
# Reload the model stored at `model_path` and rebind `topic_model` to it
# (presumably so the fitting cell can be skipped in a later session).
# NOTE(review): the file is pickle-serialized, and unpickling can execute
# arbitrary code -- only load model files produced by this notebook.
topic_model = BERTopic.load(model_path)

In [None]:
# Collect the model's topic-level information (the result of
# BERTopic.get_topic_info) for inspection and export.
df_topic_info = topic_model.get_topic_info()


In [None]:
# Preview the first 10 rows of the topic information.
df_topic_info.head(10)

In [None]:
# Export the topic information to CSV without the index column. The file name
# carries the run settings (representation, minimum topic size, minimum
# zero-shot similarity).
# The output folder is created first because to_csv raises an OSError when the
# target directory does not exist.
os.makedirs('outputs', exist_ok=True)
topic_info_path = os.path.join(
    'outputs',
    f'topic_info_zeroshot_{representation}_minsize{min_topic_size}_minsimilarity{zeroshot_min_similarity}.csv',
)
df_topic_info.to_csv(topic_info_path, index=False)

In [None]:
# Collect document-level information (the result of
# BERTopic.get_document_info) for the 'content_preprocessed_short' column.
df_document_info = topic_model.get_document_info(df['content_preprocessed_short'])

In [None]:
# Preview the first 10 rows of the document-level information.
df_document_info.head(10)

In [None]:
# Export the document-level information to CSV without the index column. The
# file name carries the run settings (representation, minimum topic size,
# minimum zero-shot similarity).
# The output folder is created first because to_csv raises an OSError when the
# target directory does not exist.
os.makedirs('outputs', exist_ok=True)
document_info_path = os.path.join(
    'outputs',
    f'document_topic_info_zeroshot_{representation}_minsize{min_topic_size}_minsimilarity{zeroshot_min_similarity}.csv',
)
df_document_info.to_csv(document_info_path, index=False)