Load text from Excel via pandas into spaCy. Change the spaCy model to fit your corpus;
see https://spacy.io/models for the available models.

In [None]:
import pandas as pd
import spacy

# Load the Excel file into a DataFrame - change the path to your file
df = pd.read_excel('NA_remaining.xlsx')

# Extract the 'text' column as a list of documents. `docs` is reused by the
# BERTopic cells further down, so keep the name.
# NOTE(review): empty Excel cells presumably load as NaN rather than str -
# confirm the column has no blanks before passing it to spaCy.
docs = df['text'].tolist()

# Load the spaCy model - swap 'en_core_web_sm' for the model matching your corpus
nlp = spacy.load('en_core_web_sm')

# Process each document with spaCy; nlp.pipe takes the whole list and yields
# one processed document per iteration.
for doc in nlp.pipe(docs):
    # Placeholder: replace the print with your own per-document processing
    print(doc)



Load the documents into BERTopic with a ZeroShotClassification representation model

In [None]:
from bertopic.representation import ZeroShotClassification
from bertopic import BERTopic

# Candidate labels handed to ZeroShotClassification. Edit this list to match
# the themes you expect in your corpus; order and spelling are passed through
# to the model exactly as written.
candidate_topics = [
    "Movies", "USA", "Music", "Finances", "Politics", "Education", "Sports",
    "Global Warming", "Life Expectancy", "Military", "Economy",
    "Student Loans", "Climate Change", "Employment", "Stockmarket",
    "Accident", "Breastfeeding", "Population Age", "Social Media",
    "Inflation", "Medicine", "Video Games", "Animals", "Election",
    "Alcohol Consumption", "Cancer", "Health", "Pandemic", "Covid-19",
    "xPost", "Youtube", "Reddit", "Facebook", "Apple",
    "Public Transportation", "Housing", "Airlines", "Trading",
    "Electric Vehicles", "Religion", "News", "Crypto", "Lifestyle",
    "Population", "Food", "Refugees", "Twitter", "Natural Catastrophy",
    "Fertility Rate", "Awards", "Police",
]

# Build the zero-shot representation model on top of facebook/bart-large-mnli
representation_model = ZeroShotClassification(
    candidate_topics,
    model="facebook/bart-large-mnli",
)

# Plug the representation model into an otherwise default BERTopic pipeline
topic_model = BERTopic(representation_model=representation_model)

In [None]:
# Fit the BERTopic model on your documents; `topics` holds one topic id per
# document, in the same order as `docs`.
topics, _ = topic_model.fit_transform(docs)

# get_topic_freq() returns a single DataFrame with "Topic" and "Count"
# columns. Keep it in its own variable: tuple-unpacking a DataFrame yields
# its column names, not its data, and would also overwrite `topics`.
topic_freq = topic_model.get_topic_freq()

# Print each topic id with the number of documents assigned to it
for t, f in zip(topic_freq["Topic"], topic_freq["Count"]):
    print(f"Topic {t}: {f} documents")


Visualize the topics

In [None]:
# Bar chart visualization of the topics - set top_n_topics to the number of
# topics you want to visualize (here 100).
topic_model.visualize_barchart(top_n_topics=100)

Fit the BERTopic model on the Reddit posts and export the topic names into a new Excel file

In [None]:
# Fit the BERTopic model on the Reddit posts; one topic id comes back per row
topics, _ = topic_model.fit_transform(df["text"])

# Look up the representation of every row's topic
topic_names = list(map(topic_model.get_topic, topics))

# Attach the topic id and its representation to the original posts
df["topic"] = topics
df["topic_name"] = topic_names

# Write the annotated posts to a new Excel file (without the index column)
df.to_excel("remain_reddit_posts_with_topics.xlsx", index=False)

Visualize the topics and their distribution as a bar chart - change top_n_topics to the number of topics you want to visualize.
To check how many topics are available, run topic_model.get_topic_info() and count the rows (.head(n) shows only the first n rows).

In [None]:
# Bar chart of the topics, limited to 50 via top_n_topics
topic_model.visualize_barchart(top_n_topics=50)

In [None]:
# Show the topic overview table (at most the first 220 rows) to see how many topics exist
topic_model.get_topic_info().head(220)

In [None]:
# Plot the fitted topics with BERTopic's visualize_topics()
topic_model.visualize_topics()

Export your visualization to an HTML file - this can be applied to all visualizations by adjusting the plot variable.

In [None]:
# Build the topic visualization once and keep the figure in `plot`.
# A bare visualize_topics() call ahead of this assignment would only repeat
# the work: a notebook cell renders just its last expression, so that figure
# would be computed and thrown away.
plot = topic_model.visualize_topics()

# Save the figure as an HTML file
plot.write_html("bertopic_intertopic_map_tensorflow.html")

In [None]:
# Build the bar chart for up to 80 topics and save it as an HTML file
plot = topic_model.visualize_barchart(top_n_topics=80)
plot.write_html("bertopic_visualization_Tensorflow.html")


TensorFlow - NLP/NLI and export the topics to a new Excel file

In [None]:
from bertopic import BERTopic
import tensorflow_hub

# Load the Universal Sentence Encoder (large, v5) from TensorFlow Hub; it is
# used below as BERTopic's embedding model.
embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")


# Combine the USE embeddings with the zero-shot representation model created
# in the ZeroShotClassification cell. The embedding model has to be passed
# explicitly, otherwise BERTopic falls back to its default embeddings.
topic_model = BERTopic(embedding_model=embedding_model, representation_model=representation_model)

# Fit the BERTopic model on your documents
topics, _ = topic_model.fit_transform(docs)

# get_topic_freq() returns one DataFrame with "Topic" and "Count" columns;
# unpacking it into two names would yield the column labels, not the data.
topic_freq = topic_model.get_topic_freq()

# Print the topics and their frequency
for t, f in zip(topic_freq["Topic"], topic_freq["Count"]):
    print(f"Topic {t}: {f} documents")

# Initialize a second BERTopic model for the export: same USE embeddings,
# default topic representation.
topic_model = BERTopic(embedding_model=embedding_model)

# Fit the BERTopic model on the Reddit posts
topics, _ = topic_model.fit_transform(df["text"])

# Map the topics back to the original posts
df["topic"] = topics

# Export the topic assignments and names to a new Excel file
topic_names = [topic_model.get_topic(topic) for topic in topics]
df["topic_name"] = topic_names
df.to_excel("reddit_post_NA_Tensorflow2.xlsx", index=False)


Use the Universal Sentence Encoder (USE) v4 as the embedding model

In [None]:
import pandas as pd
import tensorflow_hub as hub
from bertopic import BERTopic

# Fetch the Universal Sentence Encoder v4 from TensorFlow Hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

# Load the Reddit posts from the Excel file
df = pd.read_excel('NA_remaining.xlsx')

# Build BERTopic around the USE embeddings and fit it on the posts;
# one topic id comes back per row.
topic_model = BERTopic(language="english", embedding_model=model)
topics, _ = topic_model.fit_transform(df["text"])

# Look up the representation of every row's topic
topic_names = list(map(topic_model.get_topic, topics))

# Attach topic id and representation to the posts, then write the result
# to a new Excel file (without the index column)
df["topic"] = topics
df["topic_name"] = topic_names
df.to_excel("NAreddit_post_topics.xlsx", index=False)


Use Gensim to train an LDA model on the Reddit posts and visualize the topics - change the path in pd.read_excel to the Excel file you want to use

In [None]:
import pandas as pd
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.utils import simple_preprocess

# Load the Excel file into a pandas dataframe
df = pd.read_excel('NA_remaining.xlsx')

# Extract the posts from the "text" column. Blank cells are read as NaN and
# numeric cells as numbers, but simple_preprocess only accepts text, so drop
# the blanks and coerce everything else to str.
titles = df['text'].dropna().astype(str).tolist()

# Preprocess the text: one list of tokens per post
texts = [simple_preprocess(title) for title in titles]

# Create a dictionary (token <-> id mapping) from the preprocessed text
dictionary = Dictionary(texts)

# Create the bag-of-words corpus, one entry per post
corpus = [dictionary.doc2bow(text) for text in texts]

# Train an LDA model with 10 topics
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)

# Print five of the trained topics with their top words - raise num_topics
# to see more of them
for topic_id, topic in lda_model.print_topics(num_topics=5):
    print(f'Topic {topic_id}: {topic}')


In [None]:
import pyLDAvis.gensim_models
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()

# Create the pyLDAvis visualization from the trained LDA model, its corpus
# and its dictionary
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

# Display the visualization: the prepared object has to be the last
# expression of the cell for the notebook to render it
vis
