In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import unicodedata
from bertopic import BERTopic
import spacy
import en_core_web_md
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
from sklearn.decomposition import PCA

2023-04-20 17:46:05.263751: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-20 17:46:05.368995: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-20 17:46:05.369010: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-20 17:46:05.395408: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-20 17:46:05.969784: W tensorflow/stream_executor/platform/de

In [2]:
amrhet_path = ['./speeches/']
all_paths = ['./Data/', './NYT/', './WSJ/', './GWB/', './speeches/', './Top10/']

def load_docs(paths):
    if not type(paths) == list:
        paths = list(paths)
    speeches = []
    for path in paths:
        list_of_files = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith('.txt'):
                    list_of_files.append(os.path.join(root,file))

        for file in list_of_files:
            with open(file, encoding='utf-8') as f:
                text = f.read()
            f.close()
            speeches.append(text)

    #clean out goofy unicode  space characters 
    speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches if len(speech)>0 ]
    #clean out xa0 space characters
    [speech.replace(u'\xa0', '') for speech in speeches]; # ; supresses output
    # remove [stuff] in between square brackets
    def remove_bracket(text):
        return re.sub(r'(\[[^w]*\]\s)', '',text)
    speeches = [remove_bracket(speech) for speech in speeches]
    # Clean up whitespace
    speeches = [re.sub(r'[\s+]', ' ', speech) for speech in speeches]
    return(speeches, list_of_files)

all_speeches, all_files = load_docs(all_paths)
amrhet, amrhet_files = load_docs(amrhet_path)

In [3]:
nlp = spacy.load('en_core_web_md')

<A HREF="https://maartengr.github.io/BERTopic/index.html#quick-start">Bertopic quick start</A>

In [4]:
kaggle_file = open("./word_lists/kaggle_stopwords.txt", "r")
kaggle_data = kaggle_file.read()
kaggle_set = set([word for word in kaggle_data.split('\n')])
kaggle_file.close()
stop_words = list(text.ENGLISH_STOP_WORDS.union(kaggle_set))

<A HREF="https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html">BERTopic parameter tuning</A>

In [11]:
# Use CountVectorizer as a parameter for BERTopic to remove stop words
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
# nr_topics = "auto" combines two topics if similarity is 90%
topic_model = BERTopic(vectorizer_model=vectorizer_model,nr_topics=9,top_n_words=10)
topic_model.fit(amrhet) # learn model on one set of docs
topics, probs = topic_model.transform(amrhet) # predict topics for only subset

In [12]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,42,-1_people_going_know_want
1,0,16,0_jobs_americans_new_people
2,1,17,1_people_iraq_security_united
3,2,26,2_people_america_new_world


In [13]:
topic_model.visualize_barchart(n_words=10, height=300)

<A HREF="https://maartengr.github.io/BERTopic/api/bertopic.html">BERTopic API</A>

<A HREF="https://towardsdatascience.com/interactive-topic-modeling-with-bertopic-1ea55e7d73d8">Add tf-idf to BERTopics</A>

## I'm starting to like this one, with TF-IDF!!!

In [10]:
# Create TF-IDF sparse matrix
# tf-idf embeddings and documents must be same length, thus have to use same set of docs for both
vectorizer_tfidf = TfidfVectorizer(min_df=2, max_df=0.7, stop_words=stop_words)
embeddings = vectorizer_tfidf.fit_transform(amrhet)

# Model
model_tfidf = BERTopic(vectorizer_model=vectorizer_tfidf,nr_topics=9)
topics_tfidf, probs_tfidf = model_tfidf.fit_transform(amrhet, embeddings) 
topic_df = model_tfidf.get_topic_info()
topic_df

ValueError: max_df corresponds to < documents than min_df

<A HREF="https://maartengr.github.io/BERTopic/api/plotting/barchart.html">BERTopic plotting</A>

In [None]:
model_tfidf.visualize_barchart(n_words=10, height=300)

In [None]:
all_topics = model_tfidf.get_topics()
all_topics

In [None]:
topic_df['topic_name'] = ['other', 'intl relations', 'terrorism', 'tragedy', 'health care',
                          'civil rights', 'economy']

In [None]:
df = pd.DataFrame({'text': amrhet, 'file': amrhet_files, 'Topic': topics_tfidf})
df = pd.merge(df, topic_df[['Topic', 'topic_name']], how='left', on='Topic')

meta_info = pd.read_csv('date_file_text_speeches_oba.csv')
meta_info = meta_info.drop('text', axis=1)
meta_info.date = pd.to_datetime(meta_info.date, format='%Y-%m-%d')
df = pd.merge(df, meta_info, how='inner', on='file')

sp_enc = pd.read_csv('speeches_spacy_encodings.csv')
sp_enc.date = pd.to_datetime(sp_enc.date, format='%Y-%m-%d')
label_data = pd.merge(sp_enc, df, how='left', on='date')
pca_data = sp_enc.drop('date', axis=1)

In [None]:
pca = PCA(n_components = 2)
pcafit = pca.fit_transform(pca_data)

In [None]:
pca.explained_variance_

In [None]:
fig = px.scatter(pcafit, 
                 x=0, 
                 y=1, 
                 color=df.topic_name,
                 hover_name=df.date,
                 labels = {'0':'PC1', '1':'PC2', 'color':'topic_name'},
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()

In [None]:
#df.to_csv('bertopic_topics_amrhet.csv', index=False)

In [None]:
topic_dict = dict(topic_df.topic_name[1:].reset_index().drop('index', axis=1).topic_name)
#model_tfidf.set_topic_labels(topic_dict)
model_tfidf.visualize_documents(amrhet, embeddings=embeddings, custom_labels=True)

In [None]:
reducer = umap.UMAP()
umap_df = reducer.fit_transform(pca_data)
umap_df = pd.DataFrame(umap_df)
umap_df.shape

In [None]:
fig = px.scatter(umap_df, 
                 x=0, 
                 y=1, 
                 color=df.topic_name,
                 hover_name=df.date,
                 labels = {'0':'PC1', '1':'PC2', 'color':'topic_name'},
                 color_discrete_sequence=px.colors.qualitative.Bold)
fig.show()