# Assignment 2 - BERTOPIC

Explain here the assignment goal

### Importing libraries 

In [2]:
import Assignment2.df_filter_nvida as df_filter_nvida

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer



from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, BaseRepresentation
from bertopic.vectorizers import ClassTfidfTransformer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

## Import data

In [4]:
news_articles_data = pd.read_csv("../data/us_equities_news_dataset.csv")
stock_data = pd.read_csv("../data/NVDA.csv")

## Filter the data for only NVIDIA and add the column that checks if the NVIDIA stock increased for the article's day

- The requirement of the analysis is that we only include articles that have the word \"NVIDIA\" or \"NVDA\" in the content of the article.

### Drop duplicates

we dropped a total of 16 rows that have identical content

In [5]:
nvida_news_articles_df = df_filter_nvida.filter_df_to_nvida(df=news_articles_data,
                                                            related_tickers="NVDA")
df_news = df_filter_nvida.add_nvidia_increase_decrease_bool_to_df(df_news=nvida_news_articles_df,
                                                                  df_stock_data=stock_data)
df_news = df_news.drop_duplicates(subset='content', keep='first').reset_index(drop=True)
df_news

nan
nan
nan
nan
nan
nan
nan
nan


Unnamed: 0,id,ticker,ticker_NVDA_improved,title,title_NVIDIA_topic,category,content,content_NVIDIA_topic,release_date,provider,url,article_id,NVIDIA_stock_increase
0,221539,NIO,True,A Central Bank War Just Started And Its Good F...,False,opinion,ECB Effects\nThe move in the euro was huge fa...,True,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,0
1,221547,NIO,True,6 Stocks To Watch Nivida Could Be Falling,False,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,True,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,0
2,221572,NIO,True,Stocks Dow Drops Nearly 400 Points as Apple ...,False,news,Investing com A rout in Apple and Facebook ...,True,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,0
3,221593,UBER,True,The Zacks Analyst Blog Highlights Advanced Mi...,True,opinion,For Immediate ReleaseChicago IL January 13 ...,True,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,0
4,221597,UBER,True,The Best Of CES 2020 Revised,False,opinion,With 4 500 companies bringing their innovation...,True,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3436,442657,AMD,True,Here s Why Nvidia NVDA Stock Is Gaining Today,True,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,True,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860,1
3437,442682,AMD,True,4 Stocks To Watch Today ATW CWEI MXL SLCA,False,opinion,It was a pretty good start to the week on Mond...,True,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262,1
3438,442705,AMD,True,Here s What The Buy Side Expects From AMD Thur...,False,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,True,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915,1
3439,442984,T,True,Zacks com Featured Highlights AT T Nu Skin E...,True,opinion,For Immediate Release\n\nChicago IL July 22...,True,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537,0


# BERTOPIC

In [18]:
model_settings = {
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine'),
        "clustering": HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    }
print(model_settings["reduce_dimensionality"])

UMAP(metric='cosine', min_dist=0.0, n_components=5)


In [32]:
def get_standard_bertopic(model_setting_index=0):
    """
    each of the steps in bertopic are assigned using the model settings, in the future this can be replaced with a grid search system to evaluate different model settings, but this is the most basic version with standard parameters. each of the model settings is in a list
    :param model_setting_index: defines which of the hyperparamter settings to pick
    :return: 
    """
    model_settings = [{
        "language": "english",
        "nr_topics": None,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    },
    {
        "language": "english",
        "nr_topics": 25,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    },
    {
        "language": "english",
        "nr_topics": 25,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(min_cluster_size=15),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    }
    ]
    # Step 1 - Extract embeddings
    embedding_model = model_settings[model_setting_index]["document_embedding"]
    
    # Step 2 - Reduce dimensionality
    umap_model = model_settings[model_setting_index]["reduce_dimensionality"]

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = model_settings[model_setting_index]["clustering"]

    # Step 4 - Tokenize topics
    vectorizer_model = model_settings[model_setting_index]["topic_tokenization"]

    # Step 5 - Create topic representation
    ctfidf_model = model_settings[model_setting_index]["topic_representation"]

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = model_settings[model_setting_index]["representation_model"]
    
    language = model_settings[model_setting_index]["language"]

    # All steps together
    topic_model = BERTopic(
        language=language,
        nr_topics=model_settings[model_setting_index]["nr_topics"],
        embedding_model=embedding_model,          # Step 1 - Extract embeddings
        umap_model=umap_model,                    # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
        representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )
    return topic_model


In [7]:
def train_bertopic_model(documents, model_setting_index=0):
    """
    trains the bertopic model from the documents that are given as a parameter
    :param documents: an array of documents to get topics made of
    :param model_setting_index: specifies which batch of hyperparameters to pick that define bertopic. Standard = 0
    :return: 
    """
    topic_model = get_standard_bertopic(model_setting_index=model_setting_index)
    topics, props = topic_model.fit_transform(documents)
    return topic_model, topics, props
    
topic_model_content, topics_numbers, probs_numbers = train_bertopic_model(df_news["content"], model_setting_index=0)


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [8]:
def get_topics_that_relate_to_nvidia(topic_model):
    """
    this function finds the topics that relate most to the input words given which is NVIDIA
    :param topic_model: 
    :return: 
    """
    similar_topics, similarities = topic_model.find_topics("NVIDIA", top_n=3)
    for topic_nr, similarity in zip(similar_topics, similarities):
        print(f"topic_nr: {topic_nr} | similarity to NVIDIA: {similarity}")
        print(f"words with similarities for topic:")
        print(topic_model.get_topic(topic_nr))
        
get_topics_that_relate_to_nvidia(topic_model=topic_model_content)

topic_nr: 85 | similarity to NVIDIA: 0.6380770206451416
words with similarities for topic:
[('nvidia', 0.60379636), ('considernvidia', 0.57020915), ('gpu', 0.56894743), ('gpus', 0.5512327), ('geforce', 0.45177752), ('1070', 0.44811034), ('rtx', 0.43754262), ('gtx', 0.43479204), ('2070', 0.39475513), ('nvda', 0.3770643)]
topic_nr: 94 | similarity to NVIDIA: 0.5997005701065063
words with similarities for topic:
[('nvidia', 0.60366654), ('gpu', 0.48001462), ('gpus', 0.4441163), ('nvda', 0.43977684), ('processors', 0.3520819), ('gaming', 0.33173496), ('displays', 0.26054502), ('graphics', 0.25659275), ('cloud', 0.25354078), ('cryptocurrency', 0.24369599)]
topic_nr: 52 | similarity to NVIDIA: 0.595443844795227
words with similarities for topic:
[('nvidia', 0.58245635), ('gpu', 0.47601318), ('gpus', 0.44771707), ('nvda', 0.43490544), ('nasdaq', 0.42085326), ('cryptocurrencies', 0.3868573), ('cryptocurrency', 0.3726718), ('amd', 0.36018553), ('gaming', 0.35129237), ('enix', 0.32141173)]


### Topic explanation
the topic that is equal -1, is not a topic. The outliers in bertopic, that bertopic couldn't cluster are represented here. Reference to this statement can be found on Github, where the Author of BERTopic Maarten Grootendorst mentions it (https://github.com/MaartenGr/BERTopic/issues/90#issuecomment-1056958453). 



In [9]:
topic_model_content.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1041,-1_nasdaq_stock_stocks_nyse,"[nasdaq, stock, stocks, nyse, dow, investors, ...",[For Immediate ReleaseChicago IL Dec 18 20...
1,0,104,0_nasdaq_stocks_stock_markets,"[nasdaq, stocks, stock, markets, nyse, market,...",[The technology sector has been the investors ...
2,1,85,1_stocks_stock_market_trend,"[stocks, stock, market, trend, nyse, nasdaq, f...",[Stocks had a strong day with the S P 500 risi...
3,2,73,2_nvda_nasdaq_nvidia_stock,"[nvda, nasdaq, nvidia, stock, shares, traded, ...",[Investing com NVIDIA NASDAQ NVDA fell by ...
4,3,59,3_nyse_nasdaq_dow_shares,"[nyse, nasdaq, dow, shares, index, corporation...",[Investing com U S stocks were higher after...
...,...,...,...,...,...
139,138,5,138_recession_inflation_dow_unemployment,"[recession, inflation, dow, unemployment, econ...",[On Mar 8 a report of Federal Reserve stated ...
140,139,5,139_sectors_industries_industry_semiconductors,"[sectors, industries, industry, semiconductors...",[The semiconductor industry is made up of 15 s...
141,140,5,140_marketplace_prospects_partnership_partners...,"[marketplace, prospects, partnership, partners...",[Agilent Technologies NYSE A is an origi...
142,141,5,141_tariffs_nasdaq_tariff_nyse,"[tariffs, nasdaq, tariff, nyse, semiconductor,...",[The latest buzz is that the Trump administrat...


In [16]:
topic_model_content.visualize_topics().show()
topic_model_content.visualize_barchart(top_n_topics=32).show()


In [19]:
df_news["topic"] = topics_numbers
df_news

Unnamed: 0,id,ticker,ticker_NVDA_improved,title,title_NVIDIA_topic,category,content,content_NVIDIA_topic,release_date,provider,url,article_id,NVIDIA_stock_increase,topic
0,221539,NIO,True,A Central Bank War Just Started And Its Good F...,False,opinion,ECB Effects\nThe move in the euro was huge fa...,True,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,0,-1
1,221547,NIO,True,6 Stocks To Watch Nivida Could Be Falling,False,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,True,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,0,93
2,221572,NIO,True,Stocks Dow Drops Nearly 400 Points as Apple ...,False,news,Investing com A rout in Apple and Facebook ...,True,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,0,-1
3,221593,UBER,True,The Zacks Analyst Blog Highlights Advanced Mi...,True,opinion,For Immediate ReleaseChicago IL January 13 ...,True,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,0,-1
4,221597,UBER,True,The Best Of CES 2020 Revised,False,opinion,With 4 500 companies bringing their innovation...,True,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3436,442657,AMD,True,Here s Why Nvidia NVDA Stock Is Gaining Today,True,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,True,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860,1,136
3437,442682,AMD,True,4 Stocks To Watch Today ATW CWEI MXL SLCA,False,opinion,It was a pretty good start to the week on Mond...,True,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262,1,21
3438,442705,AMD,True,Here s What The Buy Side Expects From AMD Thur...,False,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,True,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915,1,9
3439,442984,T,True,Zacks com Featured Highlights AT T Nu Skin E...,True,opinion,For Immediate Release\n\nChicago IL July 22...,True,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537,0,44


In [28]:
def get_coherence(df, topic_model):
    documents_per_topic = df.groupby(['topic'], as_index=False).agg({'content': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.content.values)
    
    # Extract vectorizer and analyzer from BERTopic
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    
    
    # Extract features for Topic Coherence evaluation
    #words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in topic_model.get_topic(topic)] 
                   for topic in range(len(set(df["topic"]))-1)]
    
    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words, 
                                     texts=tokens, 
                                     corpus=corpus,
                                     dictionary=dictionary, 
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

get_coherence(df_news, topic_model_content)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.020411626770612652

In [31]:
topic_model_content, topics_numbers, probs_numbers = train_bertopic_model(df_news["content"], model_setting_index=1)
topic_model_content.visualize_topics().show()
topic_model_content.visualize_barchart(top_n_topics=32).show()


In [33]:
topic_model_content, topics_numbers, probs_numbers = train_bertopic_model(df_news["content"], model_setting_index=2)
topic_model_content.visualize_topics().show()
topic_model_content.visualize_barchart(top_n_topics=32).show()


In [20]:
topics_numbers

[-1,
 93,
 -1,
 -1,
 -1,
 80,
 -1,
 120,
 120,
 5,
 1,
 44,
 -1,
 18,
 -1,
 27,
 69,
 18,
 -1,
 30,
 18,
 18,
 14,
 48,
 48,
 48,
 1,
 89,
 24,
 18,
 6,
 130,
 -1,
 48,
 87,
 18,
 -1,
 -1,
 29,
 -1,
 125,
 48,
 50,
 -1,
 50,
 48,
 14,
 24,
 -1,
 3,
 -1,
 -1,
 3,
 3,
 -1,
 3,
 3,
 6,
 9,
 36,
 36,
 -1,
 3,
 3,
 4,
 3,
 3,
 -1,
 21,
 74,
 6,
 124,
 19,
 -1,
 25,
 74,
 21,
 21,
 3,
 21,
 47,
 87,
 -1,
 140,
 -1,
 37,
 0,
 -1,
 56,
 56,
 61,
 119,
 80,
 8,
 21,
 0,
 99,
 -1,
 -1,
 3,
 21,
 93,
 21,
 31,
 71,
 -1,
 71,
 15,
 -1,
 -1,
 8,
 -1,
 116,
 13,
 60,
 31,
 15,
 -1,
 15,
 15,
 -1,
 15,
 72,
 15,
 -1,
 15,
 -1,
 15,
 15,
 19,
 4,
 15,
 87,
 71,
 -1,
 28,
 -1,
 -1,
 14,
 111,
 -1,
 -1,
 -1,
 29,
 21,
 68,
 93,
 1,
 83,
 39,
 31,
 87,
 -1,
 83,
 43,
 3,
 -1,
 41,
 41,
 83,
 91,
 21,
 106,
 85,
 75,
 75,
 75,
 75,
 -1,
 75,
 -1,
 75,
 75,
 75,
 16,
 -1,
 118,
 -1,
 74,
 6,
 -1,
 84,
 20,
 -1,
 -1,
 74,
 -1,
 35,
 -1,
 -1,
 -1,
 89,
 -1,
 -1,
 -1,
 53,
 25,
 -1,
 70,
 125,
 -1,
 10,
 -1,
