# Assignment 2 - BERTOPIC

Explain here the assignment goal

### Importing libraries 

In [3]:
import Assignment2.df_filter_nvida as df_filter_nvida

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer



from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, BaseRepresentation
from bertopic.vectorizers import ClassTfidfTransformer

## Import data

In [2]:
news_articles_data = pd.read_csv("data/us_equities_news_dataset.csv")
stock_data = pd.read_csv("data/NVDA.csv")

## Filter the data for only NVIDIA and add the column that checks if the NVIDIA stock increased for the article's day

- The requirement of the analysis is that we only include articles that have the word \"NVIDIA\" or \"NVDA\" in the content of the article.

In [3]:
nvida_news_articles_df = df_filter_nvida.filter_df_to_nvida(df=news_articles_data,
                                                            related_tickers="NVDA")
df_news = df_filter_nvida.add_nvidia_increase_decrease_bool_to_df(df_news=nvida_news_articles_df,
                                                                  df_stock_data=stock_data)
df_news

nan
nan
nan
nan
nan
nan
nan
nan


Unnamed: 0,id,ticker,ticker_NVDA_improved,title,title_NVIDIA_topic,category,content,content_NVIDIA_topic,release_date,provider,url,article_id,NVIDIA_stock_increase
24,221539,NIO,True,A Central Bank War Just Started And Its Good F...,False,opinion,ECB Effects\nThe move in the euro was huge fa...,True,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,0
32,221547,NIO,True,6 Stocks To Watch Nivida Could Be Falling,False,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,True,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,0
57,221572,NIO,True,Stocks Dow Drops Nearly 400 Points as Apple ...,False,news,Investing com A rout in Apple and Facebook ...,True,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,0
78,221593,UBER,True,The Zacks Analyst Blog Highlights Advanced Mi...,True,opinion,For Immediate ReleaseChicago IL January 13 ...,True,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,0
82,221597,UBER,True,The Best Of CES 2020 Revised,False,opinion,With 4 500 companies bringing their innovation...,True,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
221141,442657,AMD,True,Here s Why Nvidia NVDA Stock Is Gaining Today,True,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,True,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860,1
221166,442682,AMD,True,4 Stocks To Watch Today ATW CWEI MXL SLCA,False,opinion,It was a pretty good start to the week on Mond...,True,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262,1
221189,442705,AMD,True,Here s What The Buy Side Expects From AMD Thur...,False,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,True,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915,1
221468,442984,T,True,Zacks com Featured Highlights AT T Nu Skin E...,True,opinion,For Immediate Release\n\nChicago IL July 22...,True,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537,0


 # Latent Dirichlet Allocation (LDA)
 LDA allows to generate topics that are most commonly used to explain a matrix notations of document embeddings.
 
LDA works through three levels in the LDA representation. alpha and beta are sampled once for the whole corpus. theta is sampled once for each document. z_dn and w_dn are sampled for each word in each document. This allows documents to be associated with multiple topics.
 
LDA assumes that documents can be created by sampling
- from a distribution of topics over documents 
- distribution of words over topics
   

### Drop duplicates

we dropped a total of 16 rows that have identical content

In [4]:
df_news = df_news.drop_duplicates(subset='content', keep='first').reset_index(drop=True)
df_news

Unnamed: 0,id,ticker,ticker_NVDA_improved,title,title_NVIDIA_topic,category,content,content_NVIDIA_topic,release_date,provider,url,article_id,NVIDIA_stock_increase
0,221539,NIO,True,A Central Bank War Just Started And Its Good F...,False,opinion,ECB Effects\nThe move in the euro was huge fa...,True,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,0
1,221547,NIO,True,6 Stocks To Watch Nivida Could Be Falling,False,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,True,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,0
2,221572,NIO,True,Stocks Dow Drops Nearly 400 Points as Apple ...,False,news,Investing com A rout in Apple and Facebook ...,True,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,0
3,221593,UBER,True,The Zacks Analyst Blog Highlights Advanced Mi...,True,opinion,For Immediate ReleaseChicago IL January 13 ...,True,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,0
4,221597,UBER,True,The Best Of CES 2020 Revised,False,opinion,With 4 500 companies bringing their innovation...,True,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3436,442657,AMD,True,Here s Why Nvidia NVDA Stock Is Gaining Today,True,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,True,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860,1
3437,442682,AMD,True,4 Stocks To Watch Today ATW CWEI MXL SLCA,False,opinion,It was a pretty good start to the week on Mond...,True,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262,1
3438,442705,AMD,True,Here s What The Buy Side Expects From AMD Thur...,False,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,True,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915,1
3439,442984,T,True,Zacks com Featured Highlights AT T Nu Skin E...,True,opinion,For Immediate Release\n\nChicago IL July 22...,True,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537,0


# BERTOPIC

In [19]:
bert = BERTopic()
bert.get_params()

{'calculate_probabilities': False,
 'ctfidf_model': ClassTfidfTransformer(),
 'embedding_model': None,
 'hdbscan_model': HDBSCAN(min_cluster_size=10, prediction_data=True),
 'language': 'english',
 'low_memory': False,
 'min_topic_size': 10,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'representation_model': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(low_memory=False, metric='cosine', min_dist=0.0, n_components=5),
 'vectorizer_model': CountVectorizer(),
 'verbose': False,
 'zeroshot_min_similarity': 0.7,
 'zeroshot_topic_list': None}

In [None]:
ber

In [16]:
test = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')
repr(test)

"UMAP(metric='cosine', min_dist=0.0, n_components=5)"

In [11]:
SentenceTransformer("all-MiniLM-L6-v2")._get_name()
UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine').__class__.__name__
UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine').get_params()

{'a': None,
 'angular_rp_forest': False,
 'b': None,
 'dens_frac': 0.3,
 'dens_lambda': 2.0,
 'dens_var_shift': 0.1,
 'densmap': False,
 'disconnection_distance': None,
 'force_approximation_algorithm': False,
 'init': 'spectral',
 'learning_rate': 1.0,
 'local_connectivity': 1.0,
 'low_memory': True,
 'metric': 'cosine',
 'metric_kwds': None,
 'min_dist': 0.0,
 'n_components': 5,
 'n_epochs': None,
 'n_jobs': -1,
 'n_neighbors': 15,
 'negative_sample_rate': 5,
 'output_dens': False,
 'output_metric': 'euclidean',
 'output_metric_kwds': None,
 'precomputed_knn': (None, None, None),
 'random_state': None,
 'repulsion_strength': 1.0,
 'set_op_mix_ratio': 1.0,
 'spread': 1.0,
 'target_metric': 'categorical',
 'target_metric_kwds': None,
 'target_n_neighbors': -1,
 'target_weight': 0.5,
 'tqdm_kwds': None,
 'transform_mode': 'embedding',
 'transform_queue_size': 4.0,
 'transform_seed': 42,
 'unique': False,
 'verbose': False}

In [18]:


model_settings = {
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine'),
        "clustering": HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    }
print(model_settings["reduce_dimensionality"])

UMAP(metric='cosine', min_dist=0.0, n_components=5)


In [13]:
def get_standard_bertopic():
    model_settings = {
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine'),
        "clustering": HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    }
    # Step 1 - Extract embeddings
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    # Step 2 - Reduce dimensionality
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = HDBSCAN(max_cluster_size=25, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    # Step 4 - Tokenize topics
    vectorizer_model = CountVectorizer(stop_words="english")

    # Step 5 - Create topic representation
    ctfidf_model = ClassTfidfTransformer()

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = KeyBERTInspired()
    representation_model = BaseRepresentation()

    # All steps together
    topic_model = BERTopic(
      embedding_model=embedding_model,          # Step 1 - Extract embeddings
      umap_model=umap_model,                    # Step 2 - Reduce dimensionality
      hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
      vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
      ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
      representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )
    return topic_model


In [14]:
topic_model_content = get_standard_bertopic()


In [15]:
topics_numbers, probs_numbers = topic_model_content.fit_transform(df_news["content"])


In [16]:
topic_model_content.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1006,-1_stocks_stock_nasdaq_nyse,"[stocks, stock, nasdaq, nyse, investors, marke...",[For Immediate Release\n\n\tChicago IL Nove...
1,0,186,0_nvidia_earnings_nvda_gpu,"[nvidia, earnings, nvda, gpu, gpus, geforce, n...",[It has been about a month since the last earn...
2,1,170,1_amd_nasdaq_earnings_intel,"[amd, nasdaq, earnings, intel, stock, ryzen, r...",[Advanced Micro Devices Inc NASDAQ AMD is ...
3,2,144,2_earnings_stock_revenue_sales,"[earnings, stock, revenue, sales, revenues, na...",[Continuing its earnings streak Palo Alto Net...
4,3,141,3_nvda_nasdaq_nvidia_stocks,"[nvda, nasdaq, nvidia, stocks, stock, nyse, gp...",[For Immediate ReleaseChicago IL Jan 03 20...
5,4,135,4_stocks_markets_dow_stock,"[stocks, markets, dow, stock, futures, nyse, t...",[Equity markets remain choppy after earnings w...
6,5,115,5_automakers_automotive_nasdaq_intel,"[automakers, automotive, nasdaq, intel, vehicl...",[Intel Corporation s NASDAQ INTC recent de...
7,6,106,6_stocks_nasdaq_stock_tech,"[stocks, nasdaq, stock, tech, markets, investo...",[After a banner year Wall Street s tech big c...
8,7,84,7_stocks_stock_market_trend,"[stocks, stock, market, trend, nyse, nasdaq, t...",[11 Stock Market Predictions For The Week Of A...
9,8,79,8_nasdaq_apple_iphones_stocks,"[nasdaq, apple, iphones, stocks, stock, iphone...",[Apple NASDAQ AAPL is working to bring hig...


In [17]:
topic_model_content.get_topic(0)


[('nvidia', 0.5017726),
 ('earnings', 0.4277367),
 ('nvda', 0.40131322),
 ('gpu', 0.39101148),
 ('gpus', 0.37811893),
 ('geforce', 0.36121118),
 ('nasdaq', 0.35732678),
 ('revenues', 0.32696956),
 ('performance', 0.3114929),
 ('stock', 0.3068714)]

In [18]:
topic_model_content.visualize_topics()

In [19]:
topic_model_content.visualize_barchart(top_n_topics=32)

# BERTOPIC for titles

In [16]:
topic_model_title = get_standard_bertopic()

topics_title_numbers, probs_title_numbers = topic_model_title.fit_transform(df_news["title"])

topic_model_title.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,885,-1_italy_equities_markets_daily,"[italy, equities, markets, daily, stocks, mark...",[Daily Analysis Global Equities Rally on Hop...
1,0,216,0_earnings_revenues_revenue_estimates,"[earnings, revenues, revenue, estimates, stock...",[Cadence Design CDNS Q4 Earnings In Line Re...
2,1,211,1_intel_nvidia_microsoft_apple,"[intel, nvidia, microsoft, apple, ibm, faceboo...",[The Zacks Analyst Blog Highlights Apple QUA...
3,2,142,2_stocks_investors_invest_tech,"[stocks, investors, invest, tech, portfolio, m...",[3 Large Cap Tech Stocks For Dividend Investor...
4,3,132,3_etfs_etf_semiconductor_invest,"[etfs, etf, semiconductor, invest, stocks, inv...","[Can Semiconductor ETFs Continue Their Rally ,..."
5,4,100,4_intel_intc_invest_stock,"[intel, intc, invest, stock, cpus, ibm, analys...",[Intel Q3 2019 Earnings Preview Will INTC Sto...
6,5,80,5_nvda_nvidia_stock_buying,"[nvda, nvidia, stock, buying, buy, markets, ma...",[Should You Buy Nvidia NVDA Stock Before 201...
7,6,77,6_amd_intel_processors_micro,"[amd, intel, processors, micro, stock, ryzen, ...",[Shares Of Advanced Micro AMD Rally On New I...
8,7,72,7_nvidia_rises_nvda_11,"[nvidia, rises, nvda, 11, stock, good, , , , ]","[NVIDIA Rises 3 , NVIDIA Rises 3 , NVIDIA Rise..."
9,8,72,8_nvidia_falls_downgrade_disney,"[nvidia, falls, downgrade, disney, slips, rais...","[NVIDIA Falls 4 , NVIDIA Falls 4 , NVIDIA Fall..."


In [17]:
topic_model_title.visualize_topics()


In [18]:
topic_model_title.visualize_barchart(top_n_topics=32)