In [2]:
import Assignment2.df_filter_nvida as df_filter_nvida

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer



from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, BaseRepresentation
from bertopic.vectorizers import ClassTfidfTransformer
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehdigreefhorst/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
news_articles_data = pd.read_csv("../data/us_equities_news_dataset.csv")
stock_data = pd.read_csv("../data/NVDA.csv")
nvida_news_articles_df = df_filter_nvida.filter_df_to_nvida(df=news_articles_data,
                                                            related_tickers="NVDA")
df_news = df_filter_nvida.add_nvidia_increase_decrease_bool_to_df(df_news=nvida_news_articles_df,
                                                                  df_stock_data=stock_data)
df_news = df_news.drop_duplicates(subset='content', keep='first').reset_index(drop=True)
df_news

nan
nan
nan
nan
nan
nan
nan
nan


Unnamed: 0,id,ticker,ticker_NVDA_improved,title,title_NVIDIA_topic,category,content,content_NVIDIA_topic,release_date,provider,url,article_id,NVIDIA_stock_increase
0,221539,NIO,True,A Central Bank War Just Started And Its Good F...,False,opinion,ECB Effects\nThe move in the euro was huge fa...,True,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,0
1,221547,NIO,True,6 Stocks To Watch Nivida Could Be Falling,False,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,True,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,0
2,221572,NIO,True,Stocks Dow Drops Nearly 400 Points as Apple ...,False,news,Investing com A rout in Apple and Facebook ...,True,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,0
3,221593,UBER,True,The Zacks Analyst Blog Highlights Advanced Mi...,True,opinion,For Immediate ReleaseChicago IL January 13 ...,True,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,0
4,221597,UBER,True,The Best Of CES 2020 Revised,False,opinion,With 4 500 companies bringing their innovation...,True,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3436,442657,AMD,True,Here s Why Nvidia NVDA Stock Is Gaining Today,True,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,True,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860,1
3437,442682,AMD,True,4 Stocks To Watch Today ATW CWEI MXL SLCA,False,opinion,It was a pretty good start to the week on Mond...,True,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262,1
3438,442705,AMD,True,Here s What The Buy Side Expects From AMD Thur...,False,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,True,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915,1
3439,442984,T,True,Zacks com Featured Highlights AT T Nu Skin E...,True,opinion,For Immediate Release\n\nChicago IL July 22...,True,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537,0


In [5]:
def get_standard_bertopic(model_setting_index=0, customer_model_settings = None):
    """
    each of the steps in bertopic are assigned using the model settings, in the future this can be replaced with a grid search system to evaluate different model settings, but this is the most basic version with standard parameters. each of the model settings is in a list
    :param model_setting_index: defines which of the hyperparamter settings to pick
    :return: 
    """
    model_settings = [{
        "language": "english",
        "nr_topics": None,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    },
    {
        "language": "english",
        "nr_topics": 25,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    },
    {
        "language": "english",
        "nr_topics": 25,
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(),
        "clustering": HDBSCAN(min_cluster_size=15),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    },
    {
        "language": "english",
        "nr_topics": "auto",
        "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
        "reduce_dimensionality": UMAP(min_dist=0.05, spread=0.5),
        "clustering": HDBSCAN(min_cluster_size=15),
        "topic_tokenization": CountVectorizer(stop_words="english"),
        "topic_representation": ClassTfidfTransformer(),
        "representation_model": KeyBERTInspired()
    }
    ]
    
    if customer_model_settings is not None:
        for key in model_settings[0].keys():
            if key not in customer_model_settings.keys():
                raise Exception(f"Missing key: {key}")
            
            
    # Step 1 - Extract embeddings
    embedding_model = model_settings[model_setting_index]["document_embedding"]
    
    # Step 2 - Reduce dimensionality
    umap_model = model_settings[model_setting_index]["reduce_dimensionality"]

    # Step 3 - Cluster reduced embeddings
    hdbscan_model = model_settings[model_setting_index]["clustering"]

    # Step 4 - Tokenize topics
    vectorizer_model = model_settings[model_setting_index]["topic_tokenization"]

    # Step 5 - Create topic representation
    ctfidf_model = model_settings[model_setting_index]["topic_representation"]

    # Step 6 - (Optional) Fine-tune topic representations with
    # a `bertopic.representation` model
    representation_model = model_settings[model_setting_index]["representation_model"]
    
    language = model_settings[model_setting_index]["language"]

    # All steps together
    topic_model = BERTopic(
        language=language,
        nr_topics=model_settings[model_setting_index]["nr_topics"],
        embedding_model=embedding_model,          # Step 1 - Extract embeddings
        umap_model=umap_model,                    # Step 2 - Reduce dimensionality
        hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
        vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
        ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
        representation_model=representation_model # Step 6 - (Optional) Fine-tune topic represenations
    )
    return topic_model

def train_bertopic_model(documents, model_setting_index=0, customer_model_settings = None):
    """
    trains the bertopic model from the documents that are given as a parameter
    :param documents: an array of documents to get topics made of
    :param model_setting_index: specifies which batch of hyperparameters to pick that define bertopic. Standard = 0
    :return: 
    """
    topic_model = get_standard_bertopic(model_setting_index=model_setting_index, 
                                        customer_model_settings=customer_model_settings)
    topics, props = topic_model.fit_transform(documents)
    return topic_model, topics, props
    
def get_topics_that_relate_to_nvidia(topic_model):
    """
    this function finds the topics that relate most to the input words given which is NVIDIA
    :param topic_model: 
    :return: 
    """
    similar_topics, similarities = topic_model.find_topics("NVIDIA", top_n=3)
    for topic_nr, similarity in zip(similar_topics, similarities):
        print(f"topic_nr: {topic_nr} | similarity to NVIDIA: {similarity}")
        print(f"words with similarities for topic:")
        print(topic_model.get_topic(topic_nr))
        

def get_coherence(df, _topic_model, _topics_numbers):
    # add the new topic numbers to the "topic" column
    df['topic'] = _topics_numbers
    documents_per_topic = df.groupby(['topic'], as_index=False).agg({'content': ' '.join})
    cleaned_docs = _topic_model._preprocess_text(documents_per_topic.content.values)
    
    # Extract vectorizer and analyzer from BERTopic
    vectorizer = _topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()
    
    
    # Extract features for Topic Coherence evaluation
    #words = vectorizer.get_feature_names()
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[words for words, _ in _topic_model.get_topic(topic)] 
                   for topic in range(len(set(df["topic"]))-1)]
    
    # Evaluate
    coherence_model = CoherenceModel(topics=topic_words, 
                                     texts=tokens, 
                                     corpus=corpus,
                                     dictionary=dictionary, 
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()
    return coherence

def get_topic_diversity(topic_model):
    unique_words = set()
    total_words = 0
    
    for topic_nr, topic_similarity_word_list in topic_model.topic_representations_.items():
        # the first topic is not a topic, these are the outlier documents, so we should disregard these
        if topic_nr == -1:
            continue
        for topic_word, similarity in topic_similarity_word_list:
            
            total_words += 1
            unique_words.add(topic_word)
            
    topic_diversity = len(unique_words)/total_words
    return topic_diversity


def show_bertopic_evaluations(custom_bert_params):
    _topic_model_content, _topics_numbers, _probs_numbers = train_bertopic_model(df_news["content"],
                                                                                 model_setting_index=-1,
                                                                                 customer_model_settings=custom_bert_params)
    print("total number of topics = ", len(_topic_model_content.get_topic_info()))
    print("documents without a topic = ", _topic_model_content.get_topic_info(-1)["Count"].values[0])
    print("topic diversity = ", get_topic_diversity(_topic_model_content))
    print("topic coherence = ", get_coherence(df_news, _topic_model_content, _topics_numbers))
    _topic_model_content.visualize_topics().show()
    _topic_model_content.visualize_barchart(top_n_topics=32).show()
    return _topic_model_content, _topics_numbers, _probs_numbers


In [6]:


custom_bert_parameters = {
    "language": "english",
    "nr_topics": "auto",
    "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
    "reduce_dimensionality": UMAP(min_dist=0.05),
    "clustering": HDBSCAN(min_cluster_size=15),
    "topic_tokenization": CountVectorizer(stop_words="english"),
    "topic_representation": ClassTfidfTransformer(reduce_frequent_words=True),
    "representation_model": KeyBERTInspired()
}
show_bertopic_evaluations(custom_bert_parameters)

total number of topics =  35
documents without a topic =  1005
topic diversity =  0.48823529411764705


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

topic coherence =  0.528434313314104


(<bertopic._bertopic.BERTopic at 0x3198752b0>,
 [14,
  -1,
  0,
  -1,
  -1,
  -1,
  -1,
  4,
  4,
  3,
  0,
  -1,
  0,
  4,
  -1,
  -1,
  0,
  4,
  0,
  -1,
  4,
  4,
  4,
  -1,
  -1,
  -1,
  0,
  0,
  24,
  4,
  -1,
  4,
  0,
  -1,
  -1,
  4,
  0,
  0,
  26,
  0,
  -1,
  -1,
  33,
  -1,
  -1,
  -1,
  4,
  24,
  1,
  10,
  -1,
  4,
  10,
  10,
  -1,
  10,
  10,
  0,
  5,
  16,
  16,
  0,
  10,
  10,
  15,
  10,
  10,
  0,
  22,
  -1,
  0,
  0,
  -1,
  -1,
  13,
  -1,
  22,
  22,
  10,
  22,
  -1,
  -1,
  -1,
  8,
  11,
  14,
  -1,
  8,
  -1,
  -1,
  0,
  16,
  -1,
  -1,
  22,
  1,
  -1,
  0,
  -1,
  10,
  22,
  -1,
  22,
  1,
  1,
  29,
  -1,
  18,
  1,
  1,
  -1,
  29,
  -1,
  -1,
  1,
  1,
  18,
  1,
  18,
  18,
  0,
  18,
  1,
  18,
  1,
  18,
  -1,
  18,
  18,
  11,
  15,
  18,
  -1,
  -1,
  0,
  0,
  1,
  0,
  4,
  1,
  11,
  -1,
  14,
  26,
  22,
  -1,
  0,
  0,
  1,
  -1,
  1,
  -1,
  1,
  1,
  2,
  10,
  16,
  31,
  31,
  1,
  1,
  22,
  -1,
  2,
  1,
  1,
  1,
  1,
  4,
  1,
 

In [7]:


custom_bert_parameters = {
    "language": "english",
    "nr_topics": "auto",
    "document_embedding": SentenceTransformer("all-MiniLM-L6-v2"),
    "reduce_dimensionality": UMAP(min_dist=0.05, spread=0.5),
    "clustering": HDBSCAN(min_cluster_size=15),
    "topic_tokenization": CountVectorizer(stop_words="english"),
    "topic_representation": ClassTfidfTransformer(reduce_frequent_words=True),
    "representation_model": KeyBERTInspired()
}
show_bertopic_evaluations(custom_bert_parameters)

total number of topics =  24
documents without a topic =  970
topic diversity =  0.6


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

topic coherence =  0.5806442463758856


(<bertopic._bertopic.BERTopic at 0x3890e63c0>,
 [0,
  -1,
  0,
  -1,
  -1,
  -1,
  -1,
  2,
  2,
  1,
  0,
  0,
  -1,
  2,
  0,
  0,
  -1,
  2,
  -1,
  0,
  2,
  2,
  2,
  -1,
  -1,
  -1,
  0,
  0,
  15,
  2,
  0,
  2,
  0,
  -1,
  -1,
  2,
  -1,
  0,
  0,
  -1,
  -1,
  -1,
  2,
  -1,
  2,
  -1,
  2,
  15,
  0,
  0,
  -1,
  2,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  8,
  8,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  -1,
  0,
  0,
  -1,
  -1,
  7,
  -1,
  0,
  0,
  0,
  0,
  -1,
  -1,
  -1,
  0,
  -1,
  0,
  0,
  -1,
  -1,
  -1,
  0,
  8,
  -1,
  -1,
  0,
  0,
  -1,
  -1,
  -1,
  0,
  0,
  -1,
  0,
  0,
  0,
  6,
  0,
  10,
  0,
  -1,
  11,
  -1,
  -1,
  6,
  0,
  0,
  10,
  -1,
  10,
  10,
  -1,
  10,
  0,
  10,
  -1,
  10,
  -1,
  10,
  10,
  6,
  5,
  10,
  -1,
  -1,
  0,
  0,
  0,
  -1,
  2,
  -1,
  6,
  0,
  0,
  0,
  0,
  0,
  -1,
  0,
  0,
  0,
  0,
  -1,
  0,
  0,
  16,
  0,
  8,
  21,
  21,
  0,
  -1,
  0,
  -1,
  1,
  0,
  0,
  0,
  0,
  2,
  0,
  0,
  0,
  0,
  0,
  -1,
  0,
  -1