In [1]:
import Assignment2.df_filter_nvida
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.tokenize import word_tokenize
import re
from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [2]:
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


In [3]:
# Load the raw news-article dataset; the path is relative to the current working directory.
news_articles_data = pd.read_csv("../data/us_equities_news_dataset.csv")


In [4]:
# Keywords used to select NVIDIA-related articles.
nvidia_keywords = ['NVDA', 'NVIDIA']

# More keywords can be added to the list to widen the filter, but adding all of
# them increases the number of documents significantly, which makes processing
# slower and caused memory issues on the author's laptop:
#
# nvidia_keywords = [
#     'NVDA', 'NVIDIA', 'NIO', 'UBER', 'AMZN', 'AMAZON', 'TESLA', 'AI', 'GPU', 'GRAPHICS',
#     'CHIP', 'SEMICONDUCTOR', 'AUTONOMOUS', 'DRIVING', 'DEEP LEARNING', 'MACHINE LEARNING'
# ]
#
# NOTE(review): matching is by substring, so a short keyword such as 'AI' would
# also hit words like "said"; consider adding word boundaries before enabling
# the longer list.

# Build a single alternation pattern. Each keyword is escaped so that any regex
# metacharacters it contains are matched literally.
nvidia_pattern = '|'.join(re.escape(keyword) for keyword in nvidia_keywords)

# Keep the articles whose content contains any of the keywords
# (case-insensitive). Only the `content` column is searched; rows with missing
# content are excluded (na=False).
df_news = news_articles_data[
    news_articles_data['content'].str.contains(nvidia_pattern, case=False, na=False)
]

In [5]:
# Drop articles with identical content (keeping the first occurrence) and
# renumber the remaining rows from 0, then display the result.
df_news = (
    df_news
    .drop_duplicates(subset='content', keep='first')
    .reset_index(drop=True)
)
df_news

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221539,NIO,A Central Bank War Just Started And Its Good F...,opinion,ECB Effects\nThe move in the euro was huge fa...,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687
1,221547,NIO,6 Stocks To Watch Nivida Could Be Falling,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931
2,221572,NIO,Stocks Dow Drops Nearly 400 Points as Apple ...,news,Investing com A rout in Apple and Facebook ...,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042
3,221593,UBER,The Zacks Analyst Blog Highlights Advanced Mi...,opinion,For Immediate ReleaseChicago IL January 13 ...,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277
4,221597,UBER,The Best Of CES 2020 Revised,opinion,With 4 500 companies bringing their innovation...,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164
...,...,...,...,...,...,...,...,...,...
3436,442657,AMD,Here s Why Nvidia NVDA Stock Is Gaining Today,opinion,Shares of Nvidia NASDAQ NVDA are up nearly...,2016-09-27,Zacks Investment Research,https://www.investing.com/analysis/here's-why-...,200155860
3437,442682,AMD,4 Stocks To Watch Today ATW CWEI MXL SLCA,opinion,It was a pretty good start to the week on Mond...,2016-05-17,Harry Boxer,"https://www.investing.com/analysis/atw,-cwei,-...",200130262
3438,442705,AMD,Here s What The Buy Side Expects From AMD Thur...,opinion,Advanced Micro Devices Inc NYSE AMD is set ...,2014-04-17,Estimize,https://www.investing.com/analysis/here’s-what...,209915
3439,442984,T,Zacks com Featured Highlights AT T Nu Skin E...,opinion,For Immediate Release\n\nChicago IL July 22...,2016-07-21,Zacks Investment Research,https://www.investing.com/analysis/zacks.com-f...,200143537


## Preprocessing takes place in the following steps:
- Lowercasing the text
- Removing non-alphabetic characters
- Removing accents
- Tokenization
- Removing stopwords
- Removing the n% most frequent and the n% least frequent words (optional; skipped when n = 0)

In [6]:
def preprocess_texts_v1(texts, n=0):
    """Clean and tokenize raw documents.

    Each document is lowercased, passed through ``unidecode``, stripped of
    every character that is not an ASCII letter or whitespace, tokenized with
    ``word_tokenize`` and filtered against the NLTK English stop-word list.
    When ``n`` is positive, the ``n`` percent most frequent and the ``n``
    percent least frequent distinct words (counted over all documents) are
    removed as well.

    Args:
        texts: Iterable of documents. Entries that are not strings (``None``,
            NaN, ...) yield an empty token list, so the output stays aligned
            with the input.
        n: Percentage of the vocabulary to trim from each end of the
            frequency ranking. Values <= 0 disable trimming.

    Returns:
        A list with one list of tokens per input document, in input order.
    """
    stop_words = set(stopwords.words('english'))  # set for fast membership tests
    processed_texts = []

    for text in texts:
        if not isinstance(text, str):
            # Missing values cannot be lowercased; keep a placeholder so that
            # row i of the output still corresponds to row i of the input.
            processed_texts.append([])
            continue

        # Lowercase, transliterate, then drop everything but letters/whitespace.
        cleaned = re.sub(r'[^a-zA-Z\s]', '', unidecode(text.lower()))
        processed_texts.append(
            [token for token in word_tokenize(cleaned) if token not in stop_words]
        )

    if n > 0:
        word_freq = Counter(token for doc in processed_texts for token in doc)
        ranked = [word for word, _ in word_freq.most_common()]
        # Number of distinct words to trim from each end of the ranking.
        cutoff = min(int(n / 100 * len(ranked)), len(ranked))
        # Slice the tail from an explicit start index: ranked[-0:] would
        # select the whole list when cutoff is 0.
        trimmed = set(ranked[:cutoff]) | set(ranked[len(ranked) - cutoff:])

        processed_texts = [
            [token for token in doc if token not in trimmed]
            for doc in processed_texts
        ]

    return processed_texts

In [7]:
# Preprocess the filtered news content. n is left at its default of 0, so no
# frequency-based trimming of the vocabulary is applied.
processed_content = preprocess_texts_v1(df_news['content'].tolist())

In [8]:
# Persist the processed documents as CSV: one row per document, with its
# tokens joined back into a single space-separated string.
processed_df = pd.DataFrame(
    {'processed_content': list(map(' '.join, processed_content))}
)
processed_df.to_csv('../data/flsaw_processed_content.csv', index=False)

In [9]:
# Sanity check: print the token lists of the first five documents.
print(processed_content[:5])

[['ecb', 'effects', 'move', 'euro', 'huge', 'falling', 'pips', 'huge', 'important', 'piece', 'euro', 'broke', 'essential', 'support', 'wrote', 'thoughts', 'euro', 'could', 'even', 'fall', 'parity', 'fed', 'may', 'need', 'cut', 'rates', 'later', 'year', 'draghi', 'dovish', 'today', 'outlook', 'much', 'german', 'bunds', 'fell', 'around', 'bps', 'bps', 'year', 'fell', 'break', 'sends', 'year', 'back', 'december', 'lows', 'time', 'investors', 'come', 'realize', 'rates', 'remain', 'low', 'fed', 'need', 'stay', 'hold', 'avoid', 'dollar', 'strengthen', 'much', 'dollar', 'rise', 'kill', 'whatever', 'inflationary', 'forces', 'hurting', 'multinational', 'companies', 'result', 'may', 'fed', 'needs', 'cut', 'rates', 'late', 'keep', 'dollar', 'line', 'versus', 'euro', 'avoid', 'scenario', 'good', 'stocks', 'investors', 'move', 'risk', 'curve', 'low', 'interest', 'rate', 'foster', 'multiple', 'expansion', 'stocks', 'p', 'spy', 'nyse', 'spy', 'stocks', 'end', 'falling', 'p', 'dropping', 'roughly', 'b