In [3]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy
import datetime as dt
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import nmslib
# from transformers import pipeline

# uncomment for downloading spacy models
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg
# give it 2 min

In [181]:
def lemmatize_text(text, nlp):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

def extract_entities(text, nlp):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == 'ORG']

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
# ATN = pd.read_csv('data/all-the-news-2-1.csv')
# 7min

ATN_c = pd.read_csv('data/ATN_stripped2020.csv')
# ATN_c = pd.read_csv('data/ATN_cleaned.csv', parse_dates=['date'])

In [6]:
ATN_c.head(5)
# ATN_stripped.head(1)

Unnamed: 0,date,title,article,section,publication
0,2020-04-02,A Man Admitted To Trying To Crash A Train Into...,The journalists at BuzzFeed News are proud to ...,,Buzzfeed News
1,2020-04-01,Ruth Bader Ginsburg Still Working Out with Tra...,Here's some good news we can all use ... Ruth ...,,TMZ
2,2020-04-01,Coronavirus Victim: 24-Year-Old Silvia Deyanir...,The journalists at BuzzFeed News are proud to ...,,Buzzfeed News
3,2020-04-01,Daily Telegraph Gives Chinese Coronavirus Prop...,The journalists at BuzzFeed News are proud to ...,,Buzzfeed News
4,2020-04-01,Coronavirus Pandemic: An American Is Trapped I...,Trenton Thurber The journalists at BuzzFeed Ne...,,Buzzfeed News


In [7]:
tickers_clean = pd.read_csv('data/SP500_tickers_clean.csv')
ticker_associations = pd.read_csv('data/SP500_ticker_associations.csv', header=0)

#### **Preprocess**

**For articles**  
remove stopwords and punctuation
only keep the relevant columns (Date, Title, Article)

**For ticker associations**  
remove stopwords and punctuation
concatenate and join words to one string


In [8]:
# for articels and titles
ATN_c['title'] = ATN_c['title'].str.lower().str.replace('[^\w\s]', '', regex=True)

In [9]:
ATN_c.iloc[1128]

date                                                  2020-03-30
title          intel chipmakers may skip big layoffs because ...
article        This story is available exclusively on Busines...
section                                                      NaN
publication                                     Business Insider
Name: 1128, dtype: object

In [10]:
ticker_associations.head(3)

Unnamed: 0,Ticker,Names,Associations
0,MO,Altria:Altria Group,Marlboro:Copenhagen:Juul
1,AMZN,Amazon Inc:Amazon.com,Amazon Prime:Kindle:Alexa
2,AMCR,Amcor:Amcor PLC,Plastic packaging:Resilient packaging


In [160]:
# make the ticker, name, association df into a list of "documents", where each ticker has concatenated all its names and associations
ticker_docs = ticker_associations[['Ticker', 'Names', 'Associations']].astype(str).apply(' '.join, axis=1)
ticker_docs = ticker_docs.str.replace(':', ' ').str.lower().str.replace('[^\w\s]', '', regex=True)
# remove stop words
ticker_docs = ticker_docs.apply(lambda x: ' '.join([word for word in x.split() if word not in (STOP_WORDS)]))
# only include each word once per line
# ticker_docs = ticker_docs.apply(lambda x: ' '.join(set(x.split())))
# ticker_docs = ticker_associations['Names'].astype(str).str.lower().str.replace(':', ' ').str.replace('[^\w\s]', '', regex=True)

In [161]:
# here, each ticker and assoc. is a document, ready to be vectorized
ticker_docs

0        mo altria altria group marlboro copenhagen juul
1      amzn amazon inc amazoncom amazon prime kindle ...
2      amcr amcor amcor plc plastic packaging resilie...
3            amd advanced micro devices amd ryzen radeon
4      aee ameren corporation ameren illinois ameren ...
                             ...                        
478                       aes aes corporation aes energy
479    agilent technologies inc agilent gcms instruments
480    akam akamai technologies inc akamai intelligen...
481                 algn align technology inc invisalign
482            ko cocacola company cocacola sprite fanta
Length: 483, dtype: object

In [188]:
article_docs = ATN_c['title'].astype(str).str.lower().str.replace('[^\w\s]', '', regex=True)

#### **vectorize with TF-IDF**

In [189]:
vectorizer = TfidfVectorizer(ngram_range=(1,3), analyzer='word') # 3/4-grams for char-level
ticker_association_vectors = vectorizer.fit_transform(article_docs.tolist())
# 17min

#### **Indexing vectors with NMSLIB**

In [211]:
index = nmslib.init(method='hnsw', space='cosinesimil_sparse', data_type=nmslib.DataType.SPARSE_VECTOR, dtype=nmslib.DistType.FLOAT)
# make the ticker vectors dense
# t_a_v = ticker_association_vectors.todense()
index.addDataPointBatch(ticker_association_vectors)
index.createIndex({'post': 2}, print_progress=True)
index.setQueryTimeParams({'efSearch': 300, 'algoType': 'old'})


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
*******************************************************

In [164]:
article_lemmatized = ATN_c['title'].iloc[1128:1129].apply(lemmatize_text)
# doc = nlp(ATN_c.article.iloc[1128])

In [152]:
'intel' in ATN_c['title'].iloc[1128].lower()

True

In [192]:
# print(ticker_association_vectors[0])
# vectorizer.vocabulary_
# Get the first document in the sparse matrix
doc_vector = ticker_association_vectors[226]

# Convert the sparse matrix row to a dense array
doc_array = doc_vector.toarray().flatten()

# Get the vocabulary and create a reverse mapping from index to term
vocab = vectorizer.vocabulary_
reverse_vocab = {index: term for term, index in vocab.items()}

# Create a list of tuples (term, score) and sort it by score in descending order
scores = [(reverse_vocab[i], score) for i, score in enumerate(doc_array)]
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Print the top 10 terms by TF-IDF score
for term, score in sorted_scores[:10]:
    print(f"{term}: {score}")

KeyboardInterrupt: 

In [213]:
# Let's say `article` is the text of an article you want to analyze.
# article_vector = vectorizer.transform(article_lemmatized.tolist())
article_vector = vectorizer.transform(['intel corporation'])
# article_dense = article_vector.todense().astype('float32')[0]
# article_vector = article_vector.astype('float32') #! HAS TO BE FLOAT32 for c++ conversion

# Find the 5 nearest neighbors in the index.
# indices, distances = index.knnQuery(vector=article_vector, k=10)
indices, distances = index.knnQueryBatch(index, article_vector, k=10, num_threads=4)

TypeError: knnQueryBatch(): incompatible function arguments. The following argument types are supported:
    1. (self: nmslib.dist.FloatIndex, queries: object, k: int = 10, num_threads: int = 0) -> object

Invoked with: <nmslib.FloatIndex method='hnsw' space='cosinesimil_sparse' at 0x7f7df2e2b5e0>, <nmslib.FloatIndex method='hnsw' space='cosinesimil_sparse' at 0x7f7df2e2b5e0>, <1x1763856 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>; kwargs: k=10, num_threads=4

In [199]:
# Convert the sparse matrix row to a dense array
doc_array = article_vector.toarray().flatten()

# Get the vocabulary and create a reverse mapping from index to term
vocab = vectorizer.vocabulary_
reverse_vocab = {index: term for term, index in vocab.items()}

# Create a list of tuples (term, score) and sort it by score in descending order
scores = [(reverse_vocab[i], score) for i, score in enumerate(doc_array)]
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Print the top 10 terms by TF-IDF score
for term, score in sorted_scores[:20]:
    print(f"{term}: {score}")

corporation: 0.7374060750007629
intel: 0.6754496693611145
00: 0.0
00 mm: 0.0
00 mm 13: 0.0
00 mm 14: 0.0
00 qq: 0.0
00 qq 11: 0.0
00 qq reuters: 0.0
0000: 0.0
0000 senior: 0.0
0000 senior notes: 0.0
00003: 0.0
0008: 0.0
0008 per: 0.0
0008 per share: 0.0
001: 0.0
001 as: 0.0
001 as fed: 0.0
0012: 0.0


In [175]:
threshold = 0.9
# Filter out the results that are below a certain threshold.
filtered_indices = [i for i, d in zip(indices, distances) if d > threshold]

In [176]:
distances

array([0.809136  , 0.887751  , 0.8990367 , 0.9172862 , 0.94079536,
       0.94723755, 0.9586839 , 0.9589516 , 0.9598353 , 0.9607994 ],
      dtype=float32)

In [173]:
# print([article_lemmatized.iloc[i] for i in indices])
# article_lemmatized.iloc[indices[0]]

In [177]:
print([ticker_associations.iloc[i]['Ticker'] for i in filtered_indices])
ATN_c.iloc[1128]['title']

['BKR', 'MCD', 'MGM', 'PH', 'QCOM', 'MTCH', 'LVS']


'intel chipmakers may skip big layoffs because demand will snap back'

In [78]:
tfidf_scores = np.squeeze(np.asarray(article_vector))
vocabulary = vectorizer.vocabulary_
sorted_ngrams = sorted(vocabulary.items(), key=lambda x: tfidf_scores[x[1]], reverse=True)
sorted_ngrams[:5]

[('production and', 6113),
 ('wall street', 7565),
 ('lam research', 4751),
 ('mo altria', 5240),
 ('altria altria', 265)]

In [71]:
# Reverse the vocabulary dictionary
index_to_ngram = {v: k for k, v in vectorizer.vocabulary_.items()}

# Get the indices of the n-grams with the highest TF-IDF scores
top_ngram_indices = np.argsort(tfidf_scores)[::-1]

# Get the n-grams corresponding to these indices
top_ngrams = [(index_to_ngram[i], tfidf_scores[i]) for i in top_ngram_indices]

# Print the top 20 n-grams
print(top_ngrams[:10])

[('e ', 0.1685521), (' t', 0.15543188), ('th', 0.13725233), (' th', 0.1311991), (' a', 0.12768699), ('he ', 0.10906077), ('ma', 0.105170794), ('the', 0.09971617), ('t ', 0.099340156), ('re', 0.099340156)]


In [None]:
"""

"the assumption here is that 'Apple' (or term) is a term that exists in the vocabulary of your TF-IDF vectorizer."
What do we do if the term/ticker we are searching for doesnt exist in the vocabulary?..

TF-IDF vectorizer uses unigrams (whole words) as default. Do we want this behaviour? 
there's also a way to set multiple ngrams for wider search i guess.: TfidfVectorizer(ngram_range=(1, 2))


"""

#### **Creating data structures**

In [None]:
article_dates = articles.set_index('id')['date'].to_dict() # TODO: get back here when i figured storing out


In [None]:
"""
There needs to be a way to keep track of which articles mention what tickers, and the articles date and ID. 
Then each article can have its sentiment score calculated and for each ticker and day, we can avg out the sentiment.
"""

## **Sentiment Analysis**

In [7]:
"""
Strategy: 
preprocess to remove redundant words: stopwords, punctuation, lowercase.
We might need NER for having multi word company names... or we do some magic with the alt names

https://bergvca.github.io/2017/10/14/super-fast-string-matching.html : provides some super fast name matching using cosine similarity on n-grams from TF-IDF

Each documents is an article, so we find the ngrams that are rare across documents but some documents then have a high TF. 
This way we would know that document is talking about our ngram. ???

each artcle is a document
TF-IDF vectorizer from sklearn
feed into NMSLIB
setup articles with ID and date attached? (can it be done?)
for each ticker, find the closest articles.
create dict with ticker key, list of articles as values.
for each mentioned article, create a sentiment score using FinBERT
for each ticker, create a pd.series that accumulates the sentiment scores of all articles for a day into 1 row in the series.
we now have 500 series, each with all dates in the period of our original article dataset
each series has a exp.decay func applied to each day, so it computes a new weighted score for the day based on the score of the last 2 weeks
each accumulated score column can be combined for all tickers, which should be transformed into long format for use with the financial dataset

"""

'\nStrategy: \npreprocess to remove redundant words: stopwords, punctuation, lowercase.\nWe might need NER for having multi word company names...\n\n'

In [8]:
def ngrams(string, n=3):
    string = re.sub(r'[^\w\s]',r'', string) # remove all non-words
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in "McDonalds":')
ngrams('McDonalds', n=4)

All 3-grams in "McDonalds":


['McDo', 'cDon', 'Dona', 'onal', 'nald', 'alds']

In [15]:
l = 0
ll = 0
for article in ATN_stripped.article:
    l += len(article)
    ll += len(ngrams(article, n=3))

print(f'len of articles: {l}, len of ngrams: {ll}')

len of articles: 611924452, len of ngrams: 592302728


In [18]:
l = 0
ll = 0
for article in ATN_stripped.title:
    l += len(article)
    ll += len(ngrams(article, n=4))

print(f'len of articles: {l}, len of ngrams: {ll}')

len of articles: 12929260, len of ngrams: 12046816


In [13]:
ATN_stripped.__len__()
ATN_publishers = ATN_stripped.publication.unique()

189978

In [14]:
1.7*190/60

5.383333333333334