In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy
import datetime as dt
import re

from sklearn.feature_extraction.text import TfidfVectorizer
import nmslib
# from transformers import pipeline

# uncomment for downloading spacy models
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_lg
# give it 2 min

In [6]:
# Load spaCy's small English pipeline ('en_core_web_sm'); the download command is noted in the imports cell.
nlp = spacy.load('en_core_web_sm')

In [14]:
# Full raw dump, kept commented out (original note: "7min" — presumably the load time).
# ATN = pd.read_csv('data/all-the-news-2-1.csv')

# ATN_stripped is read by the n-gram size-estimate cells further down, so it must be
# loaded here; otherwise a Restart & Run All fails with a NameError in those cells.
ATN_stripped = pd.read_csv('data/ATN_stripped2020.csv')
# Cleaned articles; 'date' is parsed to datetime at load time.
ATN_c = pd.read_csv('data/ATN_cleaned.csv', parse_dates=['date'])

In [3]:
# Preview the first five rows of the cleaned article frame (head() defaults to n=5).
ATN_c.head()

Unnamed: 0,date,title,article,section,publication
0,2019-08-08,'Game of Thrones' creators are heading to Netflix,Game of Thrones creators and showrunners David...,,Mashable
1,2019-08-07,"High-pressure cafeteria meme asks, 'where y'al...","You may be out of school, but one of the most ...",,Mashable
2,2019-08-07,Meghan Trainor and Daryl Sabara Down For 'Spy ...,Meghan Trainor and Daryl Sabara WANT IN on the...,,TMZ
3,2019-08-07,"'Beverly Hills, 90210' Cast -- 'Memba Them?!",Beverly Hills' most notorious gang has gotten ...,,TMZ
4,2019-08-07,NFL Rookie Daylon Mack's Truck Stuffed W/ Pack...,There's a rule for Baltimore Ravens rookies .....,,TMZ


In [140]:
# Ticker reference tables read from the SP500_* CSVs.
# ticker_associations has one row per ticker with colon-separated entries in
# 'Names' and 'Associations' (see the preview of its first rows below).
tickers_clean = pd.read_csv('data/SP500_tickers_clean.csv')
ticker_associations = pd.read_csv('data/SP500_ticker_associations.csv', header=0)

#### **Preprocess**

In [35]:
# Normalise titles: lowercase, then drop every character that is neither a word
# character nor whitespace. (Only 'title' is processed here, not 'article'.)
# The pattern is a raw string so \w and \s reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
ATN_c['title'] = ATN_c['title'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [252]:
# Positional rows 300-349 of the Business Insider subset.
ATN_c.loc[ATN_c['publication'] == 'Business Insider'].iloc[300:350]

Unnamed: 0,date,title,article,section,publication
66835,2019-05-20,beto orourkes bad polling numbers keep getting...,Beto O'Rourke is attempting to reboot his pres...,,Business Insider
66874,2019-05-20,felix gray bluelight blocking computer glasses...,Insider Picks writes about products and servic...,,Business Insider
66879,2019-05-20,kate middleton prince william share candid pho...,Prince William and Kate Middleton have shared ...,,Business Insider
66975,2019-05-20,bill hader talks dark season finale of barry g...,Warning: Major spoilers below if you have not ...,,Business Insider
67170,2019-05-19,game of throness john bradley west shared firs...,"John Bradley West, who plays Samwell Tarly on ...",,Business Insider
67181,2019-05-19,emilia clarke says emotional goodbye to game o...,"""Game of Thrones"" star Emilia Clarke, who play...",,Business Insider
67190,2019-05-19,times westerners were arrested in dubai over s...,The United Arab Emirates attracts millions of ...,,Business Insider
67325,2019-05-19,fran lebowitz regrets suggesting trump be turn...,The author and humorist Fran Lebowitz said she...,,Business Insider
67339,2019-05-19,alec baldwin led trumps entourage in dont stop...,Alec Baldwin appeared as President Donald Trum...,,Business Insider
67401,2019-05-18,7 of the least expensive cars to own,"Owning a car costs an average of $8,500 a year...",,Business Insider


In [191]:
# Preview the first five ticker rows (head() defaults to n=5).
ticker_associations.head()

Unnamed: 0,Ticker,Names,Associations
0,MO,Altria:Altria Group,Marlboro:Copenhagen:Juul
1,AMZN,Amazon Inc:Amazon.com,Amazon Prime:Kindle:Alexa
2,AMCR,Amcor:Amcor PLC,Plastic packaging:Resilient packaging
3,AMD,Advanced Micro Devices:AMD,Ryzen:Radeon
4,AEE,Ameren Corporation,Ameren Illinois:Ameren Missouri


In [146]:
# Turn the ticker / names / associations frame into one "document" per ticker:
# the three columns are concatenated, the ':' separators become spaces, and the
# text is lowercased and stripped of punctuation.
ticker_docs = (
    ticker_associations[['Ticker', 'Names', 'Associations']]
    .fillna('')  # otherwise astype(str) would inject the literal token 'nan' for missing cells
    .astype(str)
    .apply(' '.join, axis=1)
    .str.replace(':', ' ', regex=False)  # literal replacement, independent of the pandas-version default
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)  # raw string: no invalid-escape warning for \w / \s
)


In [147]:
# Each entry is one ticker's document (ticker, names and associations joined), ready to be vectorized.
ticker_docs

0        mo altria altria group marlboro copenhagen juul
1      amzn amazon inc amazoncom amazon prime kindle ...
2      amcr amcor amcor plc plastic packaging resilie...
3            amd advanced micro devices amd ryzen radeon
4      aee ameren corporation ameren illinois ameren ...
                             ...                        
478                   aes the aes corporation aes energy
479    a agilent technologies inc agilent gcms instru...
480    akam akamai technologies inc akamai intelligen...
481                 algn align technology inc invisalign
482        ko the cocacola company cocacola sprite fanta
Length: 483, dtype: object

#### **vectorize with TF-IDF**

In [154]:
# Character-level TF-IDF over 3- and 4-grams, fitted on the ticker documents.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 4))
ticker_corpus = ticker_docs.tolist()
ticker_association_vectors = vectorizer.fit_transform(ticker_corpus)

#### **Indexing vectors with NMSLIB**

In [13]:
# https://12ft.io/proxy?&q=https%3A%2F%2Ftowardsdatascience.com%2Fcomprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6
class NMSLIBIndex():
    """Small wrapper around an NMSLIB HNSW index that returns labels instead of row ids.

    Parameters
    ----------
    vectors : array or scipy sparse matrix
        One row per indexed item; cast to float32 on construction.
    labels : sequence or pandas Series
        Label of each row of ``vectors``, in the same order.
    space : str
        NMSLIB space name, e.g. ``'cosinesimil'`` or ``'cosinesimil_sparse'``.
    """
    def __init__(self, vectors, labels, space):
        self.dimension = vectors.shape[1]
        # Original (misspelled) attribute name kept so existing callers keep working.
        self.dimention = self.dimension
        self.vectors = vectors.astype('float32')
        self.labels = labels
        self.space = space
        self.index = None  # populated by build()

    def build(self, verbose=False):
        """Create the HNSW index and add all vectors to it.

        Sparse input (anything exposing ``tocsr``) is indexed with
        ``DataType.SPARSE_VECTOR``; without it NMSLIB initialises a dense index
        and a sparse space such as ``'cosinesimil_sparse'`` cannot be used.
        """
        init_kwargs = {'method': 'hnsw', 'space': self.space}
        data = self.vectors
        if hasattr(data, 'tocsr'):
            init_kwargs['data_type'] = nmslib.DataType.SPARSE_VECTOR
            data = data.tocsr()
        self.index = nmslib.init(**init_kwargs)
        self.index.addDataPointBatch(data)
        self.index.createIndex({'post': 2}, print_progress=verbose)

    def query(self, vector, k=10):
        """Return the labels of the ``k`` nearest neighbours of ``vector``.

        NOTE(review): for sparse indexes, confirm the query format NMSLIB expects
        for a single vector before relying on this method.

        Raises
        ------
        RuntimeError
            If ``build()`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError('NMSLIBIndex.build() must be called before query()')
        neighbour_ids, _distances = self.index.knnQuery(vector, k=k)
        # NMSLIB returns row positions; a pandas Series must therefore be read
        # positionally (.iloc), not by index label.
        positional = getattr(self.labels, 'iloc', self.labels)
        return [positional[i] for i in neighbour_ids]

# TODO: come back here and check whether this wrapper is even worth the effort...

In [None]:
# index = NMSLIBIndex(article_vectors, ATN_c['id'], 'cosinesimil_sparse')

In [229]:
# Dense cosine-similarity HNSW index over the ticker TF-IDF vectors.
# (A sparse index would instead need data_type=nmslib.DataType.SPARSE_VECTOR and a sparse space.)
index = nmslib.init(method='hnsw', space='cosinesimil')
# Make the ticker vectors dense. toarray() returns a plain ndarray, whereas
# todense() returns the legacy np.matrix type; float32 is the dtype the index works with.
t_a_v = ticker_association_vectors.toarray().astype('float32')
index.addDataPointBatch(t_a_v)
index.createIndex({'post': 2}, print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
*****************************************************



In [253]:
# Vectorize one article *title* (positional row 69945 of ATN_c) with the TF-IDF
# vectorizer that was fitted on the ticker documents.
# The vector has to be float32 for the conversion into nmslib's C++ layer;
# toarray()[0] gives a plain 1-D vector, whereas indexing an np.matrix stays 2-D.
article_vector = vectorizer.transform([ATN_c.iloc[69945]['title']]).toarray().astype('float32')[0]

# Find the 2 nearest ticker documents in the index.
indices, distances = index.knnQuery(vector=article_vector, k=2)


In [254]:
# Tickers of the matched ticker documents, looked up positionally in one step.
# (The former bare `indices, distances` line displayed nothing because it was not the last expression.)
print(ticker_associations['Ticker'].iloc[indices].tolist())
# The title that was used as the query.
ATN_c.iloc[69945]['title']

['AAPL', 'TROW']


'trade war apple iphone prices could increase due to tariffs'

In [None]:
"""

"the assumption here is that 'Apple' (or term) is a term that exists in the vocabulary of your TF-IDF vectorizer."
What do we do if the term/ticker we are searching for doesn't exist in the vocabulary?

The TF-IDF vectorizer uses unigrams (whole words) by default. Do we want this behaviour?
There is also a way to use several n-gram sizes for a wider search: TfidfVectorizer(ngram_range=(1, 2))


"""

#### **Creating data structures**

In [None]:
# Builds a dict keyed by 'id' with the 'date' column as values.
# NOTE(review): `articles` is not defined in any visible cell and the ATN_c preview
# shows no 'id' column, so this cell fails as written — confirm the intended frame and key.
article_dates = articles.set_index('id')['date'].to_dict() # TODO: revisit once the storage approach is decided


In [None]:
"""
There needs to be a way to keep track of which articles mention which tickers, together with each article's date and ID.
Then each article can have its sentiment score calculated and for each ticker and day, we can avg out the sentiment.
"""

## **Sentiment Analysis**

In [7]:
"""
Strategy: 
preprocess to remove redundant words: stopwords, punctuation, lowercase.
We might need NER for having multi word company names... or we do some magic with the alt names

https://bergvca.github.io/2017/10/14/super-fast-string-matching.html : provides some super fast name matching using cosine similarity on n-grams from TF-IDF

Each document is an article, so we look for the n-grams that are rare across documents but have a high TF in some documents.
That way we would know that those documents are talking about our n-gram. ???

each article is a document
TF-IDF vectorizer from sklearn
feed into NMSLIB
setup articles with ID and date attached? (can it be done?)
for each ticker, find the closest articles.
create dict with ticker key, list of articles as values.
for each mentioned article, create a sentiment score using FinBERT
for each ticker, create a pd.series that accumulates the sentiment scores of all articles for a day into 1 row in the series.
we now have 500 series, each with all dates in the period of our original article dataset
each series has a exp.decay func applied to each day, so it computes a new weighted score for the day based on the score of the last 2 weeks
each accumulated score column can be combined for all tickers, which should be transformed into long format for use with the financial dataset

"""

'\nStrategy: \npreprocess to remove redundant words: stopwords, punctuation, lowercase.\nWe might need NER for having multi word company names...\n\n'

In [8]:
def ngrams(string, n=3):
    """Return all overlapping character n-grams of ``string``.

    Characters that are neither word characters nor whitespace are removed
    first; whitespace is kept, so n-grams may span word boundaries. Strings
    shorter than ``n`` (after cleaning) yield an empty list.
    """
    cleaned = re.sub(r'[^\w\s]', r'', string)
    # One window per start position; no windows at all when n is not positive.
    window_count = len(cleaned) - n + 1 if n > 0 else 0
    return [cleaned[start:start + n] for start in range(window_count)]

# Demo: the label now matches the n actually passed to ngrams (it said "3-grams" while n=4).
print('All 4-grams in "McDonalds":')
ngrams('McDonalds', n=4)

All 3-grams in "McDonalds":


['McDo', 'cDon', 'Dona', 'onal', 'nald', 'alds']

In [15]:
# Total character count versus total number of 3-grams over all article bodies.
l = sum(len(text) for text in ATN_stripped.article)
ll = sum(len(ngrams(text, n=3)) for text in ATN_stripped.article)

print(f'len of articles: {l}, len of ngrams: {ll}')

len of articles: 611924452, len of ngrams: 592302728


In [18]:
# Same comparison for the titles, using 4-grams (the printed label still says "articles").
l = sum(len(text) for text in ATN_stripped.title)
ll = sum(len(ngrams(text, n=4)) for text in ATN_stripped.title)

print(f'len of articles: {l}, len of ngrams: {ll}')

len of articles: 12929260, len of ngrams: 12046816


In [13]:
# Distinct publication names in the stripped dataset.
ATN_publishers = ATN_stripped.publication.unique()
# Row count, via len() rather than the dunder call, and placed last so the notebook displays it.
len(ATN_stripped)

189978

In [14]:
# NOTE(review): back-of-envelope estimate with unexplained constants — presumably
# 1.7 s per 1,000 articles x ~190 thousand articles, converted to minutes; confirm and name the constants.
1.7*190/60

5.383333333333334