In [2]:
import pandas as pd
import json
import os

In [143]:
df = pd.read_csv('dataset_2018_01.csv')

In [144]:
df.head()

Unnamed: 0,title,published,text
0,Emerging markets are set for an even bigger ra...,2018-01-03,17 Hours Ago | 02:56 Emerging markets soared m...
1,Cramer reflects on how Trump's actions are fue...,2018-01-03,Cramer reflects on how Trump's actions are fue...
2,The Wall Street Journal: Peter Thiel’s VC firm...,2018-01-03,"Published: Jan 2, 2018 5:59 p.m. ET Share Few ..."
3,Hoda Kotb Will Replace Matt Lauer on NBC’s ‘To...,2018-01-02,By Reuters 8:23 AM EST Television host Hoda Ko...
4,UK's Compass says new CEO to start Jan 1 after...,2018-01-01,"January 1, 2018 / 8:56 AM / Updated 5 hours ag..."


In [146]:
df.shape

(57802, 3)

### Remove articles that are too short ot too long (<100, >100000)

In [154]:
df = df[(df['text'].str.len() >= 100) & (df['text'].str.len() <= 100000)]

In [155]:
df.shape

(56885, 3)

### NER model

In [None]:
# Preproccessing data

In [113]:
df['text'] = df['text'].astype(str)
# because of the TypeError: 'float' object is not iterable

In [114]:
non_ascii_values = df['text'].apply(lambda x: any(ord(char) > 128 for char in x))
print(non_ascii_values)

0        False
1        False
2         True
3         True
4         True
         ...  
57797     True
57798     True
57799     True
57800     True
57801     True
Name: text, Length: 57802, dtype: bool


In [115]:
def remove_non_ascii(text):
    return ''.join(char for char in text if ord(char) < 128)

df['text'] = df['text'].apply(remove_non_ascii)

In [116]:
non_ascii_values = df['text'].apply(lambda x: any(ord(char) > 128 for char in x))
print(non_ascii_values)

0        False
1        False
2        False
3        False
4        False
         ...  
57797    False
57798    False
57799    False
57800    False
57801    False
Name: text, Length: 57802, dtype: bool


In [None]:
# pre-trained English model for Named Entity Recognition (NER) 

In [117]:
import spacy
model_ner = spacy.load("en_core_web_sm")

In [118]:
def extract_org(text):
    doc = model_ner(text)
    orgs = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
    return ', '.join(orgs) if orgs else None

In [119]:
df['org_names'] = df['title'].apply(extract_org)

In [120]:
df = df.dropna(subset=['org_names'])

In [121]:
excluded_names = {"bloomberg", "reuters", "cnbc", "nbc", "the wall street journal", "wsj", "fortune", "nbc news today", "trump"}

In [122]:
df = df[~df['org_names'].str.lower().str.contains('|'.join(excluded_names))] # ~ operator negates the condition

In [123]:
df = df[df['org_names'].str.count(',') == 0]

In [124]:
df.head()

Unnamed: 0,title,published,text,org_names
9,Intel's alleged security flaw could cost chipm...,2018-01-03,A report alleging Intel chips have a security ...,Intel
21,"Tesla delivers 1,550 Model 3 sedans and 29,870...",2018-01-03,Tesla is apparently still deep in the circles ...,Tesla
26,"US looks good in 2018, but stocks abroad look ...",2018-01-02,"US looks good in 2018, but stocks abroad look ...",Invesco
28,China's WeChat denies storing user chats,2018-01-02,"January 2, 2018 / 7:00 AM / Updated 11 hours a...",WeChat
43,Weight Watchers rallies after DJ Khaled signin...,2018-01-02,Shares of Weight Watchers rose nearly 6 percen...,Weight Watchers


In [125]:
df.shape

(23537, 4)

In [126]:
# unique_org_names_df = df['org_names'].drop_duplicates().reset_index(drop=True)
# unique_org_names_df = pd.DataFrame(unique_org_names_df, columns=['org_names'])

In [127]:
#df.to_csv('df_after_ner.csv', index=False)

### Adding tickers. Fuzzy matching at scale

In [3]:
# US company tickers taken from https://www.sec.gov/file/company-tickers
company_ticker = pd.read_json('company_tickers.json', orient='index')
company_ticker = company_ticker[['ticker', 'title']]
company_ticker.head()

Unnamed: 0,ticker,title
0,MSFT,MICROSOFT CORP
1,AAPL,Apple Inc.
2,NVDA,NVIDIA CORP
3,AMZN,AMAZON COM INC
4,GOOGL,Alphabet Inc.


In [5]:
df = pd.read_csv('df_after_ner.csv')

In [6]:
import re
from ftfy import fix_text
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [7]:
# cleaning and splitting into ngrams
def ngrams(string, n=3):
    string = str(string)
    string = fix_text(string) # remove encoding mix-ups
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [9]:
org_name_clean = company_ticker['title'].unique()

In [11]:
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(org_name_clean)
print('Vecorizing completed')

Vecorizing completed...


In [12]:
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf)

In [13]:
org_column = 'org_names' #column to match against in the messy data
unique_org = set(df[org_column].values) # set used for increased performance

In [14]:
# matching query:
def getNearestN(query):
    queryTFIDF_ = vectorizer.transform(query)
    distances, indices = nbrs.kneighbors(queryTFIDF_)
    return distances, indices

In [15]:
import time
t1 = time.time()
print('getting nearest n...')
distances, indices = getNearestN(unique_org)
t = time.time()-t1
print("COMPLETED IN:", t)

getting nearest n...
COMPLETED IN: 3.6471893787384033


In [16]:
unique_org = list(unique_org) # need to convert back to a list
print('finding matches...')
matches = []
for i,j in enumerate(indices):
    temp = [round(distances[i][0],2), company_ticker.values[j][0][1],unique_org[i]]
    matches.append(temp)
    
print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['match_confidence_lower_is_better','matched_name','origional_name'])
print('Done')

finding matches...
Building data frame...
Done


In [109]:
matches.head()

Unnamed: 0,match_confidence_lower_is_better,matched_name,origional_name
0,1.16,ING GROEP NV,WealthEngine Creates Advisory Board
1,0.34,"NEXTERA ENERGY PARTNERS, LP",NextEra Energy Partners
2,1.17,Greenpro Capital Corp.,BRIEF-Stenprop
3,0.79,HSBC HOLDINGS PLC,Southern California Super Lawyers'
4,1.0,Lenzing AG/ADR,Aerie Pharmaceuticals Announces


In [20]:
matches_filtered = matches[matches['match_confidence_lower_is_better'] < 0.78]
matches_filtered

Unnamed: 0,match_confidence_lower_is_better,matched_name,origional_name
1,0.34,"NEXTERA ENERGY PARTNERS, LP",NextEra Energy Partners
15,0.75,HUGOTON ROYALTY TRUST,BRIEF-Hugoton Royalty Trust Declares
21,0.15,"Borqs Technologies, Inc.",Borqs Technologies
29,0.72,INDEPENDENT BANK CORP,Independent Bank Corporation Reports
39,0.77,WIPRO LTD,Wipro Limited
...,...,...,...
13638,0.00,Airbus SE/ADR,GasLog Ltd.
13663,0.73,"CRA INTERNATIONAL, INC.",WTA International
13669,0.26,PROCTER & GAMBLE Co,Procter and Gamble
13690,0.60,TSAKOS ENERGY NAVIGATION LTD,Tsakos Energy Navigation Announces


In [22]:
matches_filtered = matches_filtered.merge(company_ticker, left_on='matched_name', right_on='title', how='left').drop(columns=['title'])

In [23]:
matches_filtered.shape

(2478, 4)

In [24]:
matches_filtered.tail()

Unnamed: 0,match_confidence_lower_is_better,matched_name,origional_name,ticker
2473,0.26,PROCTER & GAMBLE Co,Procter and Gamble,PG
2474,0.6,TSAKOS ENERGY NAVIGATION LTD,Tsakos Energy Navigation Announces,TNP
2475,0.6,TSAKOS ENERGY NAVIGATION LTD,Tsakos Energy Navigation Announces,TNP-PE
2476,0.6,TSAKOS ENERGY NAVIGATION LTD,Tsakos Energy Navigation Announces,TNP-PF
2477,0.61,"Adverum Biotechnologies, Inc.",BRIEF-Adverum Biotechnologies,ADVM


In [25]:
matches_filtered2 = matches_filtered.drop_duplicates(subset='matched_name', keep='first')

In [26]:
matches_filtered2.tail()

Unnamed: 0,match_confidence_lower_is_better,matched_name,origional_name,ticker
2458,0.74,"PNC FINANCIAL SERVICES GROUP, INC.",PNC Financial,PNC
2468,0.32,"Delek Logistics Partners, LP",Delek Logistics Partners,DKL
2469,0.0,ALAMO GROUP INC,Alamo Group Inc.,ALG
2474,0.6,TSAKOS ENERGY NAVIGATION LTD,Tsakos Energy Navigation Announces,TNP
2477,0.61,"Adverum Biotechnologies, Inc.",BRIEF-Adverum Biotechnologies,ADVM


In [34]:
matches_filtered2.shape

(1158, 4)

In [36]:
master = df.copy()

In [37]:
master.head()

Unnamed: 0,title,published,text,org_names
0,Intel's alleged security flaw could cost chipm...,2018-01-03,A report alleging Intel chips have a security ...,Intel
1,"Tesla delivers 1,550 Model 3 sedans and 29,870...",2018-01-03,Tesla is apparently still deep in the circles ...,Tesla
2,"US looks good in 2018, but stocks abroad look ...",2018-01-02,"US looks good in 2018, but stocks abroad look ...",Invesco
3,China's WeChat denies storing user chats,2018-01-02,"January 2, 2018 / 7:00 AM / Updated 11 hours a...",WeChat
4,Weight Watchers rallies after DJ Khaled signin...,2018-01-02,Shares of Weight Watchers rose nearly 6 percen...,Weight Watchers


In [38]:
master = pd.merge(master, matches_filtered2, left_on='org_names', right_on='origional_name', how='inner')
master.dropna(subset=['origional_name'], inplace=True)

In [43]:
master = master.drop(columns=['origional_name', 'match_confidence_lower_is_better'])

In [44]:
master.head()

Unnamed: 0,title,published,text,org_names,matched_name,ticker
0,Intel's alleged security flaw could cost chipm...,2018-01-03,A report alleging Intel chips have a security ...,Intel,INTEL CORP,INTC
1,Intel CEO: We believe we have the right fixes ...,2018-01-03,Intel CEO Brian Krzanich said researchers at G...,Intel,INTEL CORP,INTC
2,Intel just warned that its patches can cause p...,2018-01-12,Intel on Thursday said that recently-issued pa...,Intel,INTEL CORP,INTC
3,"Intel could breakout on earnings, here’s how t...",2018-01-24,"Chipmaker Intel reports earnings on Thursday, ...",Intel,INTEL CORP,INTC
4,Intel set to report earnings after the bell,2018-01-25,Intel stock rose more than 4.5 percent after t...,Intel,INTEL CORP,INTC


In [128]:
# master.to_csv('df_with_tickers.csv', index=False)

### BERT

In [129]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

In [130]:
tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [131]:
def sentiment_score(text):
    texts = tokenizer.encode(text, return_tensors='pt')
    result = model(texts)
    return int(torch.argmax(result.logits))+1

In [132]:
master['sentiment'] = master['text'].apply(lambda x: sentiment_score(x[:512]))

In [133]:
master.head()

Unnamed: 0,title,published,text,org_names,matched_name,ticker,sentiment
0,Intel's alleged security flaw could cost chipm...,2018-01-03,A report alleging Intel chips have a security ...,Intel,INTEL CORP,INTC,1
1,Intel CEO: We believe we have the right fixes ...,2018-01-03,Intel CEO Brian Krzanich said researchers at G...,Intel,INTEL CORP,INTC,1
2,Intel just warned that its patches can cause p...,2018-01-12,Intel on Thursday said that recently-issued pa...,Intel,INTEL CORP,INTC,2
3,"Intel could breakout on earnings, here’s how t...",2018-01-24,"Chipmaker Intel reports earnings on Thursday, ...",Intel,INTEL CORP,INTC,1
4,Intel set to report earnings after the bell,2018-01-25,Intel stock rose more than 4.5 percent after t...,Intel,INTEL CORP,INTC,1


In [158]:
master.shape

(2493, 7)

### Article novelty 

In [160]:
from datetime import timedelta
master['published'] = pd.to_datetime(master['published'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  master['published'] = pd.to_datetime(master['published'])


In [172]:
novelty = master.copy()

In [180]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [181]:
def compute_similarity(text1, text2):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([text1, text2])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return cosine_sim

In [182]:
def check_similarity(row1, row2):
    if row1['matched_name'] == row2['matched_name']:
        delta_days = abs((row1['published'] - row2['published']).days)
        if delta_days <= 14:
            similarity = compute_similarity(row1['text'], row2['text'])
            return similarity
    return None

In [177]:
import time
t1 = time.time()
print('Started...')

rows_to_delete = np.zeros(len(novelty), dtype=bool)

# iterate through df, compare the current row with subsequent rows only
for i, row1 in novelty.iterrows():
    for j, row2 in novelty.iloc[i+1:].iterrows():
        similarity = check_similarity(row1, row2)
        if similarity is not None and similarity < 0.7:
            rows_to_delete[i] = True
            break  
            
# remove marked rows 
novelty = novelty[~rows_to_delete]

t = time.time()-t1
print("COMPLETED IN:", t)

Started...
COMPLETED IN: 69.44305109977722


In [178]:
master.shape

(2493, 7)

In [195]:
novelty.head()

Unnamed: 0,title,published,text,org_names,matched_name,ticker,sentiment
10,Intel's alleged security flaw could cost chipm...,2018-01-03,Intel's alleged security flaw could cost chipm...,Intel,INTEL CORP,INTC,1
90,Intel Promotes Four Corporate Officers,2018-01-31,NEWS HIGHLIGHTS Intel promotes Leslie S. Culbe...,Intel,INTEL CORP,INTC,4
91,"Tesla delivers 1,550 Model 3 sedans and 29,870...",2018-01-03,Tesla is apparently still deep in the circles ...,Tesla,"Tesla, Inc.",TSLA,1
92,Cramer Remix: Tesla’s stock is up for one simp...,2018-01-09,Cramer Remix: Teslas stock is up for one simpl...,Tesla,"Tesla, Inc.",TSLA,2
103,Tesla's Model 3 deliveries fall short of estim...,2018-01-04,Tesla's Model 3 deliveries fall short of estim...,Tesla,"Tesla, Inc.",TSLA,1


In [183]:
# novelty.to_csv('df_with_novelty.csv', index=False)

### Retreiveing data from API

In [189]:
master02 = novelty.copy()

In [190]:
# open date - day after publication, close date - 3 days after publication
def get_open_price(ticker, published_date):
    open_date = published_date + timedelta(days=1)
    end_date = published_date + timedelta(days=2) # not included
    ticker_data = yf.download(ticker, start=open_date, end=end_date)
    if not ticker_data.empty:
        return ticker_data['Open'][0]
    else:
        return None

def get_close_price(ticker, published_date):
    
    for i in range(3):  # try up to 3 days after the published_date in case it's weekened 
        close_date = published_date + timedelta(days=i+3)
        end_date = published_date + timedelta(days=i+4)
        ticker_data = yf.download(ticker, start=close_date, end=end_date)
        if not ticker_data.empty:
            return ticker_data['Close'][0]
    return None 

In [191]:
t1 = time.time()

master02['open_price'] = master02.apply(lambda row: get_open_price(row['ticker'], row['published']), axis=1)
master02['close_price'] = master02.apply(lambda row: get_close_price(row['ticker'], row['published']), axis=1)

t = time.time()-t1
print("COMPLETED IN:", t)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

1 Failed download:
['MBLY']: Exception("%ticker%: Data doesn't exist for startDate = 1515042000, endDa

COMPLETED IN: 588.6957583427429





In [192]:
master02.head()

Unnamed: 0,title,published,text,org_names,matched_name,ticker,sentiment,open_price,close_price
10,Intel's alleged security flaw could cost chipm...,2018-01-03,Intel's alleged security flaw could cost chipm...,Intel,INTEL CORP,INTC,1,43.52,44.740002
90,Intel Promotes Four Corporate Officers,2018-01-31,NEWS HIGHLIGHTS Intel promotes Leslie S. Culbe...,Intel,INTEL CORP,INTC,4,47.700001,44.52
91,"Tesla delivers 1,550 Model 3 sedans and 29,870...",2018-01-03,Tesla is apparently still deep in the circles ...,Tesla,"Tesla, Inc.",TSLA,1,20.858,22.427334
92,Cramer Remix: Tesla’s stock is up for one simp...,2018-01-09,Cramer Remix: Teslas stock is up for one simpl...,Tesla,"Tesla, Inc.",TSLA,2,22.146667,22.414667
103,Tesla's Model 3 deliveries fall short of estim...,2018-01-04,Tesla's Model 3 deliveries fall short of estim...,Tesla,"Tesla, Inc.",TSLA,1,21.108,22.427334


In [193]:
#master02.to_csv('df_with_price.csv', index=False)

In [None]:
# t1 = time.time()
# t = time.time()-t1
# print("COMPLETED IN:", t)