# Natural Language Processing  - Jupyter Notebook
### Cecilia, Conor, Francesco 
December 2019

# Importation of packages

Best practice: create a new python virtual environemnt and run the setup.py file provided in the project repository. Then run the notebook using the virtual environment. This should install everything required.

This project requires the modules found in the Requirements.txt file, which can be installed directly onto the current python installation via eg pip install -r Requirements.txt. It also requires the spacy model "en_core_web_sm" which can be installed via python -m spacy download en_core_web_sm after spacy in installed.

Anaconda installation may differ. In particular "conda forge install textacy" install of "pip install textacy".

This script should be run from the root of the project directory.

In [1]:
import copy
from gensim.models import word2vec
import itertools
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tag import StanfordNERTagger
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn import cluster, metrics
from sklearn import neighbors, datasets
from sklearn.cluster import KMeans
import snowballstemmer
import spacy
from string import ascii_lowercase
import textacy

This project also requires a jdk installation. Below are examples for MacOS and Windows. Edit this cell with a path to your jdk installation.

In [2]:
# Enter your username in here. I've put the paths you wrote as an attribute in the dictionary below
user = 'Conor'

# Dictionary containing the Java paths for each user
java_paths = {
    'Conor': '/usr/lib/jvm/java-8-openjdk-amd64',
    'Francesco': '/Users/macbookpro/Downloads/jdk-13.0.1.jdk/Contents/Home/bin/java',
    'Cecilia': 'C:/Program Files/Java/jdk-13.0.1/bin/java.exe'
}

java_path = java_paths[user]

os.environ['JAVAHOME'] = java_path

# Importing and Cleaning the data

In [3]:
tickers = pd.read_json('data/word_vectors/SP500_Tickers.json', typ='series')

header = pd.read_csv('headline_scraping/full_headlines.csv', sep = ",")
header = header.drop_duplicates('Headline')
header[(header['Date']=='31/10/19') & (header['Headline'].str.contains('goods rivals'))].loc[41725,'Headline']
header

Unnamed: 0.1,Unnamed: 0,Headline,Date,Provider
0,0,"​ UDAY states see Rs 11,989-crore drop in inte...",20/03/17,Economic Times India
1,1,Income Tax India: 1 taxpayer owes 11% of India...,24/01/17,Economic Times India
2,2,$10 billion unhedged gap in foreign exchange d...,17/10/16,Economic Times India
3,3,10% cost reduction can add $5.5 billion to Ind...,19/01/17,Economic Times India
5,5,​10% ethanol blending can save $1.7 billion in...,11/08/15,Economic Times India
...,...,...,...,...
65587,67548,Sir Sandy Crombie to join RBS board | The Inde...,23/05/09,The Independent
65588,67549,'We had a lot of fun': Sir Stephen Nickell ref...,25/01/17,The Independent
65589,67550,Sir Tom calls time on Dobbies interest | The I...,21/05/08,The Independent
65590,67551,Sir Victor Blank joins list for M&amp;S chairm...,16/05/10,The Independent


In [4]:
# Get a list of words to not include in headlines (improves SVO extraction accuracy)
stemmer = snowballstemmer.EnglishStemmer()
stop = stopwords.words('english')
stoplist = stemmer.stemWords(stop)
stoplist = set(stoplist)
stop = set(sorted(stop + list(stoplist)))

In [5]:
# remove characters and stoplist words, then generate dictionary of unique words
data = header

data['Original_Headline'] = data['Headline']
data['Headline'].replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\$&]','',inplace=True,regex=True)

extensions_list = ['reuters','Reuters','bloomberg', 'may','also','could','would', 'na','&amp','gets', 'getting', 'get','must','might','may','across','among','beside','however','yet','within']+list(ascii_lowercase)

stop = stop.union(set(extensions_list))

wordlist = filter(None, " ".join(list(set(list(itertools.chain(*data['Headline'].str.split(' ')))))).split(" "))

data['Headline'] = [' '.join(filter(None,filter(lambda word: word not in stop, line))) for line in data['Headline'].str.lower().str.split(' ')]

header = data

In [6]:
ticker = tickers.to_frame('Ticker').reset_index()
ticker.columns=['Name','Ticker']
ticker

Unnamed: 0,Name,Ticker
0,AGILENT TECHNOLOGIES,A
1,AMERICAN AIRLINES,AAL
2,ADVANCE AUTO PARTS,AAP
3,APPLE,AAPL
4,ABBVIE,ABBV
...,...,...
495,XYLEM,XYL
496,YUM! BRANDS,YUM
497,ZIMMER BIOMET HOLDINGS,ZBH
498,ZIONS BANCORPORATION NA,ZION


In [8]:
# Create a list of all the words/tickers of the company's names in the S&P500 
#In order to select only the headlines that concern the S&P500 companies. 

company_names = ticker['Name'].to_list()
ticker_list = ticker['Ticker'].to_list()

company_regex = str1 = '|'.join(ticker_list).lower()
company_regex = company_regex.replace( '&', ' ',)
company_regex = company_regex.replace( '   ', '&',)
company_regex = company_regex.replace( '  ', '&',)
company_regex = company_regex.replace( ' ', '&',)
company_regex = company_regex.replace( '-', '&',)

print(company_regex)

a|aal|aap|aapl|abbv|abc|abmd|abt|acn|adbe|adi|adm|adp|ads|adsk|aee|aep|aes|afl|agn|aig|aiv|aiz|ajg|akam|alb|algn|alk|all|alle|alxn|amat|amcr|amd|ame|amg|amgn|amp|amt|amzn|anet|anss|antm|aon|aos|apa|apd|aph|aptv|are|arnc|ato|atvi|avb|avgo|avy|awk|axp|azo|ba|bac|bax|bbt|bby|bdx|ben|bf.b|bhge|biib|bk|bkng|blk|bll|bmy|br|brk.b|bsx|bwa|bxp|c|cag|cah|cat|cb|cboe|cbre|cbs|cci|ccl|cdns|cdw|ce|celg|cern|cf|cfg|chd|chrw|chtr|ci|cinf|cl|clx|cma|cmcsa|cme|cmg|cmi|cms|cnc|cnp|cof|cog|coo|cop|cost|coty|cpb|cpri|cprt|crm|csco|csx|ctas|ctl|ctsh|ctva|ctxs|cvs|cvx|cxo|d|dal|dd|de|dfs|dg|dgx|dhi|dhr|dis|disck|dish|dlr|dltr|dov|dow|dre|dri|dte|duk|dva|dvn|dxc|ea|ebay|ecl|ed|efx|eix|el|emn|emr|eog|eqix|eqr|es|ess|etfc|etn|etr|evrg|ew|exc|expd|expe|exr|f|fang|fast|fb|fbhs|fcx|fdx|fe|ffiv|fis|fisv|fitb|flir|fls|flt|fmc|foxa|frc|frt|fti|ftnt|ftv|gd|ge|gild|gis|gl|glw|gm|googl|gpc|gpn|gps|grmn|gs|gww|hal|has|hban|hbi|hca|hcp|hd|hes|hfc|hig|hii|hlt|hog|holx|hon|hp|hpe|hpq|hrb|hrl|hsic|hst|hsy|hum|ibm|ice|idxx|i

In [9]:
# # Continue only with headlines which contain a company name or ticker
valid = pd.DataFrame(header['Headline'].str.contains(company_regex, regex=True))
valid
# len(good_indexes)
# header = header[]

Unnamed: 0,Headline
0,True
1,True
2,True
3,True
5,True
...,...
65587,True
65588,True
65589,True
65590,True


In [10]:
valid_indices = header[valid['Headline'] == True].reset_index()['index']
header = header.loc[valid_indices,:].reset_index()#[['Date','Header']]
# index

header

Unnamed: 0.1,index,Unnamed: 0,Headline,Date,Provider,Original_Headline
0,0,0,​ uday states see rs crore drop interest cost ...,20/03/17,Economic Times India,"​ UDAY states see Rs 11,989-crore drop in inte..."
1,1,1,income tax india taxpayer owes indias individu...,24/01/17,Economic Times India,Income Tax India: 1 taxpayer owes 11% of India...
2,2,2,billion unhedged gap foreign exchange deposit ...,17/10/16,Economic Times India,$10 billion unhedged gap in foreign exchange d...
3,3,3,cost reduction add billion indias trade revenu...,19/01/17,Economic Times India,10% cost reduction can add $5.5 billion to Ind...
4,5,5,​ ethanol blending save billion forex india re...,11/08/15,Economic Times India,​10% ethanol blending can save $1.7 billion in...
...,...,...,...,...,...,...
62518,65587,67548,sir sandy crombie join rbs board independent,23/05/09,The Independent,Sir Sandy Crombie to join RBS board | The Inde...
62519,65588,67549,lot fun sir stephen nickell reflects austerity...,25/01/17,The Independent,'We had a lot of fun': Sir Stephen Nickell ref...
62520,65589,67550,sir tom calls time dobbies interest independent,21/05/08,The Independent,Sir Tom calls time on Dobbies interest | The I...
62521,65590,67551,sir victor blank joins list mamps chairman ind...,16/05/10,The Independent,Sir Victor Blank joins list for M&amp;S chairm...


# SVO and Name Entity Recognition

In [11]:
# Load Spacy en_core_web_sm nlp model for SVO extraction
nlp = spacy.load('en_core_web_sm')

st = StanfordNERTagger(
    'ner/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'ner/stanford-ner-2014-06-16/stanford-ner.jar',
    encoding = 'utf-8'
)
COMPANY_TYPES = ['PERSON', 'ORGANIZATION']

def get_ticker(name, tickers):
    for key, val in tickers.items():
        # Check if this is a ticker
        if val.upper() == name.upper():
            return f'__{val}'
        
        # Check that this is 
        if name.upper() in key.upper():
            return f'__{val}'
    return None

def parse_header(header,tickers,orig):
    '''Attempt to replace all organisations in a header with their ticker'''
    header = header.lower()
    nlp_header = nlp(header)

    tokens = list(textacy.extract.subject_verb_object_triples(nlp_header))  

    parsed_words = []
    for i in range(len(tokens)):
        ticker_0 = get_ticker(str(tokens[i][0]), tickers)
        ticker_2 = get_ticker(str(tokens[i][2]), tickers)  
        sub = str(tokens[i][0])
        obj = str(tokens[i][2])
        tick_sub = ''
        tick_obj = ''
        if ticker_0 is not None:
            sub = ticker_0
            tick_sub = 'sub'
        if ticker_2 is not None:
            obj = ticker_2
            tick_obj = 'obj'
        verb = str(tokens[i][1])
        tick = tick_sub + tick_obj
        
        if (ticker_0 != None) | (ticker_2 != None):
            parsed_words.append([sub, verb, obj, tick, header,orig])
        
    return parsed_words

In [None]:
svo_df = pd.DataFrame(columns = ['Date', 'Sub', 'Verb', 'Obj', 'Tick_Type', 'Headlines', 'Original Headline'])

for i in range (len(header['Headline'])):
    #print(i)
    headline = header.loc[i,'Headline']
    orig_head = header.loc[i,'Original_Headline'] 
    date = header.loc[i,'Date']
    svo_headlines = parse_header(headline,tickers,orig_head)
    
    for ls in svo_headlines:
        svo_df.loc[len(svo_df)] =  [date] + ls
    
svo_df

In [None]:
test = svo_df
svo_df['Date'] = pd.to_datetime(svo_df['Date'])

svo_df = svo_df.sort_values(by='Date')

length_training = int(np.floor(len(svo_df)*80/100))
date = svo_df['Date'].values
critical_date = date[length_training]
print(critical_date)
# print([x for x in svo_df['Verb'] if len(x.split(' ')) > 1])
# length_training = int(np.floor(len(svo_df)*80/100))
training = svo_df[svo_df['Date']<=critical_date]
test_data = svo_df[svo_df['Date']>critical_date]
training
# print((training.index))
# svo_df
#training.loc[40,'Obj']

#print(type(test.loc[1,'Date']))

# Word2Vec

In [None]:
model=word2vec.Word2Vec.load('word_vectors/models/model_1')
X = model[model.wv.vocab]

In [None]:
num_clusters = 100
kmeans = cluster.KMeans(n_clusters=num_clusters)
kmeans.fit(X)

## Create phrases vectors
#### (Concatenating the vectors)

In [None]:
data = []

to_parse = {
    'obj': 'Sub',
    'sub': 'Obj',
    'subobj': 'Verb'
}

for i in training.index:
    entry = {}
    for key in ['Date', 'Sub', 'Verb','Obj', 'Tick_Type', 'Headlines', 'Original Headline']:
        entry[key] = training.loc[i, key]
        
#     print(entry['Verb'])
    
    try:
        verb_vector = model.wv[entry['Verb'].upper()]
    except:
        verb_vector = None
    key_to_parse = to_parse[entry['Tick_Type']]
    
    try:
        other_vector = model.wv[entry[key_to_parse].upper()]
    except:
        other_vector = None
        
    if verb_vector is not None and other_vector is not None:
        final_vector = np.concatenate((verb_vector, other_vector))
        entry['final_vector'] = final_vector
    else:
        entry['final_vector'] = None
    
    data.append(entry)
    


In [None]:
print(data)

# Determine which tickers are the most quoted by articles

In [None]:
print(len(data))
print(data)
tickers_list = []
for x in data:
    if (x['Tick_Type'] == 'sub')&(x['final_vector'] is not None):
        tickers_list.append(x['Sub'])
    elif (x['Tick_Type'] == 'obj')&(x['final_vector'] is not None):
        tickers_list.append(x['Obj'])

#print(tickers_list)

from collections import Counter
counts = dict(Counter(tickers_list))

to_parse = {'tickers': [], 'counts': []}
for ticker, count in counts.items():
    to_parse['tickers'].append(ticker)
    to_parse['counts'].append(count)
print(to_parse)
    

ticker_counts = pd.DataFrame(to_parse)
ticker_counts = ticker_counts.sort_values(by = 'counts', ascending = False)
ticker_counts1 = ticker_counts[ticker_counts['counts']>5]
plt.figure(figsize = [20,10])
plt.scatter(ticker_counts1['tickers'], ticker_counts1['counts'])
plt.xticks(rotation = 'vertical')
#print(json.dumps(counts, indent = 2))
ticker_counts.head()
# plt.figure(figsize=(10,5))
# chart = sns.countplot(
#     data = ticker_counts,
#     x='tickers',
#     y = 'counts',
#     palette='Set1'
# )
# chart.set_xticklabels(chart.get_xticklabels(), rotation=45)

# Clustering phrases vectors

In [None]:
relevant_data = [x for x in data if x['final_vector'] is not None]

In [None]:
# vocab = [x['final_vector'] for x in data if x['final_vector'] is not None]

vocab = [x['final_vector'] for x in relevant_data]
num_clusters = 100
kmeans = cluster.KMeans(n_clusters = num_clusters)
kmeans.fit(vocab)
print(len(vocab))

# Mapping

In [None]:
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
print(len(labels))
# print(centroids)

In [None]:
for i in range(len(relevant_data)):
    relevant_data[i]['cluster_label'] = labels[i]

In [None]:
n_neighbors = 5
y = labels

h = .02  # step size in the mesh

for weights in ['uniform', 'distance']:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights, algorithm = 'ball_tree')
    clf.fit(vocab, y)

In [None]:
print(clf.fit(vocab, y))

In [None]:
A = clf.kneighbors([vocab[2]],n_neighbors)
A
# print(type([vocab[2]]))

# Returns study

In [None]:
returnsdf = pd.read_csv('data/returns/returnsSP500.csv')
returnsdf

# print(len(returnsdf.columns))
# print('GOOG' in returnsdf.columns)

In [None]:
print(len(returnsdf.columns))
#print(returnsdf.columns[22:100])

In [None]:
params = pd.read_csv('data/SP500_parameters.csv')
params['Ticker_b']=params['Ticker']
params['Ticker']='__'+params['Ticker']
params
# params.loc[50:100,'Ticker_b']


In [None]:
params[params['Ticker_b']=='FLS']

'FLS' in returnsdf.columns

In [None]:

ticker_counts.columns=['Ticker','count']
ticker_counts = ticker_counts.merge(params[['Ticker','GICS']],on = 'Ticker')


## Which industries are the most tackled by headlines 
##### in order to have enough data to study

In [None]:
GICS_count = ticker_counts.groupby(['GICS']).agg(['sum'])
GICS_count =GICS_count.sort_values(by = ('count','sum'),ascending=False)
# GICS_count
# plt.plot(GICS_count[('count','sum')])

plt.figure(figsize = [20,10])
# plt.plot(GICS_count[('count','sum')], marker = 'o')
GICS_count[('count','sum')].plot(kind='bar')
# plt.bar(GICS_count['count'], GICS_count['sum'])

plt.xlabel('GICS Industry Classification')
plt.ylabel('Number of Articles')

# plt.xticks(rotation = 'horizontal')

# GICS_count = GICS_count.sort_values(by = 'count', ascending = False)

In [None]:
industries = set(params['GICS'].values)

dataframe_industry_dict = {}
for industry in industries:
    industry_tickers = params.loc[params['GICS'] == industry, 'Ticker']
    industry_tickers = industry_tickers.str.replace('__','')
    industry_tickers = industry_tickers.str.replace('.','-')
    
    industry_tickers = list(set(industry_tickers) & set(returnsdf.columns))
    df_industry = returnsdf[industry_tickers + ['Date']]
    
    df_industry['market_exp'] = df_industry[industry_tickers].mean(axis=1)
    df_industry = df_industry.iloc[1:,:]#.dropna()
    cols = df_industry.columns
    cols = cols[-2:].append(cols[:(len(cols)-2)])
    df_industry = df_industry[cols]
    dataframe_industry_dict[industry] = df_industry
#print(len(healthy_ticker))

#print(len(df_health.columns))
# df_health
dataframe_industry_dict['Financials']


# LIBOR 3M importation

In [None]:
libor = pd.read_excel('data/LIBOR_3M.xlsx')
libor['LIBOR 3M'] = libor['LIBOR 3M']/100
libor['Date'] = pd.to_datetime(libor['Date'])
libor

In [None]:
for industry in industries:
    test = dataframe_industry_dict[industry]
    test['Date'] = pd.to_datetime(test['Date'])
    test = test.merge(libor,on = 'Date')
    
    cols = test.columns
    cols = cols[-1:].append(cols[:(len(cols)-1)])
    test = test[cols]
    
    dataframe_industry_dict[industry] = test
    
dataframe_industry_dict['Financials']

# Computation of abnormal returns

In [None]:
abret_dict = {}
for industry in industries:
    df_returns = dataframe_industry_dict[industry]
    capm = pd.DataFrame(df_returns['Date'])
    risk_free_rate = df_returns['LIBOR 3M']
    market_avg_returns = df_returns['market_exp']
    
    abret_dict[industry] = pd.DataFrame(df_returns['Date'])
    
    for col in df_returns.columns[3:]:
        col = col.replace('-', '.')
        
        beta = params.loc[params['Ticker_b'] == col]['Beta'].values[0]
        stock_returns = returnsdf[col.replace('.', '-')]
        
        capm[col] = risk_free_rate + beta*(market_avg_returns-risk_free_rate)
        
        abret_dict[industry][col] = stock_returns - capm[col]
        abret_dict[industry] = abret_dict[industry].where((pd.notnull(abret_dict[industry])), None)
        
abret_dict['Financials']

In [None]:
# Define a lookup function to return, for a given ticker, 
# start_date and horizon, the cumulative average returns over that period

def lookup_cumul_returns(ticker, start_date, horizon):
    '''
    Args:
    start_date (pd.Timestamp)
    '''
    
    try:
        industry_classification = params.loc[params['Ticker_b'] == ticker, 'GICS'].values[0]
    except:
        print(ticker)
        return None
        
    relevant_df = abret_dict[industry_classification]
    
    start_index_values = relevant_df.loc[pd.to_datetime(relevant_df['Date']) >= pd.Timestamp(start_date),:].index.values
    
    if len(start_index_values) == 0:
        return None
    else:
        start_index = start_index_values[0]

    end_index = start_index + horizon
    
    if end_index > len(relevant_df):
        return None
    
    relevant_returns = relevant_df.loc[start_index + 1:end_index, ticker]
    
    to_return = relevant_returns.cumsum().values[-1] / horizon
    
    if np.isnan(to_return):
        return None
    
    else:
        return to_return
    
output = lookup_cumul_returns('A', '2010-01-04', 3)
print(output)

In [None]:
horizons = [3, 5, 10]

print(list(relevant_data[0].keys()))

# Add a column with the referenced ticker for each headline
for i, el in enumerate(relevant_data):
    lookup_col = el['Tick_Type']
    if lookup_col in ['sub', 'obj']:
        el['Ticker'] = el[lookup_col.capitalize()]
    else:
        el['Ticker'] = None
        
    relevant_data[i] = el
    
relevant_data = [x for x in relevant_data if x['Ticker'] is not None]
    
# {
#     3 {
#         'headlines': [(Sub, Verb, Obj), (), ...],
#         '3 day': [],
#         '5 day': []
#     }
# }

# Set up the data structure. Each entry looks like the above
cluster_labels = set([x['cluster_label'] for x in relevant_data])

cluster_sentiments = {}
for label in cluster_labels:
    cluster_sentiments[label] = {
        'svos': [],
        'headlines': [],
    }
    for horizon in horizons:
        cluster_sentiments[label]['{}_days'.format(horizon)] = []       
        
for el in relevant_data:
    svo = (el['Sub'], el['Verb'], el['Obj'])
    start_date = el['Date']
    ticker = el['Ticker'].replace('__', '')
    
    cluster_label = el['cluster_label']
    headline = el['Original Headline']
    
    cluster_sentiments[cluster_label]['svos'].append(svo)
    cluster_sentiments[cluster_label]['headlines'].append(headline)
    
    for horizon in horizons:
        horizon_cumul_returns = lookup_cumul_returns(ticker, start_date, horizon)
        cluster_sentiments[cluster_label]['{}_days'.format(horizon)].append(horizon_cumul_returns)        
        

In [None]:
parsed_sentiments = copy.deepcopy(cluster_sentiments)

for label in cluster_sentiments:
    for horizon in horizons:
        entries = cluster_sentiments[label]['{}_days'.format(horizon)]
        non_null_entries = [x for x in entries if x is not None]
        parsed_sentiments[label]['{}_days'.format(horizon)] = np.mean(non_null_entries)

# Remove any clusters which have all nan values
all_nan_labels = []
for label, contents in parsed_sentiments.items():
    all_nan = True
    for horizon in horizons:
        if not np.isnan(contents['{}_days'.format(horizon)]):
            all_nan = False
            break
            
    if all_nan:
        all_nan_labels.append(label)
        
for label in all_nan_labels:
    del parsed_sentiments[label]
        
cluster_labels = list(parsed_sentiments.keys())

In [None]:
for label, contents in parsed_sentiments.items():
    print('cluster {}'.format(label))
    for horizon in [3,5,10]:
#         print(contents)
        print(contents['{}_days'.format(horizon)])

In [None]:
small_days_sentiment=[]
medium_days_sentiment = []
large_days_sentiment = []
for x in cluster_labels:
    small_days_sentiment.append(1000*parsed_sentiments[x]['3_days'])
    medium_days_sentiment.append(parsed_sentiments[x]['5_days'])
    large_days_sentiment.append(parsed_sentiments[x]['10_days'])

# fig,ax = plt.subplots(figsize = (20,4))
fig, ax = plt.subplots()
# fig(figsize = (20,20))
plt.scatter(small_days_sentiment,np.zeros(len(cluster_labels)))
label_graph=[str(list(cluster_labels)[i]) for i in range(len(cluster_labels))]
plt.xlim(-9,7)
for i,txt in enumerate(label_graph):
    ax.annotate(txt,(small_days_sentiment[i],np.zeros(len(cluster_labels))[i]))


## FORECASTING

In [None]:
def clean(headlines, stopwords):
    '''
    Args:
    headlines: pd.Series
    '''
    headlines = headlines.str.lower()
    
    headlines.replace('[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’”“′‘\\\$&]','', inplace = True, regex=True)
    extensions_list = ['co','reuters','Reuters','bloomberg', 'may','also','could','would', 'na','&amp','gets', 'getting', 'get','must','might','may','across','among','beside','however','yet','within']+list(ascii_lowercase)

    stopwords = stopwords.union(set(extensions_list))

    wordlist = filter(None, " ".join(list(set(list(itertools.chain(*headlines.str.split(' ')))))).split(" "))

    headlines = [' '.join(filter(None,filter(lambda word: word not in stopwords, line))) for line in headlines.str.lower().str.split(' ')]
    
    return headlines

In [None]:
def closest_cluster_sentiment(headline, tickers):
    '''
    Returns the closest cluster to the new headline
    
    Args:
    headline (str)
    clusters (dict)
    '''
    
    n_neighors = 5
    
    # Get SVO of the new headline
    cleaned_headline = clean(pd.Series([headline]), stop)
    ticker_info = parse_header1(cleaned_headline[0], tickers, None)
    
    headline_sentiment = {}
    
    headline_sentiment = {
        '3_days': [],
        '5_days': [],
        '10_days': []
    }
    
    if len(ticker_info) != 0:
        try :
            for ls in ticker_info:
                ticker_key = ls[3]

                if ticker_key == 'obj':
                    ticker = ls[2]
                    svo = [ls[0],ls[1]]
                    svo_vec = [np.concatenate((model.wv[ls[1].upper()], model.wv[ls[0].upper()]))]

                if ticker_key == 'sub':
                    ticker = ls[0]
                    svo = [ls[1],ls[2]]
                    svo_vec = [np.concatenate((model.wv[ls[1].upper()], model.wv[ls[2].upper()]))]


                close_neighbors = clf.kneighbors(svo_vec,n_neighbors)
                clusters_list = [labels[close_neighbors[1][0][i]] for i in range(n_neighors)]
                distances_list = [close_neighbors[0][0][i] for i in range(n_neighors)]

                closest_cluster_label = clusters_list[0]
                for horizon in horizons:
                    closest_cluster_horizon_sentiment = parsed_sentiments[closest_cluster_label]['{}_days'.format(horizon)]
                    headline_sentiment['{}_days'.format(horizon)] = closest_cluster_horizon_sentiment
                
        except:
            return None
                
    return headline_sentiment

In [None]:
def inverse_distance_sentiment(headline, parsed_sentiments, tickers, labels, horizons):
    '''
    Returns the closest cluster to the new headline
    
    Args:
    headline (str)
    clusters (dict)
    tickers (dict): list of company name: ticker pairs
    horizons (iterable): iterable of horizon lengths to consider
    '''
    p = 2
    n_neighbours = len(labels)
    
    # Get SVO of the new headline
    cleaned_headline = clean(pd.Series([headline]), stop)
    ticker_info = parse_header1(cleaned_headline[0], tickers, None)
    
    headline_sentiment = {'{}_days'.format(horizon): [] for horizon in horizons}
    
    if len(ticker_info) == 0:
        return None
    try :
        for ls in ticker_info:
            ticker_key = ls[3]

            if ticker_key == 'obj':
                ticker = ls[2]
                svo = [ls[0],ls[1]]
                svo_vec = [np.concatenate((model.wv[ls[1].upper()], model.wv[ls[0].upper()]))]

            elif ticker_key == 'sub':
                ticker = ls[0]
                svo = [ls[1],ls[2]]
                svo_vec = [np.concatenate((model.wv[ls[1].upper()], model.wv[ls[2].upper()]))]

            else:
                # Skip if this is a subobj ticker label
                continue

            neighbours = clf.kneighbors(svo_vec, n_neighbours)
            clusters_list = list(set([labels[neighbours[1][0][i]] for i in range(n_neighbours)]) & set(list(parsed_sentiments.keys())))
            distances_list = [neighbours[0][0][i] for i in range(n_neighbours)]

            new_clusters_list = []
            new_distances_list = []

            for index, label in enumerate(clusters_list):
                if label in parsed_sentiments:
                    new_clusters_list.append(clusters_list[index])
                    new_distances_list.append(distances_list[index])


            distances_list = new_distances_list
            clusters_list = new_clusters_list
            
            #### NOTE! This is not correct

            for horizon in horizons:
                if 0 in distances_list :
                    headline_sentiment['{}_days'.format(horizon)] = parsed_sentiments[clusters_list[0]]['{}_days'.format(horizon)]
                else:
                    weights = np.array(list(map(lambda x: x**(-p), np.array(distances_list))))
                    sentiments = [parsed_sentiments[i]['{}_days'.format(horizon)] for i in clusters_list]
                    
                    if len(sentiments) == 0:
                        print(headline)
                    
                    if len(weights) == 0:
                        print(headline)
                        
                    headline_sentiment['{}_days'.format(horizon)] = np.average(sentiments, weights = weights)
                    
                    print(headline_sentiment['{}_days'.format(horizon)])
                    
                    if np.isnan(headline_sentiment['{}_days'.format(horizon)]):
                        print('was nan before')
                        headline_sentiment['{}_days'.format(horizon)] = None

    except:
        print('exception')
        return None
    
    all_empty = True
    for val in headline_sentiment.values():
        if type(val) == np.float64:
            all_empty = False
            break
            
    if all_empty:
        print('all empty')
        return None

    return headline_sentiment


In [None]:
testing_output = []

for index in test_data.index:
    date = test_data.loc[index, 'Date']
    headline = test_data.loc[index, 'Headlines']
    
    if test_data.loc[index, 'Tick_Type'] == 'obj':
        ticker = test_data.loc[index, 'Obj']
    if test_data.loc[index, 'Tick_Type'] == 'sub':
        ticker = test_data.loc[index, 'Sub']
#     else:
#         ticker = None
    info = {
        'date': date,
        'headline': headline,
        'ticker' : ticker
    }
    
#     horizon_sentiments = inverse_distance_sentiment(
#         headline = headline,
#         parsed_sentiments = parsed_sentiments,
#         tickers = tickers,
#         labels = labels,
#         horizons = horizons
#     )

    horizon_sentiments = closest_cluster_sentiment(
        headline = headline,
        tickers = tickers
    )
    
    if horizon_sentiments is not None:
        for horizon in horizons:
            info['{}_days'.format(horizon)] = horizon_sentiments['{}_days'.format(horizon)]
            
        testing_output.append(info)

In [None]:
testing_df = pd.DataFrame(testing_output)
testing_df

# print(len(testing_df))

In [None]:
testing_df[testing_df['headline']=='delta take percent stake latam airlines']

In [None]:
for horizon in horizons:
    for i in testing_df.index:
        date = testing_df.loc[i,'date']
        ticker = testing_df.loc[i,'ticker']
        ticker = ticker.replace('__','')
        cumul = lookup_cumul_returns(ticker, date, horizon)
        #print(cumul)
        testing_df.loc[i,str(horizon)+'_days_actual'] = cumul
        
testing_df = testing_df.dropna()
testing_df

In [None]:
for horizon in horizons:
    testing_df[str(horizon)+'_dir_comparison'] = testing_df[str(horizon)+'_days_actual']*testing_df[str(horizon)+'_days']
  

testing_df 

In [None]:
direction_test = testing_df
direction_result_dict ={}
for horizon in horizons :
    pos = len(direction_test.loc[direction_test[str(horizon)+'_dir_comparison']>0,:])
    tot = len(direction_test)
    direction_result_dict[horizon]=pos/tot
    
direction_result_dict
    

In [None]:
magnitude_test = testing_df

for horizon in horizons :
    magnitude_test[str(horizon)+'_rel_gap']= abs(magnitude_test[str(horizon)+'_days_actual']-magnitude_test[str(horizon)+'_days'])/magnitude_test[str(horizon)+'_days_actual']
    
magnitude_test