In [1]:
import spacy
from spacy.tokens import Token
from spacy import displacy
import pandas as pd
import pickle
import random
import tqdm
import numpy as np



In [3]:
all_verbs = []

In [4]:
file = open("../stored_data/articles_with_tickers.obj",'rb')
docs = pickle.load(file)
file.close()

In [5]:
nlp = spacy.load('../model/pipeline')

In [6]:
filtered_entities = [
    'EVENT',
    'FAC',
    'GPE',
    'LAW',
    'LOC',
    'NORP',
    'ORG',
    'PERSON',
    'PRODUCT',
    'WORK_OF_ART',
    'COMPANY',
    'COMMODITY'
]

In [7]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [
    {"ENT_TYPE": {'IN': filtered_entities}}, 
    {"ENT_TYPE": {'NOT_IN': filtered_entities}, 'OP': '*'}, 
    {'POS': 'VERB'},
    {"ENT_TYPE": {'NOT_IN': filtered_entities}, 'OP': '*'}, 
    {"ENT_TYPE": {'IN': filtered_entities}}]
matcher.add("any_verb", [pattern])

In [8]:
all_verbs = []
e2e_relationship = []

for id, file in tqdm.tqdm(docs.items()):

    doc = nlp(file)

    matches = matcher(doc)
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get string representation
        span = doc[start:end]  # The matched span

        is_passive = False

        for token in span:
            if token.pos_ == "VERB":
                verb = token.lemma_
                all_verbs.append(token)
            if "pass" in token.dep_:
                is_passive = True

        if is_passive:
            for token in span:
                if "subjpass" in token.dep_:
                    obj = token
                    obj_ent = token.ent_type_
                elif "obj" in token.dep_:
                    subj = token
                    subj_ent = token.ent_type_
        else:
            for token in span:
                if "obj" in token.dep_:
                    obj = token
                    obj_ent = token.ent_type_
                elif "subj" in  token.dep_:
                    subj = token
                    subj_ent = token.ent_type_


        e2e_relationship.append([subj, subj_ent, verb, obj, obj_ent])
        

100%|██████████| 8399/8399 [03:54<00:00, 35.81it/s]


In [9]:
len(e2e_relationship)

47362

In [10]:
e2e_relationship_clean = []

for rel in e2e_relationship:
    if (rel[1] != '') and (rel[-1] != ''):
        e2e_relationship_clean.append(rel)

print(len(e2e_relationship_clean))

5241


In [11]:
df = pd.DataFrame(e2e_relationship_clean, columns=["subj", "subj_label", "verb", "obj", "obj_label"])

In [12]:
df_id = pd.read_csv('../database/entities.csv')

In [13]:
subj_idx = []
obj_idx = []

for idx, row in df.iterrows():
    subj = row['subj'].text.lower()
    obj = row['obj'].text.lower()

    idx = df_id.loc[df_id.name == subj]['id'].values
    try:
        idx = idx[0]
    except IndexError:
        idx = np.nan
        
    subj_idx.append(idx)

    idx = df_id.loc[df_id.name == obj]['id'].values
    try:
        idx = idx[0]
    except IndexError:
        idx = np.nan
    obj_idx.append(idx)

In [14]:
df['subj_id'] = subj_idx
df['obj_id'] = obj_idx

In [15]:
df.to_csv('../database/relationships.csv')

In [16]:
df = pd.read_csv('../database/relationships.csv', index_col=0)

In [17]:
df = df.dropna()

In [18]:
df = df.astype({"subj_id": int, "obj_id": int})

In [19]:
df.to_csv('../database/relationships.csv')

In [20]:
used_verbs = df['verb'].to_list()

In [21]:
from nltk.corpus import wordnet

In [22]:
synonyms = []
antonyms = []

dictSynonyms = {}
synonyms = [[] for x in range(len(set(used_verbs)))]

for idx, verb in enumerate(set(used_verbs)):

    dictSynonyms[verb] = []

    for syn in wordnet.synsets(verb):
        for l in syn.lemmas():
            dictSynonyms[verb].append(l.name())
            synonyms[idx].append(l.name())

In [23]:
(len(set(used_verbs)))

465

In [41]:
clusters = {}

for idx, verb_1 in enumerate(list(set(used_verbs))):
    print(idx, len(set(used_verbs)), end="\r")

    clusters[verb_1] = []

    for verb_2 in list(set(used_verbs)):

        if verb_2 in dictSynonyms[verb_1]:
            clusters[verb_1].append(verb_2)

464 465 465

In [42]:
only_cluster = []

for key, cluster in clusters.items():
    only_cluster.append(cluster)

In [43]:
only_cluster

[['become', 'go', 'get', 'turn'],
 ['finish', 'complete', 'end', 'close', 'stop'],
 ['locate', 'settle', 'place'],
 ['welcome', 'receive'],
 ['cause', 'make', 'get', 'do', 'have'],
 ['resume'],
 ['notify'],
 [],
 ['accuse', 'charge'],
 ['limit', 'fix', 'set'],
 ['withdraw', 'take', 'remove'],
 ['stress'],
 ['register', 'show', 'record', 'file'],
 ['want', 'require'],
 ['win', 'acquire', 'gain', 'succeed'],
 ['criticize', 'criticise'],
 ['operate', 'go', 'control', 'work', 'run'],
 ['allocate'],
 ['table', 'postpone', 'remit', 'defer'],
 ['tender', 'bid', 'offer'],
 ['issue', 'supply', 'release', 'emerge', 'return', 'publish', 'take', 'cut'],
 ['choose', 'take', 'prefer'],
 [],
 ['raise', 'promote', 'rise', 'produce', 'lift', 'grow', 'enhance', 'prove'],
 ['establish', 'launch', 'show', 'make', 'build', 'base', 'give', 'prove'],
 ['reiterate', 'restate', 'repeat'],
 ['arrange', 'put', 'do', 'set'],
 ['support', 'confirm', 'hold', 'keep', 'back', 'defend'],
 ['ban'],
 ['decrease', 'fall'

In [25]:
cluster_matrix = np.array(only_cluster)

  cluster_matrix = np.array(only_cluster)


In [26]:
cluster_matrix[np.where(cluster_matrix==None)] = 'None'

In [27]:
cluster_matrix

array([list(['wage', 'pay']), list(['narrow']),
       list(['operate', 'go', 'work', 'control', 'run']),
       list(['produce', 'get', 'raise', 'acquire', 'grow', 'make', 'develop']),
       list(['accord', 'agree', 'grant']), list(['elect']),
       list(['poll']), list(['smuggle']),
       list(['accept', 'take', 'have', 'admit']), list(['deny']),
       list(['harvest']),
       list(['drop', 'dismiss', 'fall', 'spend', 'miss', 'throw']),
       list(['donate']), list(['anger']),
       list(['shut', 'exclude', 'close']), list(['baulk']),
       list(['blast', 'shell']),
       list(['press', 'push', 'urge', 'bid', 'fight']),
       list(['feel', 'find', 'look']),
       list(['base', 'mean', 'establish']), list([]),
       list(['report', 'study', 'describe', 'cover']),
       list(['invest', 'put', 'commit', 'place']),
       list(['be', 'comprise', 'follow', 'represent', 'live', 'exist']),
       list(['report', 'study', 'take', 'work', 'consider', 'examine']),
       list(['co

In [28]:
from sklearn.metrics import pairwise_distances

In [29]:
uniques = np.unique(cluster_matrix)
X = np.searchsorted(uniques, cluster_matrix)

In [None]:
distance_matrix = pairwise_distances(X, metric="jaccard")

In [None]:
from sklearn.cluster import DBSCAN
clustering = DBSCAN(eps=0.001, min_samples=1).fit(distance_matrix)

max(clustering.labels_)

19

In [None]:
for x in range(max(clustering.labels_)):
    print(f"cluster {x}")
    for idx, val in enumerate(clustering.labels_):
        if val == x:
            print(used_verbs[idx])

cluster 0
have
rise
depend
convince
intervene
allow
release
agree
do
include
follow
rise
curb
supply
have
negotiate
buy
have
cut
say
come
market
revise
buy
say
become
tell
tell
cover
prove
hold
post
suspend
acquire
say
say
pursue
publicise
include
thwart
cluster 1
have
say
tell
say
have
say
feed
visit
tell
say
sell
value
expect
warn
neighbour
fall
say
start
confirm
follow
buy
have
base
reach
tell
add
explore
leave
resume
merge
affiliate
complete
say
produce
preside
capture
meet
say
carry
lead
lead
leave
raise
say
raise
sell
tell
say
say
return
beat
say
say
cluster 2
say
combine
sell
leave
persuade
contact
control
live
reiterate
tell
buy
seek
tell
head
discuss
look
persuade
ask
face
say
groundnutsee
concern
tell
say
begin
concentrate
tell
give
announce
think
ask
sell
tell
raise
impose
begin
extend
sell
report
complete
tell
deal
buy
ask
reply
smuggle
say
show
ask
import
discredit
run
develop
say
base
say
announce
gross
take
tell
industrialise
concern
see
say
talk
usda
accept
usda
benefit

In [59]:
only_cluster

[['become', 'go', 'get', 'turn'],
 ['finish', 'complete', 'end', 'close', 'stop'],
 ['locate', 'settle', 'place'],
 ['welcome', 'receive'],
 ['cause', 'make', 'get', 'do', 'have'],
 ['resume'],
 ['notify'],
 [],
 ['accuse', 'charge'],
 ['limit', 'fix', 'set'],
 ['withdraw', 'take', 'remove'],
 ['stress'],
 ['register', 'show', 'record', 'file'],
 ['want', 'require'],
 ['win', 'acquire', 'gain', 'succeed'],
 ['criticize', 'criticise'],
 ['operate', 'go', 'control', 'work', 'run'],
 ['allocate'],
 ['table', 'postpone', 'remit', 'defer'],
 ['tender', 'bid', 'offer'],
 ['issue', 'supply', 'release', 'emerge', 'return', 'publish', 'take', 'cut'],
 ['choose', 'take', 'prefer'],
 [],
 ['raise', 'promote', 'rise', 'produce', 'lift', 'grow', 'enhance', 'prove'],
 ['establish', 'launch', 'show', 'make', 'build', 'base', 'give', 'prove'],
 ['reiterate', 'restate', 'repeat'],
 ['arrange', 'put', 'do', 'set'],
 ['support', 'confirm', 'hold', 'keep', 'back', 'defend'],
 ['ban'],
 ['decrease', 'fall'

In [34]:
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return float(len(s1.intersection(s2)) / len(s1.union(s2)))

In [35]:
def generate_self_similarity_matrix(cluster_lists):

    count = 0

    self_similarity_matrix = [[] for x in range(len(cluster_lists))]

    for cluster_1 in cluster_lists:
        for cluster_2 in cluster_lists:
            try:
                self_similarity_matrix[count].append(jaccard_similarity(cluster_1, cluster_2))
            except ZeroDivisionError:
                self_similarity_matrix[count].append([])

        count += 1
    
    return self_similarity_matrix

In [36]:
def jaccard_clustering(clustered_list, thresh):

    clustered_results = []

    self_similarity_matrix = generate_self_similarity_matrix(clustered_list)

    for id, row in enumerate(self_similarity_matrix):
        
            indices = []
            for val in row:
                try:
                    if (val >= thresh) and (val != 1.0):
                        indices.append(row.index(val))
                except:
                    pass

            result = [clustered_list[y] for y in indices]
            if result == []:
                clustered_results.append(list(set(clustered_list[id])))
            else:
                clustered_results.append(list(set([item for sublist in result for item in sublist])))

    return clustered_results


In [37]:
def clean_clusters(clustered_results):
    cleaned_clusters = []

    verbs_done = []

    for row_1 in results_clusters:
        if row_1 != []:
            new_row = row_1.copy()
            for row_2 in results_clusters:
                if (row_2 != []) and (row_2[0] not in verbs_done):
                    if row_1[0] == row_2[0]:
                        verbs_done.append(row_1[0])
                        for el in row_2:
                            new_row.append(el)
            cleaned_clusters.append(list(set(new_row)))

    return cleaned_clusters

In [38]:
def merge(lists, results=None):

    if results is None:
        results = []

    if not lists:
        return results

    first = lists[0]
    merged = []
    output = []

    for li in lists[1:]:
        for i in first:
            if i in li:
                merged = merged + li
                break
        else:
            output.append(li)

    merged = merged + first
    results.append(list(set(merged)))

    return merge(output, results)

In [115]:
thresh = 0.4
results_clusters = jaccard_clustering(only_cluster, thresh)


In [116]:
len(results_clusters)

465

In [117]:
results_clusters

[['become', 'go', 'get', 'turn'],
 ['finish', 'stop', 'shut', 'end', 'close', 'near'],
 ['locate', 'resolve', 'settle', 'fall'],
 ['receive', 'welcome'],
 ['get', 'cause', 'make', 'have', 'do'],
 ['resume'],
 ['notify'],
 [],
 ['accuse', 'charge'],
 ['get', 'fix', 'make', 'set', 'cook', 'limit'],
 ['remove', 'hit', 'withdraw', 'take'],
 ['stress'],
 ['register', 'show', 'charge', 'enter', 'record', 'file', 'book'],
 ['want', 'require'],
 ['succeed', 'follow', 'win'],
 ['criticise', 'criticize'],
 ['run', 'operate', 'work', 'control', 'go'],
 ['allocate'],
 ['remit', 'postpone', 'table', 'submit', 'defer'],
 ['bid', 'call', 'press', 'tender', 'invite', 'offer'],
 ['take', 'supply', 'emerge', 'publish', 'release', 'return', 'cut', 'issue'],
 ['favor', 'choose', 'prefer'],
 [],
 ['prove', 'lift', 'develop', 'boost', 'raise', 'grow', 'rise'],
 ['prove', 'give', 'make', 'establish', 'launch', 'base', 'build', 'show'],
 ['restate', 'repeat', 'reiterate', 'double'],
 ['put', 'fix', 'set', 'pl

In [118]:
#results_clusters = merge(results_clusters)

In [119]:
#len(results_clusters)

In [120]:
relationship_db = {
    "undefined" : []
}

count = 0
for x in results_clusters:
    if len(x) > 1:
        count += 1
        relationship_db[x[0]] = x
    else:
        relationship_db["undefined"].append(x)

print(count)

334


In [121]:
relationship_db.keys()

dict_keys(['undefined', 'become', 'finish', 'locate', 'receive', 'get', 'accuse', 'remove', 'register', 'want', 'succeed', 'criticise', 'run', 'remit', 'bid', 'take', 'favor', 'prove', 'restate', 'put', 'confirm', 'decrease', 'encourage', 'tie', 'mention', 'transport', 'send', 'establish', 'allow', 'contact', 'suggest', 'remain', 'require', 'adopt', 'cause', 'damage', 'beat', 'repay', 'describe', 'expand', 'gross', 'place', 'break', 'acquire', 'reach', 'reject', 'dismiss', 'disclose', 'move', 'make', 'accord', 'exempt', 'exclude', 'say', 'contain', 'tender', 'give', 'permit', 'follow', 'award', 'combine', 'talk', 'keep', 'oppose', 'lead', 'provide', 'maintain', 'represent', 'publish', 'strike', 'record', 'ensure', 'examine', 'forecast', 'orient', 'force', 'emerge', 'look', 'charge', 'relate', 'believe', 'improve', 'trade', 'urge', 'grant', 'view', 'state', 'gain', 'happen', 'inspect', 'estimate', 'fix', 'wage', 'post', 'persuade', 'acknowledge', 'have', 'wait', 'insist', 'switch', 'pro

In [129]:
relationship_db['inspect']

['inspect', 'impose', 'call', 'see', 'visit']

In [33]:
df

Unnamed: 0,subj,subj_label,verb,obj,obj_label,subj_id,obj_id
0,taiwan,GPE,have,u.s,GPE,86,1128
1,korea,GPE,have,u.s,GPE,522,1128
4,indonesia,GPE,say,rotterdam,GPE,97,98
5,oil,COMMODITY,import,indonesia,GPE,19746,97
11,komatsu,PERSON,tell,reuters,ORG,2196,2186
...,...,...,...,...,...,...,...
5232,ghana,GPE,want,cocoa,COMMODITY,190,19756
5233,siemens,ORG,near,gm,COMPANY,8121,3527
5236,inc,ORG,say,piedmont,COMPANY,3706,1116
5238,bundesbank,ORG,say,ag,ORG,2206,8829


In [150]:
relationships = []

for idx, row in df.iterrows():
    found = False
    for key, cluster in relationship_db.items():
        if (row['verb'] in cluster) and (found == False):
            found = True
            res = key

    if not res:
        res ='None'

    relationships.append(res)


In [151]:
relationships

['receive',
 'receive',
 'say',
 'say',
 'say',
 'examine',
 'encourage',
 'decline',
 'face',
 'say',
 'tighten',
 'say',
 'combine',
 'run',
 'lose',
 'lose',
 'lose',
 'forecast',
 'put',
 'trade',
 'run',
 'trade',
 'trade',
 'provide',
 'put',
 'persuade',
 'contact',
 'contact',
 'contact',
 'persuade',
 'persuade',
 'grant',
 'say',
 'contain',
 'say',
 'inspect',
 'inspect',
 'acknowledge',
 'lead',
 'restate',
 'forecast',
 'say',
 'say',
 'buy',
 'buy',
 'buy',
 'talk',
 'say',
 'say',
 'trade',
 'bid',
 'be',
 'provide',
 'receive',
 'adopt',
 'say',
 'allow',
 'say',
 'grant',
 'inspect',
 'point',
 'point',
 'allow',
 'want',
 'register',
 'say',
 'say',
 'say',
 'say',
 'say',
 'say',
 'say',
 'become',
 'require',
 'receive',
 'persuade',
 'say',
 'say',
 'want',
 'accord',
 'buy',
 'say',
 'face',
 'register',
 'want',
 'describe',
 'say',
 'say',
 'say',
 'say',
 'buy',
 'tender',
 'tender',
 'relate',
 'say',
 'describe',
 'say',
 'cause',
 'say',
 'say',
 'say',
 'es

In [154]:
df['relationship'] = relationships

In [155]:
df.to_csv('../database/relationships.csv')

In [156]:
len(df['relationship'].unique())

123