In [1]:
import nltk
import random
import spacy
from spacy import displacy
from collections import Counter
import typing
from typing import List, Dict 
import tqdm
import json
import pandas as pd
import pickle
import string 
import re

In [2]:
def load_data(file):
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)

def save_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [3]:
# download the dataset

nltk.download('reuters')
nltk.download('punkt')
from nltk.corpus import reuters

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\aubin\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aubin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Show a random sample from the dataset

files = reuters.fileids()

In [6]:
sample = random.sample(files, 1)

print(reuters.raw(sample))

EQUION CORP &lt;EQUI> 2ND QTR JAN 31 NET
  Oper shr 19 cts vs 18 cts
      Oper net 951,902 vs 987,860
      Revs 19.0 mln vs 17.1 mln
      Six mths
      Oper shr 26 cts vs 35 cts
      Oper net 1,332,273 vs 2,502,868
      Revs 33.6 mln vs 29.2 mln
      Note: Oper net excludes tax credits of 897,925 dlrs vs
  841,511,dlrs for qtr and 1,306,860 dlrs vs 2,132,073 dlrs for
  six mths.
      Note: Year-ago results restated to reflect change in
  accounting principle effective August one, 1985.
  




In [22]:
sample = random.sample(files, 1)

print(reuters.raw(sample))

WHIRLPOOL CORP &lt;WHR> 1ST QTR NET
  Shr 66 cts vs 67 cts
      Net 48,700,000 vs 49,300,000
      Sales 961.0 mln vs 870.6 mln
      Avg shrs 74,123,837 vs 73,374,398
  




In [26]:
sample = random.sample(files, 1)

print(reuters.raw(sample))

U.S. BUSINESS LOANS FELL 822 MLN DLRS IN MARCH 25 WEEK, FED SAYS

  U.S. BUSINESS LOANS FELL 822 MLN DLRS IN MARCH 25 WEEK, FED SAYS
  




In [29]:
sample = random.sample(files, 1)

print(reuters.raw(sample))

U.K. MONEY MARKET DEFICIT FORECAST AT 450 MLN STG
  The Bank of England said it forecast a
  shortage of around 450 mln stg in the money market today.
      Among the main factors affecting liquidity, bills maturing
  in official hands and the take-up of treasury bills will drain
  some 650 mln stg while a rise in note circulation will take out
  around 30 mln stg.
      Partly offsetting these outflows, bankers' balances above
  target and exchequer transactions will add some 200 mln stg and
  35 mln stg to the system respectively.
  




In [9]:
# There are 4 main types of articles. The first 3 mentioned above do not hold any information about relation but simply about revenues, both quarterly and yearly.
# For a prelimnary analysis we will be ignoring these. One easy way of ignoring these is by excluding all articles that contain the word "qtr", the other one is where 
# "vs" is mentioned at least twice.

In [5]:
suitable_articles = []
suitable_ids = []

for article_id in files:
    if reuters.raw(article_id).count("vs") <= 1:
        suitable_articles.append(reuters.raw(article_id))
        suitable_ids.append(article_id)

In [6]:
sample = random.sample(suitable_articles, 1)

print(sample)

['YEUTTER SAYS U.S. BUDGET DEFICIT REDUCTION KEY TO TRADE DEFICIT SOLUTION\n\n  YEUTTER SAYS U.S. BUDGET DEFICIT REDUCTION KEY TO TRADE DEFICIT SOLUTION\n  \n\n']


In [7]:
# This gives us a list of 9134 articles that are suitable to be analyzed for relations between entities.

len(suitable_articles)

8399

In [10]:
nlp = spacy.load('en_core_web_lg')
txt = suitable_articles[0]
doc = nlp(txt)

In [11]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [12]:
def extract_proper_nouns(doc) -> List[str]:
    # grab list of indexes proper noun positive matches
    pos = [tok.i for tok in doc if tok.pos_ == "PROPN"]
    consecutives = []
    current = []

    # loop over proper noun position and write to the data
    for elt in pos:
        if len(current) == 0:
            current.append(elt)
        else:
            if current[-1] == elt - 1:
                current.append(elt)
            else:
                consecutives.append(current)
                current = [elt]
    if len(current) != 0:
        consecutives.append(current)
    return [doc[consecutive[0]:consecutive[-1]+1] for consecutive in consecutives]

In [31]:
# data cleaning functionality
def clean_sentence(sentence):
    lemmas=[]
    for word in sentence:
        lemmas.append(word.lemma_)
    lemma_string = ' '.join(lemmas)
    txt_no_return = lemma_string.replace("\n", " ") # remove "/n"
    text_without_punct = txt_no_return.translate(str.maketrans('', '', string.punctuation)) # remove punctuation
    text_one_space = re.sub(' +', ' ', text_without_punct) # remove multiple spaces
    text_clean = text_one_space.strip()
    return text_clean

def get_doc(article):
    doc = nlp(article)
    sentences = []
    for sent in doc.sents:
        sentences.append(clean_sentence(sent))
    return sentences

In [36]:
# There are 4 main types of articles. The first 3 mentioned above do not hold any information about relation but simply about revenues, both quarterly and yearly.
# For a prelimnary analysis we will be ignoring these. One easy way of ignoring these is by excluding all articles that contain the word "qtr", the other one is where 
# "vs" is mentioned at least twice.

suitable_articles = {}                                   
                                                          
for article_id in tqdm.tqdm(files):                                  
    if reuters.raw(article_id).count("vs") <= 1:   #This gives us a list of 9134 articles that are suitable to be analyzed for relations between entities.       
        suitable_articles[article_id] = get_doc(reuters.raw(article_id))

100%|██████████| 10788/10788 [19:54<00:00,  9.03it/s] 


In [38]:
filehandler = open("docs.obj","wb")
pickle.dump(suitable_articles, filehandler)
filehandler.close()

In [7]:
file = open("docs.obj",'rb')
cleaned_articles = pickle.load(file)
file.close()

In [44]:
nouns_per_article = {}

for key, cleaned_text in tqdm.tqdm(suitable_articles.items()):
    text = ""
    for sentence in cleaned_text:
        text += " " + sentence
    doc = nlp(text)
    nouns_per_article[key] = extract_proper_nouns(doc)

100%|██████████| 8399/8399 [06:40<00:00, 20.95it/s]


In [47]:
nouns_str_per_article = {}

for key,val in nouns_per_article.items():
    nouns_str_per_article[key] = [v.text for v in val]

In [49]:
filehandler = open("nouns.obj","wb")
pickle.dump(nouns_str_per_article, filehandler)
filehandler.close()

In [8]:
file = open("nouns.obj",'rb')
nouns_str_per_article = pickle.load(file)
file.close()

In [9]:
# Making a prelimanary count of all the proper nouns I can find. I couldn't track a working spacy method so I converted to string and used native Counter.
# Also made 2 lists, one containing all unique values and 1 containing all values, both as strings.

all_entities = []
counted_entities = Counter()
unique_entities = set()

for key, val in nouns_str_per_article.items():
    counted_entities += Counter(val)
    all_entities.append(val)

all_entities = [item for sublist in all_entities for item in sublist]
unique_entities = set(all_entities)

In [10]:
# Save the ones with a minimum of 3 links

counted_entities_thresh = Counter({k: c for k, c in counted_entities.items() if c >= 2})

In [33]:
# Following method is a slow way of calculating the similarities between the elements.
"""
from difflib import SequenceMatcher

similarity = []
unique_list = list(unique_entities)

#for val_1 in counted_entities.most_common()[0]:
for val_2 in tqdm.tqdm(unique_list):
    similarity.append(SequenceMatcher(None, str(counted_entities.most_common()[0][0]), str(val_2)).ratio())
"""

'\nfrom difflib import SequenceMatcher\n\nsimilarity = []\nunique_list = list(unique_entities)\n\n#for val_1 in counted_entities.most_common()[0]:\nfor val_2 in tqdm.tqdm(unique_list):\n    similarity.append(SequenceMatcher(None, str(counted_entities.most_common()[0][0]), str(val_2)).ratio())\n'

In [30]:

from Levenshtein import ratio

lev_similarity = x = [[] for i in range(len(counted_entities_thresh))]
unique_list = [val for val in counted_entities_thresh]

for idx, val_1 in enumerate(counted_entities_thresh):
    print(idx/len(counted_entities_thresh),end="\r")
    for val_2 in counted_entities_thresh:
        lev_similarity[idx].append(ratio(val_1, val_2))


0.99989045897688694655

In [58]:
y = 267

for idx in sorted(range(len(lev_similarity[y])), key=lambda x: lev_similarity[y][x])[-20:]:
    print(unique_list[idx])
    print(lev_similarity[y][idx])


monday
0.5
Beaumont Texas
0.5
Calmon
0.5
La Paz
0.5
National Pizza
0.5
Ramon del Rosario
0.5185185185185185
Raymond Stone
0.5217391304347826
Rha Woong Bae
0.5217391304347826
Richmond Hill
0.5217391304347826
San Diego
0.5263157894736842
van Driel
0.5263157894736842
Van Driel
0.5263157894736842
Ron Paice
0.5263157894736842
Tasmania
0.5555555555555556
Diaz
0.5714285714285714
Romania
0.5882352941176471
Ramirez
0.5882352941176471
Raymond
0.5882352941176471
Raimond
0.5882352941176471
Ramon Diaz
1.0


In [18]:
def find_links(search_term, threshold_links):
    counted = Counter()

    for _, ent in nouns_str_per_article.items():
        if search_term in [x for x in ent]:
            counted += Counter([x for x in ent])

    counted = Counter({k: c for k, c in counted.items() if c >= threshold_links})

    return counted.most_common()

In [37]:
def save_links(links, name):
    df = pd.DataFrame(links)
    df.to_csv(name)
    return df

In [20]:
find_links('Multifoods', 3)

[('Multifoods', 5), ('Bregman Partners', 4)]

In [26]:
find_links('Harcourt', 3)

[('Harcourt', 44),
 ('Harper', 23),
 ('Row', 9),
 ('Robert Maxwell', 6),
 ('Harcourt Brace Jovanovich Inc', 6),
 ('Maxwell', 6),
 ('BPCC', 6),
 ('Reed', 6),
 ('HARCOURT', 5),
 ('June', 5),
 ('Harcourt Brace', 4),
 ('New York', 3),
 ('Printing', 3),
 ('stg', 3),
 ('HARPER', 3),
 ('March', 3)]

In [266]:
find_links('Crazy Eddie', 3)

[('Crazy Eddie', 40),
 ('Entertainment Marketing', 15),
 ('Crazy Eddie Inc', 10),
 ('CRAZY EDDIE', 5),
 ('Antar', 5),
 ('June', 5),
 ('April', 4),
 ('Zinn', 4),
 ('lt;CRZY', 3),
 ('Eddie', 3),
 ('Entertainment Marketing Inc', 3),
 ('lt;EM', 3),
 ('Entertainment', 3),
 ('Belzberg', 3),
 ('May', 3),
 ('dlrs', 3),
 ('SEC', 3)]

In [206]:
find_links('Delta', 3)

[('Delta', 18),
 ('Western', 7),
 ("O'Connor", 4),
 ('DELTA', 3),
 ('lt;DAL', 3),
 ('lt;WAL', 3),
 ('Appeals', 3),
 ('Supreme Court', 3),
 ('PNB', 3),
 ('Toyota', 3),
 ('April', 3)]

In [43]:
find_links('Satoshi Sumita', 4)

[('Japan', 52),
 ('Sumita', 25),
 ('Paris', 24),
 ('Bank', 24),
 ('Satoshi Sumita', 16),
 ('U.S.', 13),
 ('April', 11),
 ('SUMITA', 10),
 ('Canada', 7),
 ('Britain', 7),
 ('Group', 6),
 ('Dealers', 6),
 ('France', 6),
 ('Bundesbank', 5),
 ('February', 5),
 ('JAPAN', 5),
 ('Baker', 4),
 ('Washington', 4),
 ('Lower House Budget Committee', 4),
 ('Parliament', 4),
 ('Plaza', 4),
 ('Tokyo', 4),
 ('U.S. Treasury Secretary James Baker', 4),
 ('West Germany', 4)]

In [44]:
df = save_links(find_links('Satoshi Sumita', 4), 'Satoshi.csv')

In [45]:
df["linked"] = ["Satoshi Sumita" for x in range(len(df))]

In [46]:
from pyvis.network import Network
import pandas as pd

got_net = Network(height='100%', width='100%', bgcolor='#222222', font_color='white')

# set the physics layout of the network
got_net.barnes_hut()
got_data = df

sources = got_data['linked']
targets = got_data[0]
weights = got_data[1] * 50

edge_data = zip(sources, targets, weights)

for e in edge_data:
    src = e[0]
    dst = e[1]
    w = e[2]

    got_net.add_node(src, src, title=src)
    got_net.add_node(dst, dst, title=dst)
    got_net.add_edge(src, dst, value=w)

neighbor_map = got_net.get_adj_list()

# add neighbor data to node hover data
for node in got_net.nodes:
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbor_map[node['id']])
    node['value'] = len(neighbor_map[node['id']])

got_net.show('links.html')

In [8]:

from Levenshtein import ratio

lev_similarity = []
unique_list = list(unique_entities)
lev_similarity.append(ratio("Crazy Eddie Inc", "CRAZY EDDY"))
"""
#for val_1 in counted_entities.most_common()[0]:
for val_2 in tqdm.tqdm(unique_list):
    lev_similarity.append(ratio(str(counted_entities_thresh.most_common()[0][0]), str(val_2)))
"""

NameError: name 'unique_entities' is not defined