# Relation extraction
## Table of contents<a name=contents></a>
0. [Downloads](#downloads)
1. [Packages](#packages)
2. [Text](#text)
3. [NLP pipes](#pipes)
4. [Dependency trees](#trees)
5. [Help functions](#functions)
6. [Trials](#trials)
    1. [Sentences](#sent)
        1. [Passive](#passive)
        2. [Two verbs](#2verbs)
        3. [Two subsentences](#2subsent)
        4. [Pronominal subsentence](#pron_subsent)
    2. [Articles](#articles)
        1. [Crude category: test/15063 (2nd of crude's category)](#test/15063)
        2. [Reuters website: merger](#merger)
    3. [Categories](#categories)
        1. [Crude](#crude)

## 0. Downloads <a name=downloads></a>

In [None]:
import nltk
nltk.download('reuters')

In [None]:
!unzip /root/nltk_data/corpora/reuters.zip -d /root/nltk_data/corpora

In [None]:
!pip install -U spacy==3.0.7

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
!pip install allennlp==2.1.0 allennlp-models==2.1.0

In [None]:
!wget "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz"

## 1. Packages <a name=packages></a>

In [1]:
import spacy
from spacy import displacy


from spacy.tokens import Token
from spacy import Language

from collections import deque

from nltk.corpus import reuters

import regex as re

import requests
from bs4 import BeautifulSoup

import pandas as pd

  return torch._C._cuda_getDeviceCount() > 0


## 2. Text

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
text = "Fujitsu, a competitor of NEC, acquired Fairchild Corp."
doc = nlp(text)

In [137]:
displacy.render(doc, style="dep",options={'compact': False, 'distance': 100})

Back to the [table of contents](#contents).

## 3. NLP pipes <a name=pipes></a>

In [5]:
Token.set_extension('ref_n', default='', force = True)
Token.set_extension('ref_t', default='', force = True)

@Language.component("init_coref")
def init_coref(doc):
    for e in doc.ents:
        if e.label_ in ['ORG', 'GOV', 'PERSON','MONEY']:
            e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
    return doc

In [None]:
Token.set_extension('ref_n', default='', force = True)
Token.set_extension('ref_t', default='', force = True)

@Language.component("init_coref")
def init_coref(doc):
    for e in doc.ents:
        if e.label_ in ['ORG', 'GOV', 'PERSON','MONEY']:
            e[0]._.ref_n, e[0]._.ref_t = e.text, e.label_
    return doc

In [6]:
def reset_pipeline(nlp, pipes):
    # remove all custom pipes
    custom_pipes = [pipe for (pipe, _) in nlp.pipeline
                    if pipe not in ['tagger', 'parser', 'ner',
                                    'tok2vec', 'attribute_ruler', 'lemmatizer']]
    for pipe in custom_pipes:
        _ = nlp.remove_pipe(pipe)
    # re-add specified pipes
    for pipe in pipes:
        if 'neuralcoref' == pipe or 'neuralcoref' in str(pipe.__class__):
            nlp.add_pipe(pipe, name='neural_coref')
        else:
            nlp.add_pipe(pipe)

    print(f"Model: {nlp.meta['name']}, Language: {nlp.meta['lang']}")
    print(*nlp.pipeline, sep='\n')
    
reset_pipeline(nlp, ['init_coref'])

Model: core_web_lg, Language: en
('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7f7021248d00>)
('tagger', <spacy.pipeline.tagger.Tagger object at 0x7f7021248c90>)
('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7f7020f2be50>)
('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7f7020e2d960>)
('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7f7020ec4690>)
('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7f7020f2bed0>)
('init_coref', <function init_coref at 0x7f70211cb320>)


Back to the [table of contents](#contents).

## 4. Dependency trees <a name=trees></a>

Dependency trees seem to be very efficient to find verbs (active or passive) and their subject and object.

In [26]:
def synonyms(term):
    """
    source: https://stackoverflow.com/questions/52910297/pydictionary-word-has-no-synonyms-in-the-api
    """
    response = requests.get('https://www.thesaurus.com/browse/{}'.format(term))
    soup = BeautifulSoup(response.text, 'html.parser')
    soup.find('section', {'class': 'css-191l5o0-ClassicContentCard e1qo4u830'})
    return [span.text.strip() for span in soup.findAll('a', {'class': 'css-1kg1yv8 eh475bn0'})] 

In [91]:
# Actually we search for the shortest path between the
# subject running through our predicate (verb) to the object.
# subject and object are organizations in our examples.

# Here are the three helper functions omitted in the book:
# - bfs: breadth first searching the closest subject/object 
# - is_passive: checks if noun or verb is in passive form
# - find_subj: searches left part of tree for subject
# - find_obj: searches right part of tree for object

def bfs(root, ent_type: str, deps:list, first_dep_only:bool=False):
    """
    
    : root: token containing the word at the left of the verb, hopefully the subject?
    : ent_type: specifies entity type (for now always called for "ORG")
    : deps: ??? ['nsubjpass', 'nsubj:pass'] ???
    : first_dep_only: 
    """
    """Return first child of root (included) that matches
    ent_type and dependency list by breadth first search.
    Search stops after first dependency match if first_dep_only
    (used for subject search - do not "jump" over subjects)"""
    # deque to ease the access to the list
    to_visit = deque([root]) # queue for bfs

    while len(to_visit) > 0:
        # the left element of the queue is given to child and deleted from the queue
        child = to_visit.popleft()
        ## print("child", child, child.dep_)
        # check if the dependency of the token was one of those provided
        if child.dep_ in deps:
            # check if the label/entity type is the same as the one provided
            if child._.ref_t == ent_type:
                return child
            #else:
            #    for 
            # explore what to do if we keep looking after the first dependency match?
            # quid for a subject with an "and"???
            elif first_dep_only: # first match (subjects)
                return None
        # check if it is a compound (adjective),
        # if the noun it describes dependency is one of those provided
        # and if it has the right entity type but only works on the first token of the entity (customized pipe)
        # why doesn't it return the whole entity then? A compound is no subject by its own...?
        # add " or child.head.head.dep_ in deps " to the second condition only if ent_type=="MONEY"
        # or use the root of the entity?
        elif child.dep_ == 'compound' and \
             (child.head.dep_ in deps or child.head.head.dep_ in deps) and \
             child._.ref_t == ent_type: # check if contained in compound
            return child
        to_visit.extend(list(child.children))
    return None

def is_passive(token):
    if token.dep_.endswith('pass'): # noun
        return True
    for left in token.lefts: # verb
        if left.dep_ == 'auxpass':
            return True
    return False

def find_subj(pred, ent_type: str, passive: bool):
    """
    Find closest subject in predicates left subtree or
    predicates parent's left subtree (recursive).
    Has a filter on organizations.
    : pred: token containing a verb
    : ent_type: specifies entity type (for now always called for "ORG")
    : passive: specifies if the verb is in the passive form
    : return: pred's subject
    """
    ## To modify to make it work for different kind of entities

    # begins with the further related word on the left of the predicate
    for left in pred.lefts:
        if passive: # if pred is passive, search for passive subject
            subj = bfs(left, ent_type, ['nsubjpass', 'nsubj:pass'], True)
        else:
            subj = bfs(left, ent_type, ['nsubj'], True)
        if subj is not None: # found it!
            return subj
    
    # if the subject is not on the left tree of the predicate,
    # the predicate's head could be another verb with the same subject
    # example: Apple is looking at buying a startup
    if pred.head != pred and not is_passive(pred): # why not just "passive" instead of is_passive(pred)?
        return find_subj(pred.head, ent_type, passive) # climb up left subtree
    else:
        return None

def find_obj(pred, ent_type, excl_prepos):
    """
    Find closest object in predicates right subtree.
    Skip prepositional objects if the preposition is in exclude list.
    Has a filter on organizations.
    : pred: token containing a verb
    : ent_type: specifies entity type (for now always called for "ORG")
    : excl_prepos: excluded prepositions
    : return: object of the predicate
    """
    
    ## To modify to make it work for different kind of entities
        
    # looks into every related token on the right of the predicate
    # until it finds an object filling the conditions
    for right in pred.rights:
        ## print("right: ",right)
        obj = bfs(right, ent_type, ['dobj', 'pobj', 'iobj', 'obj', 'obl'])
        # if an object is found,
        # it looks that its preposition is not excluded
        if obj is not None:
            if obj.dep_ == 'pobj' and obj.head.lemma_.lower() in excl_prepos: # check preposition
                continue
            return obj
    return None

def extract_rel_dep(doc,
                    pred_name:str, pred_synonyms:list=[pred_name]+synonyms(pred_name),
                    subj_ents:list=['ORG', 'GOV', 'PERSON'],
                    obj_ents:list=['ORG', 'GOV', 'PERSON','MONEY'],
                    excl_prepos=[]):
    """
    Method extracting relationship(s) (may be plural!)
    It only returns triplets!
    : doc: text to analyze
    : pred_name: predicate
    : pred_synonyms: predicate's synonyms
    : excl_prepos: prepositions which can not precede the object chosen
    : return: triplet(s) with the subject and its entity type,
              the predicate and the object and its entity type
    """
    for token in doc:
        ## print(token, token.pos_, token.lemma_)
        # looks for a verb equivalent to the predicate referred to
        if token.pos_ == 'VERB':# and token.lemma_ in pred_synonyms:
            ## print("found token: ",token)
            # saves that verb as a predicate (readability)
            # looks if it is passive
            # and then searches for the subject of the verb
            pred = token
            passive = is_passive(pred)
            ## print("passive: ",passive)
            for subj_ent in subj_ents:
                subj = find_subj(pred, subj_ent, passive)
                ## print("subject: ",subj)
                # if the subject is found, it looks for the object
                if subj is not None:
                    for obj_ent in obj_ents:
                        obj = find_obj(pred, obj_ent, excl_prepos)
                        ## print("object: ",obj)
                        if obj is not None:
                            # if there is a subject and an object,
                            # it sets the triplet in the following order:
                            # active subject, verb in active form, passive subject
                            if passive: # switch roles
                                obj, subj = subj, obj
                            yield ((subj._.ref_n, subj._.ref_t), pred, #, pred_name
                                   (obj._.ref_n, obj._.ref_t))

Back to the [table of contents](#contents).

## 5. Help functions <a name=functions></a>

In [18]:
def clean(article:str):
    pattern = re.compile("\n")
    article = re.sub(pattern,"",article)
    pattern = re.compile(" +")
    article = re.sub(pattern," ",article)
    pattern = re.compile("(\s\&lt;.{1,4}>)")
    article = re.sub(pattern,"",article)
    # pattern = re.compile(".'s")
    # article = re.sub(pattern,"'s",article) #???
    article = article.strip()
    #re.findall("(\w[^\.]*\.)",article)
    return article

In [138]:
def render(doc,style:str):
    displacy.render(doc, style=style,options={'compact': False, 'distance': 100})

In [185]:
def peek(iterable):
    try:
        first = next(iterable)
    except StopIteration:
        return None
    return iterable

In [167]:
def print_rel(doc,
              pred_name:str, pred_synonyms:str,
              subj_ents:list=['ORG', 'GOV', 'PERSON'],
              obj_ents:list=['ORG', 'GOV', 'PERSON','MONEY'],
              excl_prepos=[]):
    
    generator = extract_rel_dep(doc, pred_name, pred_synonyms, subj_ents, obj_ents, excl_prepos)
    
    try:
        for relation in generator:
            print(relation)
    except StopIteration:
        print("No relationship")
    
#    if peek(generator):
#        for relation in generator:
#            print(relation)
    
    
#if peek(extract_rel_dep(mergers_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])):
#    for relation in extract_rel_dep(mergers_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[]):
#        print(relation)
        
#generator = extract_rel_dep(mergers_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])
#print(generator)
#for relation in generator:
#    print(relation)
        
#print_rel(mergers_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

In [15]:
reuters_fileids_crude = reuters.fileids(categories=['crude'])

Back to the [table of contents](#contents).

## 6. Trials <a name=trials></a>

### A. Sentences <a name=sent></a>

#### a. Passive <a name=passive></a>

In [100]:
doc_pass = nlp("I have been married to my wife for 30 years")

In [101]:
for t in doc_pass:
    print(t,t.dep_,t.head)

I nsubj been
have aux been
been ROOT been
married acomp been
to prep married
my poss wife
wife pobj to
for prep married
30 nummod years
years pobj for


Back to the [table of contents](#contents).

#### b. Two verbs <a name=2verbs></a>

In [187]:
doc_2verbs = nlp("Apple is looking at buying Amazon for $1 billion")# dlrs") # $1 billion

In [188]:
render(doc_2verbs,"ent")

In [12]:
for elem in extract_rel_dep(doc_2verbs, pred_name="buy",pred_synonyms=["buy"]):
    print(elem)

found token:  buying
passive:  False
child Apple nsubj
subject:  Apple
right:  Amazon
child Amazon dobj
right:  for
child for prep
child dlrs pobj
child billion nummod
child 1 compound
object:  1
(('Apple', 'ORG'), 'buy', ('1 billion dlrs', 'MONEY'))


In [15]:
for t in doc_2verbs:
    print("TOKEN: ",t,"\ndepencies:",t.dep_,"\ntoken's head:", t.head)
    for child in t.children:
        print("children: \n",child)
    for left in t.lefts:
        print("lefts: \n",left)
    for right in t.rights:
        print("rights: \n",right)

TOKEN:  Apple 
depencies: nsubj 
token's head: looking
TOKEN:  is 
depencies: aux 
token's head: looking
TOKEN:  looking 
depencies: ROOT 
token's head: looking
children: 
 Apple
children: 
 is
children: 
 at
lefts: 
 Apple
lefts: 
 is
rights: 
 at
TOKEN:  at 
depencies: prep 
token's head: looking
children: 
 buying
rights: 
 buying
TOKEN:  buying 
depencies: pcomp 
token's head: at
children: 
 Amazon
children: 
 for
rights: 
 Amazon
rights: 
 for
TOKEN:  Amazon 
depencies: dobj 
token's head: buying
TOKEN:  for 
depencies: prep 
token's head: buying
children: 
 billion
rights: 
 billion
TOKEN:  $ 
depencies: quantmod 
token's head: billion
TOKEN:  1 
depencies: compound 
token's head: billion
TOKEN:  billion 
depencies: pobj 
token's head: for
children: 
 $
children: 
 1
lefts: 
 $
lefts: 
 1


Back to the [table of contents](#contents).

#### c. Two subsentences <a name=2subsent></a>

In [16]:
doc_verb_before = nlp("If I eat an apple, I drink water")

In [17]:
type(doc_verb_before)

spacy.tokens.doc.Doc

In [19]:
for t in doc_verb_before:
    print("TOKEN: ",t,"\ntoken's head:", t.head)
    if peek(t.children):
        print("Children:")
        for child in t.children:
            print(child)
    if peek(t.lefts):
        print("Lefts:")
        for left in t.lefts:
            print(left)
    if peek(t.rights):
        print("Rights:")
        for right in t.rights:
            print(right)
    print("\n")

TOKEN:  If 
token's head: eat


TOKEN:  I 
token's head: eat


TOKEN:  eat 
token's head: drink
Children:
If
I
apple
Lefts:
If
I
Rights:
apple


TOKEN:  an 
token's head: apple


TOKEN:  apple 
token's head: eat
Children:
an
Lefts:
an


TOKEN:  , 
token's head: drink


TOKEN:  I 
token's head: drink


TOKEN:  drink 
token's head: drink
Children:
eat
,
I
water
Lefts:
eat
,
I
Rights:
water


TOKEN:  water 
token's head: drink




In [20]:
next(doc_verb_before[2].children)

If

Back to the [table of contents](#contents).

#### d. Pronominal subsentence <a name=pron_subsent></a>

In [140]:
sentence = 'Chairman Gordon Cain, who previously led a leveraged buyout of Dupont\'s Conoco Inc\'s chemical business, has spent 1.1 billion dlrs since January to buy seven petrochemical plants along the Texas Gulf Coast.'
doc_sentence = nlp(sentence)
render(doc_sentence, style="dep")

Back to the [table of contents](#contents).

### B. Articles <a name=articles></a>

#### a. Crude category: test/15063 (2nd of crude's category) <a name=test/15063></a>

In [106]:
reuters_fileids_crude[1]

'test/15063'

In [102]:
article = reuters.raw(reuters_fileids_crude[1])

In [111]:
cleaned_article_title = clean(article)

In [109]:
article = 'Cheap oil feedstocks, the weakened U.S. dollar and a plant utilization rate approaching 90 pct will propel the streamlined U.S. petrochemical industry to record profits this year, with growth expected through at least 1990, major company executives predicted. This bullish outlook for chemical manufacturing and an industrywide move to shed unrelated businesses has prompted GAF Corp &lt;GAF>, privately-held Cain Chemical Inc, and other firms to aggressively seek acquisitions of petrochemical plants. Oil companies such as Ashland Oil Inc &lt;ASH>, the Kentucky-based oil refiner and marketer, are also shopping for money-making petrochemical businesses to buy. "I see us poised at the threshold of a golden period," said Paul Oreffice, chairman of giant Dow Chemical Co &lt;DOW>, adding, "There\'s no major plant capacity being added around the world now. The whole game is bringing out new products and improving the old ones." Analysts say the chemical industry\'s biggest customers, automobile manufacturers and home builders that use a lot of paints and plastics, are expected to buy quantities this year. U.S. petrochemical plants are currently operating at about 90 pct capacity, reflecting tighter supply that could hike product prices by 30 to 40 pct this year, said John Dosher, managing director of Pace Consultants Inc of Houston. Demand for some products such as styrene could push profit margins up by as much as 300 pct, he said. Oreffice, speaking at a meeting of chemical engineers in Houston, said Dow would easily top the 741 mln dlrs it earned last year and predicted it would have the best year in its history. In 1985, when oil prices were still above 25 dlrs a barrel and chemical exports were adversely affected by the strong U.S. dollar, Dow had profits of 58 mln dlrs. "I believe the entire chemical industry is headed for a record year or close to it," Oreffice said. GAF chairman Samuel Heyman estimated that the U.S. chemical industry would report a 20 pct gain in profits during 1987. Last year, the domestic industry earned a total of 13 billion dlrs, a 54 pct leap from 1985. The turn in the fortunes of the once-sickly chemical industry has been brought about by a combination of luck and planning, said Pace\'s John Dosher. Dosher said last year\'s fall in oil prices made feedstocks dramatically cheaper and at the same time the American dollar was weakening against foreign currencies. That helped boost U.S. chemical exports. Also helping to bring supply and demand into balance has been the gradual market absorption of the extra chemical manufacturing capacity created by Middle Eastern oil producers in the early 1980s. Finally, virtually all major U.S. chemical manufacturers have embarked on an extensive corporate restructuring program to mothball inefficient plants, trim the payroll and eliminate unrelated businesses. The restructuring touched off a flurry of friendly and hostile takeover attempts. GAF, which made an unsuccessful attempt in 1985 to acquire Union Carbide Corp &lt;UK>, recently offered three billion dlrs for Borg Warner Corp &lt;BOR>, a Chicago manufacturer of plastics and chemicals. Another industry powerhouse, W.R. Grace &lt;GRA> has divested its retailing, restaurant and fertilizer businesses to raise cash for chemical acquisitions. But some experts worry that the chemical industry may be headed for trouble if companies continue turning their back on the manufacturing of staple petrochemical commodities, such as ethylene, in favor of more profitable specialty chemicals that are custom-designed for a small group of buyers. "Companies like DuPont &lt;DD> and Monsanto Co &lt;MTC> spent the past two or three years trying to get out of the commodity chemical business in reaction to how badly the market had deteriorated," Dosher said. "But I think they will eventually kill the margins on the profitable chemicals in the niche market." Some top chemical executives share the concern. "The challenge for our industry is to keep from getting carried away and repeating past mistakes," GAF\'s Heyman cautioned. "The shift from commodity chemicals may be ill-advised. Specialty businesses do not stay special long." Houston-based Cain Chemical, created this month by the Sterling investment banking group, believes it can generate 700 mln dlrs in annual sales by bucking the industry trend. Chairman Gordon Cain, who previously led a leveraged buyout of Dupont\'s Conoco Inc\'s chemical business, has spent 1.1 billion dlrs since January to buy seven petrochemical plants along the Texas Gulf Coast. The plants produce only basic commodity petrochemicals that are the building blocks of specialty products. "This kind of commodity chemical business will never be a glamorous, high-margin business," Cain said, adding that demand is expected to grow by about three pct annually. Garo Armen, an analyst with Dean Witter Reynolds, said chemical makers have also benefitted by increasing demand for plastics as prices become more competitive with aluminum, wood and steel products. Armen estimated the upturn in the chemical business could last as long as four or five years, provided the U.S. economy continues its modest rate of growth.'
cleaned_article = clean(article) # no title

In [112]:
doc_reuters = nlp(cleaned_article)

In [115]:
doc_reuters.text

'Cheap oil feedstocks, the weakened U.S. dollar and a plant utilization rate approaching 90 pct will propel the streamlined U.S. petrochemical industry to record profits this year, with growth expected through at least 1990, major company executives predicted. This bullish outlook for chemical manufacturing and an industrywide move to shed unrelated businesses has prompted GAF Corp, privately-held Cain Chemical Inc, and other firms to aggressively seek acquisitions of petrochemical plants. Oil companies such as Ashland Oil Inc, the Kentucky-based oil refiner and marketer, are also shopping for money-making petrochemical businesses to buy. "I see us poised at the threshold of a golden period," said Paul Oreffice, chairman of giant Dow Chemical Co, adding, "There\'s no major plant capacity being added around the world now. The whole game is bringing out new products and improving the old ones." Analysts say the chemical industry\'s biggest customers, automobile manufacturers and home bui

In [141]:
render(doc_reuters, style="ent")

In [25]:
for ent in doc_reuters.ents:
    if ent.label_ == "MONEY":#"CARDINAl": #"QUANTITY":
        print(ent)

741 mln dlrs
58 mln dlrs
13 billion dlrs
three billion dlrs
700 mln dlrs
1.1 billion dlrs


In [119]:
coref_article = 'Cheap oil feedstocks, the weakened U.S. dollar and a plant utilization rate approaching 90 pct will propel the streamlined U.S. petrochemical industry to record profits this year, with growth expected through at least 1990, major company executives predicted. This bullish outlook for chemical manufacturing and an industrywide move to shed unrelated businesses has prompted GAF Corp, privately-held Cain Chemical Inc, and other firms to aggressively seek acquisitions of petrochemical plants. Oil companies such as Ashland Oil Inc, the Kentucky-based oil refiner and marketer, are also shopping for money-making petrochemical businesses to buy. "I see us poised at the threshold of a golden period," said Paul Oreffice, chairman of giant Dow Chemical Co, adding, "There\'s no major plant capacity being added around the world now. The whole game is bringing out new products and improving the old ones." Analysts say the chemical industry\'s biggest customers, automobile manufacturers and home builders that use a lot of paints and plastics, are expected to buy quantities this year. U.S. petrochemical plants are currently operating at about 90 pct capacity, reflecting tighter supply that could hike product prices by 30 to 40 pct this year, said John Dosher, managing director of Pace Consultants Inc of Houston. Demand for some products such as styrene could push profit margins up by as much as 300 pct, John Dosher, managing director of Pace Consultants Inc of Houston said. Oreffice, speaking at a meeting of chemical engineers in Houston, said Dow would easily top the 741 mln dlrs Dow earned last year and predicted Dow would have the best year in Dow history. In 1985, when oil prices were still above 25 dlrs a barrel and chemical exports were adversely affected by the strong U.S. dollar, Dow had profits of 58 mln dlrs. "I believe the entire chemical industry is headed for a record year or close to the entire chemical industry," Oreffice said. GAF chairman Samuel Heyman estimated that the entire chemical industry would report a 20 pct gain in profits during 1987. last year, the entire chemical industry earned a total of 13 billion dlrs, a 54 pct leap from 1985. The turn in the fortunes of the entire chemical industry has been brought about by a combination of luck and planning, said Pace\'s John Dosher. Pace\'s John Dosher said last year\'s fall in oil prices made feedstocks dramatically cheaper and at the same time the strong U.S. dollar was weakening against foreign currencies. That helped boost U.S. chemical exports. Also helping to bring supply and demand into balance has been the gradual market absorption of the extra chemical manufacturing capacity created by Middle Eastern oil producers in the early 1980s. Finally, virtually all major U.S. chemical manufacturers have embarked on an extensive corporate restructuring program to mothball inefficient plants, trim the payroll and eliminate unrelated businesses. The restructuring touched off a flurry of friendly and hostile takeover attempts. GAF, which made an unsuccessful attempt in 1985 to acquire Union Carbide Corp, recently offered three billion dlrs for Borg Warner Corp, a Chicago manufacturer of plastics and chemicals. Another industry powerhouse, W.R. Grace has divested W.R. Grace retailing, restaurant and fertilizer businesses to raise cash for chemical acquisitions. But some experts worry that the chemical industry may be headed for trouble if companies continue turning companies back on the manufacturing of staple petrochemical commodities, such as ethylene, in favor of more profitable specialty chemicals that are custom-designed for a small group of buyers. "Companies like DuPont and Monsanto Co spent the past two or three years trying to get out of the commodity chemical business in reaction to how badly the market had deteriorated," Dosher said. "But I think Companies like DuPont and Monsanto Co will eventually kill the margins on the profitable chemicals in the market." Some top chemical executives share the concern. "The challenge for the chemical industry is to keep from getting carried away and repeating past mistakes," GAF\'s Heyman cautioned. "The shift from commodity chemicals may be ill-advised. Specialty businesses do not stay special long." Houston-based Cain Chemical, created this month by the Sterling investment banking group, believes the Sterling investment banking group can generate 700 mln dlrs in annual sales by bucking the industry trend. Chairman Gordon Cain, who previously led a leveraged buyout of Dupont\'s Conoco Inc\'s chemical business, has spent 1.1 billion dlrs since January to buy seven petrochemical plants along the Texas Gulf Coast. seven petrochemical plants along the Texas Gulf Coast produce only basic commodity petrochemicals that are the building blocks of specialty products. "This kind of commodity chemical business will never be a glamorous, high-margin business," Cain said, adding that demand is expected to grow by about three pct annually. Armen, said chemical makers have also benefitted by increasing demand for plastics as prices become more competitive with aluminum, wood and steel products. Armen estimated the upturn in the chemical business could last as long as four or five years, provided the U.S. economy continues the U.S. economy modest rate of growth.'

In [120]:
coref_article

'Cheap oil feedstocks, the weakened U.S. dollar and a plant utilization rate approaching 90 pct will propel the streamlined U.S. petrochemical industry to record profits this year, with growth expected through at least 1990, major company executives predicted. This bullish outlook for chemical manufacturing and an industrywide move to shed unrelated businesses has prompted GAF Corp, privately-held Cain Chemical Inc, and other firms to aggressively seek acquisitions of petrochemical plants. Oil companies such as Ashland Oil Inc, the Kentucky-based oil refiner and marketer, are also shopping for money-making petrochemical businesses to buy. "I see us poised at the threshold of a golden period," said Paul Oreffice, chairman of giant Dow Chemical Co, adding, "There\'s no major plant capacity being added around the world now. The whole game is bringing out new products and improving the old ones." Analysts say the chemical industry\'s biggest customers, automobile manufacturers and home bui

In [121]:
coref_doc = nlp(coref_article)

In [142]:
print_rel(coref_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

(('Dow', 'ORG'), predicted, ('Dow', 'ORG'))
(('Dow', 'ORG'), have, ('Dow', 'ORG'))
(('GAF', 'ORG'), made, ('Union Carbide Corp', 'ORG'))
(('GAF', 'ORG'), acquire, ('Union Carbide Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('Borg Warner Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('three billion dlrs', 'MONEY'))
(('Cain Chemical', 'PERSON'), created, ('Sterling', 'ORG'))
(('Gordon Cain', 'PERSON'), led, ("Conoco Inc's", 'ORG'))


In [131]:
relations = [relation for relation in extract_rel_dep(coref_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])]

In [132]:
print(relations)

[(('Dow', 'ORG'), top, ('Dow', 'ORG')), (('Dow', 'ORG'), predicted, ('Dow', 'ORG')), (('Dow', 'ORG'), have, ('Dow', 'ORG')), (('GAF', 'ORG'), made, ('Union Carbide Corp', 'ORG')), (('GAF', 'ORG'), acquire, ('Union Carbide Corp', 'ORG')), (('GAF', 'ORG'), offered, ('Borg Warner Corp', 'ORG')), (('GAF', 'ORG'), offered, ('three billion dlrs', 'MONEY')), (('Cain Chemical', 'PERSON'), created, ('Sterling', 'ORG')), (('Gordon Cain', 'PERSON'), led, ("Conoco Inc's", 'ORG'))]


In [143]:
print_rel(doc_reuters, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

(('GAF', 'ORG'), acquire, ('Union Carbide Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('Borg Warner Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('three billion dlrs', 'MONEY'))
(('Cain Chemical', 'PERSON'), created, ('Sterling', 'ORG'))
(('Gordon Cain', 'PERSON'), led, ("Conoco Inc's", 'ORG'))


Back to the [table of contents](#contents).

#### b. Reuters website: merger <a name=merger></a>

In [145]:
mergers_article = """Microsoft Corp (MSFT.O) is buying "Call of Duty" maker Activision Blizzard (ATVI.O) for $68.7 billion in the biggest gaming industry deal in history as global technology giants stake their claims to a virtual future.

The all-cash deal announced by Microsoft on Tuesday, its biggest-ever acquisition, will bolster its firepower in the booming videogaming market where it takes on leaders Tencent (0700.HK) and Sony (6758.T).

It also represents the American multinational's bet on the "metaverse", virtual online worlds where people can work, play and socialize, as many of its biggest competitors are already doing. read more

"Gaming is the most dynamic and exciting category in entertainment across all platforms today and will play a key role in the development of metaverse platforms," Microsoft Chief Executive Satya Nadella said.

Microsoft's offer of $95 per share represents a premium of 45% to Activision's Friday close. Shares of Activision were at $83.35 in early trading.

The deal comes at a time of weakness for Activision, maker of games such as "Overwatch" and "Candy Crush". Its shares have slumped more than 37% since reaching a record high last year, hit by allegations of sexual harassment of employees and misconduct by several top managers.

The company is still addressing those allegations and said on Monday it had fired or pushed out more than three dozen employees and disciplined another 40 since July.

CEO Bobby Kotick, who said Microsoft reached out to him for a possible buyout, would continue to be the CEO of Activision following the deal.

In a conference call with analysts, Microsoft boss Nadella did not directly refer to the scandal but talked about the importance of culture in the company.

"It's critical for Activision Blizzard to drive forward on its renewed cultural commitments," he said, adding "the success of this acquisition will depend on it."

'METAVERSE ARMS RACE'

The global gaming market was valued at $173.70 billion in 2021, and is expected to reach $314.40 billion by 2027, according to research firm Mordor Intelligence.

Microsoft can already claim a significant beachhead in the gaming world as one of the big three console makers. It has been making big investments including up "Minecraft" maker Mojang Studios and Zenimax in multi-billion dollar deals in recent years.

It has also launched a popular cloud gaming service, which has more than 25 million subscribers.

Executives talked up Activision's 400 million monthly active users as one major attraction to the deal and how vital these communities could play in Microsoft's various metaverse plays.

Activision's library of games could give Microsoft's Xbox gaming platform an edge over Sony's Playstation, which has for years enjoyed a more steady stream of exclusive games.

"The likes of Netflix have already said they'd like to foray into gaming themselves, but Microsoft has come out swinging with today’s rather generous offer, which would make Microsoft the third largest gaming company in the world," said Sophie Lund-Yates, equity analyst at Hargreaves Lansdown.

Tech companies from Microsoft to Nvidia have placed big bets on the so-called metaverse, with the buzz around it intensifying late last year after Facebook renamed itself as Meta Platforms to reflect its focus on its virtual reality business.

"This is a significant deal for the consumer side of the business and more importantly, Microsoft acquiring Activision really starts the metaverse arms race," David Wagner, Equity Analyst and Portfolio Manager at Aptus Capital Advisors said.

"We believe the deal will get done," he said, but cautioned "This will get a lot of looks from a regulatory standpoint."

Lawmakers on Capitol Hill, who are considering a long list of antitrust bills aimed at reining in Big Tech companies like Google and Facebook, will be skeptical of this transaction, said Andre Barlow of the law firm Doyle, Barlow & Mazard PLLC.

"Microsoft is already big in gaming," he said."""
mergers_doc = nlp(clean(mergers_article))

In [170]:
print_rel(mergers_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

(('Facebook', 'ORG'), renamed, ('Meta Platforms', 'ORG'))
(('Microsoft', 'ORG'), acquiring, ('Activision', 'ORG'))


In [171]:
render(mergers_doc,style="ent")

In [126]:
merger_sent = """Microsoft Corp (MSFT.O) is buying "Call of Duty" maker Activision Blizzard (ATVI.O) for $68.7 billion in the biggest gaming industry deal in history as global technology giants stake their claims to a virtual future."""
merger_sent_doc = nlp(clean(merger_sent))

In [172]:
print_rel(merger_sent_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

In [173]:
render(merger_sent_doc,style="dep")

In [174]:
trial = nlp("GAF, which made an unsuccessful attempt in 1985 to acquire Union Carbide Corp &lt;UK>, recently offered three billion dlrs for Borg Warner Corp &lt;BOR>, a Chicago manufacturer of plastics and chemicals.")

In [178]:
render(trial, style="dep")

In [186]:
for t in trial:
    print("TOKEN: ",t,"\ndepencies:",t.dep_,"\npos: ",t.pos_,"\ntoken's head:", t.head)
    if peek(t.children):
        print("Children:")
        for child in t.children:
            print(child)
    if peek(t.lefts):
        print("Lefts:")
        for left in t.lefts:
            print(left)
    if peek(t.rights):
        print("Rights:")
        for right in t.rights:
            print(right)
    print("\n")

TOKEN:  GAF 
depencies: nsubj 
pos:  PROPN 
token's head: offered
Children:
,
made
>
,
Rights:
,
made
>
,


TOKEN:  , 
depencies: punct 
pos:  PUNCT 
token's head: GAF


TOKEN:  which 
depencies: nsubj 
pos:  PRON 
token's head: made


TOKEN:  made 
depencies: relcl 
pos:  VERB 
token's head: GAF
Children:
which
attempt
in
acquire
Lefts:
which
Rights:
attempt
in
acquire


TOKEN:  an 
depencies: det 
pos:  DET 
token's head: attempt


TOKEN:  unsuccessful 
depencies: amod 
pos:  ADJ 
token's head: attempt


TOKEN:  attempt 
depencies: dobj 
pos:  NOUN 
token's head: made
Children:
an
unsuccessful
Lefts:
an
unsuccessful


TOKEN:  in 
depencies: prep 
pos:  ADP 
token's head: made
Children:
1985
Rights:
1985


TOKEN:  1985 
depencies: pobj 
pos:  NUM 
token's head: in


TOKEN:  to 
depencies: aux 
pos:  PART 
token's head: acquire


TOKEN:  acquire 
depencies: advcl 
pos:  VERB 
token's head: made
Children:
to
Corp
Lefts:
to
Rights:
Corp


TOKEN:  Union 
depencies: compound 
pos:  PROPN 


In [180]:
trial

GAF, which made an unsuccessful attempt in 1985 to acquire Union Carbide Corp &lt;UK>, recently offered three billion dlrs for Borg Warner Corp &lt;BOR>, a Chicago manufacturer of plastics and chemicals.

In [183]:
print_rel(trial, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])

(('GAF', 'ORG'), made, ('Union Carbide Corp &lt;UK', 'ORG'))
(('GAF', 'ORG'), acquire, ('Union Carbide Corp &lt;UK', 'ORG'))
(('GAF', 'ORG'), offered, ('Borg Warner Corp &', 'ORG'))
(('GAF', 'ORG'), offered, ('three billion dlrs', 'MONEY'))


Back to the [table of contents](#contents).

### C. Categories <a name=categories></a>

#### a. Crude <a name=crude></a>

In [37]:
crude = ""
for fileid in reuters.fileids("crude"):#[0:50]:
    file = reuters.raw(fileid)
    file = clean(file)
    crude = crude + " " + file
crude = crude.strip()

In [38]:
crude_doc = nlp(crude)

In [32]:
entities = []
for ent in crude_doc.ents:
    if ent.label_ == "MONEY":
        entities.append(ent.text)

In [96]:
verb = "sell"
subjects = []
subjects_label = []
verbs = []
objects = []
objects_label = []
relations = [relation for relation in extract_rel_dep(crude_doc, pred_name=verb, pred_synonyms=[verb]+synonyms(verb), excl_prepos=[])]
for relation in relations:
    print(relation)
    subject_tuple, verb, object_tuple = relation
    subjects.append(subject_tuple[0])
    subjects_label.append(subject_tuple[1])
    verbs.append(verb)
    objects.append(object_tuple[0])
    objects_label.append(object_tuple[1])

(('GAF', 'ORG'), made, ('Union Carbide Corp', 'ORG'))
(('GAF', 'ORG'), acquire, ('Union Carbide Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('Borg Warner Corp', 'ORG'))
(('GAF', 'ORG'), offered, ('three billion dlrs', 'MONEY'))
(('Cain Chemical', 'PERSON'), created, ('Sterling', 'ORG'))
(('Gordon Cain', 'PERSON'), led, ("Conoco Inc's", 'ORG'))
(('Yates', 'PERSON'), told, ('Reuters', 'ORG'))
(('Texaco', 'ORG'), told, ('Texaco', 'ORG'))
(('Texaco', 'ORG'), told, ('one billion dlr', 'MONEY'))
(('Texaco', 'ORG'), cut, ('one billion dlr', 'MONEY'))
(('OPEC', 'ORG'), agreed, ('18 dlrs', 'MONEY'))
(('OPEC', 'ORG'), limit, ('18 dlrs', 'MONEY'))
(('OPEC', 'ORG'), return, ('18 dlrs', 'MONEY'))
(('OPEC', 'ORG'), averaging, ('18 dlrs', 'MONEY'))
(('Texaco', 'ORG'), filed, ('Pennzoil', 'ORG'))
(('Texaco', 'ORG'), filed, ('11 billion dlrs', 'MONEY'))
(('Texaco', 'ORG'), failing, ('Pennzoil', 'ORG'))
(('Texaco', 'ORG'), failing, ('11 billion dlrs', 'MONEY'))
(('Texaco', 'ORG'), reach, ('Pennzoil', 'ORG'

(('Miller', 'ORG'), acknowleged, ('3.3 billion dlrs', 'MONEY'))
(('DEVON', 'PERSON'), REPORTS, ('Devon Resource Investors', 'ORG'))
(('DEVON', 'PERSON'), INCREASE, ('Devon Resource Investors', 'ORG'))
(('IMPERIAL OIL &lt;IMO.A> IN TALKS WITH', 'ORG'), owned, ('Exxon Corp', 'ORG'))
(('MOBIL', 'ORG'), RAISES, ('DLR', 'ORG'))
(('Mobil', 'ORG'), said, ('API', 'ORG'))
(('Mobil', 'ORG'), changed, ('API', 'ORG'))
(('Petrobras', 'ORG'), cancelled, ('Bank of Brazil', 'ORG'))
(('Petrobras', 'ORG'), refused, ('Bank of Brazil', 'ORG'))
(('Petrobras', 'ORG'), accept, ('Bank of Brazil', 'ORG'))
(('Texaco Canada Inc', 'ORG'), owned, ('Texaco Inc', 'ORG'))
(('Ecopetrol', 'PERSON'), exploiting, ('Occidental Petroleum Corp', 'ORG'))
(('PANCANADIAN', 'ORG'), SELL, ('IEA', 'ORG'))
(('PANCANADIAN', 'ORG'), SELL, ('IEA', 'ORG'))
(('IEA', 'ORG'), estimates, ('OECD', 'ORG'))
(('IEA', 'ORG'), put, ('OECD', 'ORG'))
(('IEA', 'ORG'), FORECASTS, ('OECD', 'ORG'))
(('IEA', 'ORG'), estimates, ('OECD', 'ORG'))
(('IEA'

(('Petrobras', 'ORG'), said, ('Banco', 'ORG'))
(('Petrobras', 'ORG'), accept, ('Banco', 'ORG'))
(('Petrobras', 'ORG'), do, ('Brasil', 'ORG'))
(('Petrobras', 'ORG'), cancelled, ('Bank of Brazil', 'ORG'))
(('Petrobras', 'ORG'), refused, ('Bank of Brazil', 'ORG'))
(('Petrobras', 'ORG'), accept, ('Bank of Brazil', 'ORG'))
(('Margoshes', 'PERSON'), said, ('Exxon', 'ORG'))
(('Margoshes', 'PERSON'), recommends, ('Imperial Oil &lt;IMO.A>', 'ORG'))
(('Margoshes', 'ORG'), said, ('Atlantic Richfield', 'ORG'))
(('Margoshes', 'ORG'), recommended, ('Atlantic Richfield', 'ORG'))
(('EXXON', 'ORG'), CLOSE, ('Exxon Corp', 'ORG'))
(('Lawson', 'PERSON'), announced, ('PRT', 'ORG'))
(('Lawson', 'PERSON'), allow, ('PRT', 'ORG'))
(('Lawson', 'PERSON'), qualify, ('PRT', 'ORG'))
(('HERRINGTON', 'PERSON'), SAYS, ('HERRINGTON', 'PERSON'))
(('HERRINGTON', 'PERSON'), RECOMMEND, ('HERRINGTON', 'PERSON'))
(('HERRINGTON', 'PERSON'), SAYS, ('HERRINGTON', 'PERSON'))
(('HERRINGTON', 'PERSON'), RECOMMEND, ('HERRINGTON', '

In [60]:
relationships = pd.DataFrame({"subject":subjects,"subject_label":subjects_label,"verb":verbs,"object":objects,"object_label":objects_label})

In [98]:
relationships.to_csv("data/relationships-3ENT_4ENT.csv")

In [99]:
edges = pd.DataFrame({"from":subjects,"to":objects,"label":verbs})
edges.to_csv("data/edges-3ENT_4ENT.csv")
nodes = pd.DataFrame({"id":subjects+objects,"Type":subjects_label+objects_label})
nodes.to_csv("data/nodes-3ENT_4ENT.csv")

In [59]:
for sentence in crude_doc.sents:
    for ent in sentence.ents:
        if ent.label_ == "MONEY" and ent.text in entities:
            displacy.render(sentence, style="ent")

In [95]:
len(subjects)

223

In [97]:
len(subjects)

480