<a href="https://colab.research.google.com/github/opensanctions/storyweb/blob/main/contrib/link_classification_experiments_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Python and other dependencies for this notebook (run once per fresh runtime).
# Print the CUDA toolkit version available on this machine.
!nvcc --version
# Upgrade the packaging tools before installing anything else.
!pip install -U pip wheel
# spaCy is pinned to 3.3.0 and installed with its `cuda111` extra.
# NOTE(review): the extra presumably needs to match the CUDA release reported by
# `nvcc --version` above — confirm it is the right one for this runtime.
!pip install -U 'spacy[cuda111]==3.3.0' 
# Extra libraries; of these, the cells below import only nltk (for its stopword list).
!pip install pyicu normality fingerprints nltk
# Fetch the small English pipeline that is later loaded with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm
# Sanity-check the installed pipeline packages against the installed spaCy version.
!python -m spacy validate

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[cuda111]==3.3.0
  Downloading spacy-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB

In [4]:
import io
import csv
import requests
# Download the tagged-sentence CSV and parse it into `sentences`, a list with
# one dict per CSV row keyed by the header line. Later cells read the
# 'sentence' and 'link_type' columns from these rows.
DATA_URL = "https://github.com/opensanctions/storyweb/blob/main/contrib/tagged_sentences_20230203.csv?raw=true"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
res = requests.get(DATA_URL, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page as CSV.
res.raise_for_status()
buf = io.StringIO(res.text)
sentences = list(csv.DictReader(buf))

In [7]:
import spacy
from spacy import displacy
import nltk
from nltk.corpus import stopwords

# --- spaCy pipeline -------------------------------------------------------
# Ask for the GPU before the model is loaded; the pipeline is then created
# from the small English package.
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

# --- NLTK stopwords -------------------------------------------------------
# The corpus has to be downloaded before it can be read; `en_stopwords` is
# used later as an additional lemma-level filter.
nltk.download('stopwords')
en_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
# Reduce every sentence to its lower-cased verb/adjective lemmas and store the
# result on the row under the 'tokens' key.

# Only tokens tagged with one of these coarse part-of-speech labels are kept.
KEPT_POS = ('VERB', 'ADJ')
# Set membership is O(1); the plain stopword list would be scanned linearly
# for every single token.
stopword_set = frozenset(en_stopwords)


def _content_lemmas(doc):
  """Return the lower-cased lemmas of the verbs and adjectives in `doc`.

  A token is dropped when spaCy flags it as a stop word, when its
  part-of-speech is not in KEPT_POS, when its lemma is in the NLTK
  stopword set, or when the lemma is shorter than three characters.
  """
  lemmas = []
  for token in doc:
    if token.is_stop or token.pos_ not in KEPT_POS:
      continue
    lemma = token.lemma_.lower()
    if len(lemma) < 3 or lemma in stopword_set:
      continue
    lemmas.append(lemma)
  return lemmas


# nlp.pipe processes the texts in batches (much faster than one nlp() call per
# sentence) and yields the docs in input order, so zip() pairs each doc with
# the row it came from.
docs = nlp.pipe(sent['sentence'] for sent in sentences)
for sent, doc in zip(sentences, docs):
  sent['tokens'] = _content_lemmas(doc)

In [24]:
from collections import Counter

# Tally how often each lemma occurs per link type (`rels`) and across the
# whole corpus (`overall`).
rels = {}
overall = Counter()
for sent in sentences:
  link_type = sent['link_type']
  lemmas = sent['tokens']
  if not lemmas:
    # No tokens: leave `rels` untouched so no empty per-type counter appears.
    continue
  rels.setdefault(link_type, Counter()).update(lemmas)
  overall.update(lemmas)

# For every link type, score each lemma by the share of its corpus-wide
# occurrences that fall under that link type, and print the ten highest.
for link_type, counter in rels.items():
  terms = [
    (token, cnt / float(overall[token]))
    for token, cnt in counter.items()
    if cnt != 1  # lemmas seen only once under this link type are skipped
  ]
  # In-place stable sort, highest share first.
  terms.sort(key=lambda pair: pair[1], reverse=True)
  print("XXX", link_type, "TOKENS", terms[:10])


XXX FAMILY TOKENS [('blow', 1.0), ('score', 1.0), ('hijack', 1.0), ('plunder', 1.0), ('borrow', 0.6666666666666666), ('moral', 0.5), ('trillian', 0.4444444444444444), ('little', 0.21428571428571427), ('happen', 0.2), ('invest', 0.19047619047619047)]
XXX WITHIN TOKENS [('unspecified', 1.0), ('terminate', 1.0), ('rapid', 1.0), ('recall', 1.0), ('shameless', 1.0), ('fear', 1.0), ('square', 1.0), ('assure', 1.0), ('comfortable', 1.0), ('iraqi', 1.0)]
XXX MANAGER TOKENS [('split', 1.0), ('locomotive', 1.0), ('amab', 1.0), ('ideal', 1.0), ('artful', 1.0), ('hazardous', 1.0), ('adjacent', 1.0), ('decrease', 1.0), ('guard', 1.0), ('marry', 1.0)]
XXX ASSOCIATE TOKENS [('hover', 1.0), ('aggressive', 1.0), ('recount', 1.0), ('tough', 1.0), ('ordinary', 1.0), ('petition', 1.0), ('cancel', 1.0), ('grassroots', 1.0), ('overthrown', 1.0), ('agathe', 1.0)]
XXX EMPLOYEE TOKENS [('outstanding', 1.0), ('resemble', 1.0), ('radioactive', 1.0), ('ironic', 1.0), ('lengthy', 1.0), ('occur', 1.0), ('shimmer', 