<a href="https://colab.research.google.com/github/opensanctions/storyweb/blob/main/contrib/link_classification_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install python and other dependencies
!nvcc --version
!pip install -U pip wheel
!pip install -U 'spacy[cuda111]==3.3.0' 
!pip install pyicu normality fingerprints nltk
!python -m spacy download en_core_web_sm
!python -m spacy validate

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[cuda111]==3.3.0
  Downloading spacy-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB

In [4]:
import io
import csv
import requests
DATA_URL = "https://github.com/opensanctions/storyweb/blob/main/contrib/tagged_sentences_20230203.csv?raw=true"

# Download the tagged-sentence CSV. The timeout stops a stalled connection from
# hanging the kernel, and raise_for_status() makes an HTTP failure abort the cell
# instead of letting an error page be parsed as if it were CSV data.
res = requests.get(DATA_URL, timeout=60)
res.raise_for_status()
buf = io.StringIO(res.text)

# One dict per CSV row, keyed by the column names in the header line.
sentences = list(csv.DictReader(buf))

In [7]:
import spacy
from spacy import displacy
import nltk
from nltk.corpus import stopwords

# spaCy: request GPU execution before the pipeline is loaded, then load the
# small English model.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')

# NLTK: fetch the stopword corpus and keep the English word list for filtering.
nltk.download("stopwords")
en_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Attach to every sentence record a 'tokens' list holding the lower-cased lemmas
# of its text, skipping stopwords and lemmas shorter than three characters.

# en_stopwords is a list; converting it once to a set turns the per-token
# membership test from a linear scan into a hash lookup.
stopword_set = set(en_stopwords)

# nlp.pipe processes the texts in batches and yields one Doc per input text in
# input order, so zipping it with `sentences` pairs each record with its own Doc.
texts = (sent['sentence'] for sent in sentences)
for sent, doc in zip(sentences, nlp.pipe(texts)):
  lemmas = (token.lemma_.lower() for token in doc)
  sent['tokens'] = [
    lemma for lemma in lemmas
    if len(lemma) >= 3 and lemma not in stopword_set
  ]

In [14]:
from collections import Counter

# Build one token-frequency Counter per link type: rels[link_type][token] -> count.
rels = {}
for sent in sentences:
  link_type = sent['link_type']
  sent_tokens = sent['tokens']
  # A sentence without tokens contributes nothing, so no (empty) Counter is
  # created for its link type on its behalf.
  if not sent_tokens:
    continue
  if link_type not in rels:
    rels[link_type] = Counter()
  rels[link_type].update(sent_tokens)

# Report the 50 most frequent tokens for each link type.
for link_type, counter in rels.items():
  top_tokens = counter.most_common(50)
  print("XXX", link_type, "TOK", top_tokens)


XXX FAMILY TOK [('gupta', 36), ('2017', 35), ('guptas', 31), ('president', 31), ('aliyev', 31), ('2016', 27), ('guptaleak', 22), ('august', 19), ('ilham', 19), ('putin', 18), ('2014', 17), ('october', 16), ('zuma', 15), ('state', 15), ('july', 14), ('november', 13), ('2011', 13), ('deal', 12), ('december', 12), ('aliyeva', 12), ('heydar', 12), ('daughter', 11), ('russian', 11), ('vladimir', 10), ('azerbaijani', 10), ('september', 9), ('tony', 9), ('eskom', 9), ('mckinsey', 9), ('transnet', 9), ('2015', 9), ('azerbaijan', 9), ('family', 8), ('million', 8), ('march', 8), ('board', 8), ('djukanovic', 8), ('igor', 8), ('cousin', 8), ('bank', 8), ('carole', 8), ('trillian', 7), ('part', 7), ('mine', 7), ('february', 7), ('june', 7), ('2013', 7), ('2010', 7), ('arzu', 7), ('leyla', 7)]
XXX WITHIN TOK [('new', 506), ('york', 452), ('university', 412), ('offshore', 278), ('institution', 270), ('london', 258), ('leaks', 254), ('database', 253), ('usa', 247), ('u.s.', 224), ('district', 166), ('