<a href="https://colab.research.google.com/github/opensanctions/storyweb/blob/main/occrp_play.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Welcome to Natural Language Processing for Journalism!

**This notebook: https://bit.ly/dh2022-nlp**
Crawler script: https://gist.github.com/pudo/2de2c650f52e3cf44f3ed758b2887ef1 

### Concepts

* Language processing vs. language generation 
* Document extraction vs. NLP
  * cf. https://pudo.org/blog/2016/06/08/poor-mans-text-mining.html
* Human text is complicated, text vs. language
* Computational linguistics vs. neural techniques

### Examples

* Social media postings
* Government reports
* Chat logs 
* Wikipedia and The Internet 
* News reporting

In [1]:
# Install python and other dependencies (run this cell first on a fresh runtime)
# Print the CUDA toolkit version so it can be compared with the `cuda111` extra below
!nvcc --version
# Upgrade the packaging tools before installing anything else
!pip install -U pip wheel
# spaCy pinned to 3.3.0, installed with its `cuda111` (GPU) extra
!pip install -U 'spacy[cuda111]==3.3.0' 
# Additional libraries; nltk is used further down for its stopword list
!pip install pyicu normality fingerprints nltk
# Download the small English pipeline that is later loaded with spacy.load()
!python -m spacy download en_core_web_sm
# Check the installed pipelines against the installed spaCy version
!python -m spacy validate

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.1 MB/s 
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy[cuda111]==3.3.0
  Downloading spacy-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting thinc

In [2]:
# Load article data
import random
import requests
from collections import Counter

DATA_URL = 'https://assets.pudo.org/dataharvest/articles.json'
# Fail fast with a clear error instead of hanging on a stalled connection or
# trying to parse an HTTP error page as JSON:
res = requests.get(DATA_URL, timeout=60)
res.raise_for_status()
articles = res.json()

# Split up the author info
# Boilerplate found in the byline field that is not part of anybody's name:
SKIP = ['Written by', 'OCCRP', 'Organized Crime; Corruption Reporting Project', 'Organized Crime Corruption Reporting Project']
# Maps each cleaned, lower-cased author name to the number of articles they wrote.
# (A Counter is a dict, so `in` checks and `.items()` keep working as before.)
author_names = Counter()
for article in articles:
  author = article.get('author')
  if author is None:
    # No byline: the article gets no 'authors' key, so later cells read it
    # with article.get('authors', []).
    continue
  # Clean up the author name a bit:
  for skip in SKIP:
    author = author.replace(skip, '')
  # Bylines with several writers are separated by ';'. Drop empty fragments
  # left over after the boilerplate has been removed.
  names = [part.strip() for part in author.lower().split(';')]
  names = [name for name in names if name]
  article['authors'] = names
  author_names.update(names)

# Shuffle so that articles[0] and slices such as articles[:1000] are a random
# sample. No seed is set, so the order differs on every run.
random.shuffle(articles)
print(len(articles))

16507


In [3]:
# Rank the authors by number of articles, most prolific first
from pprint import pprint

authors = sorted(author_names.items(), key=lambda pair: -pair[1])
# To list the top of the ranking: print(authors[:100])

# Inspect one article record to see which fields are available
pprint(articles[0])

{'author': 'Aisha Kehoe Down',
 'authors': ['aisha kehoe down'],
 'date': '2019-05-10',
 'text': 'Report: Unchecked, Rampant Money Laundering in Canada\n'
         'Canadian authorities are failing to catch 99.9 percent of money '
         'laundering in the country which means that US$75 billion to $100 '
         'billion of dirty money from all over the world are laundered through '
         'Canada each year, according to the author of a policy brief released '
         'Monday.\n'
         '“Canada has a very large money-laundering problem, predominantly '
         'sourced from foreign countries,” says the report.“Canada has a very '
         'large money-laundering problem, predominantly sourced from foreign '
         'countries,” says the report.\n'
         'The main reason for this, the brief explains, is the country’s lack '
         'of transparency surrounding beneficial owners of companies and a '
         'lack of penalty for those who break the law.\n'
         '“Canad

In [4]:
# Boot up spaCy
import spacy
from spacy import displacy

# Ask spaCy to use a GPU; this is requested before the pipeline is loaded below.
# NOTE(review): presumably falls back to the CPU when no GPU is present — confirm.
spacy.prefer_gpu()
# Load the small English pipeline downloaded in the install cell
nlp = spacy.load("en_core_web_sm")

In [5]:
from collections import Counter

# Pick one article at random and run it through the spaCy pipeline
article = random.choice(articles)
doc = nlp(article['text'])

# Show document language:
print(doc.lang_)

# Show the purely alphabetic tokens whose fine-grained tag is 'NN'
# (token, lemma and tag); everything else is skipped:
for token in doc:
  if not (token.is_alpha and token.tag_ == 'NN'):
    continue
  print(token, token.lemma_, "pos", token.tag_)

# Variations to try:
#     print(str(token), token.lemma_, "pos", token.tag_)
# for sent in doc.sents:
#   print(sent)

en
head head pos NN
pension pension pos NN
group group pos NN
district district pos NN
court court pos NN
capital capital pos NN
year year pos NN
period period pos NN
money money pos NN
country country pos NN
scheme scheme pos NN
prison prison pos NN
jail jail pos NN
time time pos NN
year year pos NN
probation probation pos NN
group group pos NN


In [6]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus into the runtime
nltk.download('stopwords')
# Keep the English stopwords in a set: the list returned by NLTK would be
# scanned linearly on every `in` check, and we do one check per token.
en_stopwords = set(stopwords.words('english'))
# print(sorted(en_stopwords))

# For every token of the current document, show its lemma and whether that
# lemma is a stopword (the comparison is case-sensitive):
for token in doc:
  print(token.lemma_, "stopword", token.lemma_ in en_stopwords)

Armenia stopword False
: stopword False
official stopword False
jail stopword False
for stopword True
skim stopword False
Pensions stopword False

 stopword False
Thirteen stopword False
armenian stopword False
official stopword False
, stopword False
include stopword False
the stopword True
former stopword False
head stopword False
of stopword True
the stopword True
State stopword False
Social stopword False
Security stopword False
Service stopword False
( stopword False
SSSS stopword False
) stopword False
, stopword False
have stopword True
be stopword True
convict stopword False
of stopword True
embezzle stopword False
from stopword True
uncollected stopword False
pension stopword False
fund stopword False
. stopword False

 stopword False
the stopword True
group stopword False
be stopword True
find stopword False
guilty stopword False
at stopword True
a stopword True
district stopword False
court stopword False
in stopword True
the stopword True
capital stopword False
of stopword 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Count how often each lemma occurs, looking only at alphabetic tokens
# whose lemma is not a stopword
counter = Counter(
  token.lemma_
  for token in doc
  if token.is_alpha and token.lemma_ not in en_stopwords
)

# Up to 100 most frequent lemmas, as (lemma, count) pairs
print(counter.most_common(100))

[('year', 4), ('sentence', 3), ('five', 3), ('official', 2), ('jail', 2), ('head', 2), ('SSSS', 2), ('group', 2), ('Armenia', 1), ('skim', 1), ('Pensions', 1), ('Thirteen', 1), ('armenian', 1), ('include', 1), ('former', 1), ('State', 1), ('Social', 1), ('Security', 1), ('Service', 1), ('convict', 1), ('embezzle', 1), ('uncollected', 1), ('pension', 1), ('fund', 1), ('find', 1), ('guilty', 1), ('district', 1), ('court', 1), ('capital', 1), ('Yerevan', 1), ('collect', 1), ('four', 1), ('period', 1), ('money', 1), ('take', 1), ('allocate', 1), ('pensioner', 1), ('die', 1), ('move', 1), ('country', 1), ('report', 1), ('Hetq', 1), ('Vazgen', 1), ('Khachikyan', 1), ('scheme', 1), ('underway', 1), ('prison', 1), ('month', 1), ('time', 1), ('remain', 1), ('seven', 1), ('one', 1), ('probation', 1), ('also', 1), ('fine', 1)]


In [8]:
from spacy import displacy

# Pick another random article and run it through the pipeline.
# Note: this rebinds the notebook-level `article` and `doc` variables.
article = random.choice(articles)
doc = nlp(article['text'])

# Visualize the named entities inline in the notebook output:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Tally how often each organisation / person name is mentioned
counter = Counter()

# Extract named entities from the articles; only the "ner" pipe is enabled
# inside this block:
with nlp.select_pipes(enable="ner"):
  # (text, article) pairs, so every parsed doc comes back with its article.
  # Only the first 1000 (shuffled) articles are processed.
  docs = ((a['text'], a) for a in articles[:1000])
  for (doc, article) in nlp.pipe(docs, batch_size=50, as_tuples=True):
    mentions = (str(ent) for ent in doc.ents if ent.label_ in ("ORG", "PERSON"))
    counter.update(mentions)

# Show the most common extracted entities:
print(counter.most_common(100))

[('OCCRP', 427), ('EU', 191), ('Reuters', 135), ('Jammeh', 98), ('First Bank', 92), ('BBC', 78), ('Guardian', 71), ('Interpol', 69), ('FBI', 67), ('Moldovan', 65), ('Europol', 64), ('Putin', 63), ('Parliament', 61), ('the European Union', 60), ('Kelmendi', 57), ('NCA', 56), ('Ismayilova', 56), ('Magnitsky', 55), ('the Organized Crime and Corruption Reporting Project', 52), ('UN', 52), ('Kurchenko', 50), ('Mafia', 50), ('the Central Bank', 49), ('Laundromat', 48), ('Gruevski', 48), ('Moldova', 45), ('Kremlin', 45), ('Antonov', 45), ('Vladimir Putin', 44), ('Olifeja', 44), ('RFE/RL', 41), ('Church', 41), ('Transparency International', 39), ('COVID-19', 38), ('NSO Group', 37), ('Ilham Aliyev', 37), ('Group America', 36), ('Khadija Ismayilova', 36), ('Zaev', 35), ('un', 35), ('SCL', 35), ('Alltest', 34), ('Martínez', 34), ('Gupta', 33), ('the New York Times', 33), ('Aliyev', 33), ('Trump', 31), ('Odebrecht', 31), ('Novaya Gazeta', 30), ('Guptas', 30), ('Central Bank', 29), ('NSO', 29), ('B

In [None]:
from collections import Counter, defaultdict

# Maps author name -> Counter of lower-cased entity names mentioned in the
# articles they (co-)wrote.
authors = defaultdict(Counter)

# Run documents through the extractor in batch mode:
with nlp.select_pipes(enable="ner"):
  # Articles without any known author can never contribute to the result, so
  # they are dropped before the (expensive) entity extraction. The slice caps
  # the run at 20000 articles.
  docs = ((a['text'], a) for a in articles[:20000] if a.get('authors'))
  for (doc, article) in nlp.pipe(docs, batch_size=50, as_tuples=True):
    # go through the extracted entities:
    for ent in doc.ents:
      if ent.label_ not in ("ORG", "PERSON"):
        continue
      entity = ent.text.lower()
      # filter out mentions of journalists in reporting:
      if entity in author_names:
        continue
      # credit the mention to every author of the article
      for author in article['authors']:
        authors[author][entity] += 1

# List out the authors and their pet topics (five most-mentioned entities):
for author, entities in authors.items():
  print(author, " -> ", entities.most_common(5))

damir bešlija  ->  [('eu', 58), ('europol', 49), ('nca', 34), ('afp', 33), ('olaf', 26)]
vanja lakic  ->  [('odebrecht', 55), ('guardian', 48), ('shell', 44), ('magnitsky', 40), ('commbank', 36)]
eli moskowitz  ->  [('eu', 305), ('congress', 72), ('ubo', 58), ('trump', 48), ('the european commission', 40)]
aisha kehoe down  ->  [('najib', 53), ('pmi', 45), ('danske bank', 44), ('zuma', 36), ('kolomoisky', 35)]
will neal  ->  [('europol', 83), ('nca', 75), ('eu', 70), ('sinaloa', 63), ('guardian', 51)]
jones  ->  [('eu', 37), ('moldova', 33), ('strelet', 18), ('bamc', 18), ('valeriu strelet', 15)]
zdravko ljubas  ->  [('eu', 234), ('europol', 222), ('fbk', 208), ('kremlin', 175), ('putin', 121)]
sinead carolan  ->  [('zuma', 64), ('the new york times', 50), ('eu', 39), ('gupta', 33), ('lee', 33)]
david klein  ->  [('europol', 224), ('interpol', 164), ('eu', 96), ('covid-19', 93), ('fbi', 85)]
viktoriya li  ->  [('kazakh', 7), ('nursultan nazarbayev', 7), ('interpol', 4), ('ardan', 4), (

# Where to next?

* Read this: https://investigate.ai/text-analysis/types-of-text-analysis/ 
* TF/IDF: https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html 
* SparkNLP: https://nlp.johnsnowlabs.com/ 
* Fasttext: https://fasttext.cc/ 
* Labeling tool: https://labelstud.io/