In [1]:
from sqlite_db import thesisDB
import json
from unidecode import unidecode
import re
from codingtools import DisambiguationTool

db_file = 'D:/thesis/nl/data.db'
actor_file = 'D:/thesis/nl/actors/tobecoded.csv'
db = thesisDB(db_file)

In [2]:
##Get entities from the database

ents = db.getEntities()
unique_ents = set()
for rowid, ent, docid, ent_type in ents:
    if ent_type in ['PER','ORG']:
        unique_ents.add((ent,ent_type))
ents.close()

### Counting
At this point, coding should be done. This includes the following subtasks:
1. Disambiguation. Solve duplicates into a map that collects all formulations of one entity with its nominal entity. For example, the database contains multiple phrasings of Theresa May, sometimes as 'May', sometimes as 'Theresa May'. These need to be collapsed in order to identify unique actors per article. Main problem is that Ï need to keep track of every change made, so I can collapse everything on the document level before the final counting task as well.
2. Coding. Code according to the EES media study codebook. For multiple occurrences form disambiguation, only code the nominal entity and then map the code to the others when counting.
3. Counting.
  1. Construct map with key (ent,ent_type) and code as value.
  2. Get set of unique entities (collapsed using disambiguation map) for every document
  3. Code every unique entity using the codes map

In [3]:
#Step 1 Disambiguation map, automatic part
#Map set up as if nothing is changed between names
disambiguation_map = {k:k for k in unique_ents}

#Remove duplicate spelling from accents
disambiguation_map = {k:(unidecode(ent),ent_type) for k,(ent,ent_type) in disambiguation_map.items()}

#Remove lowercase words, if applicable
lcword = re.compile('^[a-z]+ ') #lowercase word at start of string
wrong_whitespace = re.compile('(?<= ) |^ | $|^[^A-Za-z]+(?=[A-Za-z])|(?<=[^A-Za-z])[^A-Za-z]+')
    #Remaining whitespace after lowercase words are removed: 
    #if preceded by another whitespace, start of string, end of string, non-alphabetical characters at start or end
disambiguation_map = {k:(re.sub(lcword,'',ent),ent_type) for k,(ent,ent_type) in disambiguation_map.items()}
disambiguation_map = {k:(re.sub(wrong_whitespace,'',ent),ent_type) for k,(ent,ent_type) in disambiguation_map.items()}

In [4]:
#Step 2 Disambiguation by hand
#Guide: Type the root entity if it is not yet the root entity. For people: firstname + last name. Click None if not an entity.
tool = DisambiguationTool(disambiguation_map)
tool.startWorking()

A Jupyter Widget

In [None]:
#Step 3A Get unique entities per document
ents = db.getEntities()
unique_ents_corpus = []
unique_ents_doc = set()

currentDoc = 1

for rowid, ent, docid, ent_type in ents:
    if docid != currentDoc:
        unique_ents_corpus.append(unique_ents_doc)
        unique_ents_doc = set()
        currentDoc = docid
    if ent_type in ['PER','ORG']:
        unique_ents_doc.add(disambiguation_map[(ent,ent_type)])
unique_ents_corpus.append(unique_ents_doc)


In [5]:
import pickle

In [10]:
pickle_name = 'tool_progress.p'
pickle.dump(tool,open(pickle_name,'wb'))

PicklingError: Can't pickle <built-in function input>: it's not the same object as builtins.input

In [11]:
tool.__dict__

{'disambiguated': Text(value='Adami', description='Root:', placeholder='Type something'),
 'map': {('Raffarin', 'PER'): ('Raffarin', 'PER'),
  ('Camerons', 'PER'): ('Camerons', 'PER'),
  ('Kok', 'PER'): ('Kok', 'PER'),
  ('EQU', 'ORG'): ('EQU', 'ORG'),
  ('Binnema', 'PER'): ('Binnema', 'PER'),
  ('CDU', 'ORG'): ('CDU', 'ORG'),
  ('Wenceslas Plein', 'PER'): ('Wenceslas Plein', 'PER'),
  ('OCHA', 'PER'): ('OCHA', 'PER'),
  ('JACCO VAN', 'PER'): ('JACCO VAN', 'PER'),
  ('Victor', 'PER'): ('Victor', 'PER'),
  ('Volgens Bolkestein', 'PER'): ('Volgens Bolkestein', 'PER'),
  ('NRC', 'ORG'): ('NRC', 'ORG'),
  ('Derk Jan Eppink', 'PER'): ('Derk Jan Eppink', 'PER'),
  ('Berlage', 'PER'): ('Berlage', 'PER'),
  ('Trumps', 'PER'): ('Trumps', 'PER'),
  ('René van de Linden', 'PER'): ('Rene van de Linden', 'PER'),
  ('Charles Taylor', 'PER'): ('Charles Taylor', 'PER'),
  ('David Kusin', 'PER'): ('David Kusin', 'PER'),
  ('Bommeljé', 'PER'): ('Bommelje', 'PER'),
  ('KOK EN', 'ORG'): ('KOK EN', 'ORG'),