In [1]:
import requests
import pandas as pd
import numpy as np
import jsonlines
import time
from tqdm import tqdm
from collections import defaultdict

# Load Publication Data from jsonlines file

In [37]:
publications = []
with jsonlines.open("./data/openalex_publications.jsonl") as f_in:
    for pub in f_in:
        publications.append(pub)


In [38]:
num_docs = len(publications)
num_docs

12706

# Extract Title and Abstract Text from publication records

In [39]:
texts = []

for pub in publications:
    title = pub['display_name']
    abstract_text = None
    
    # we have to reconstitute the abstract text based on tokens and positions
    if pub.get('abstract_inverted_index'):
        position_tokens = {}
        for token in pub['abstract_inverted_index']:
            for position in pub['abstract_inverted_index'][token]:
                position_tokens[position] = token
        abstract_text_elems = []
        for i in range(min(position_tokens.keys()), max(position_tokens.keys())+1):
            if i in position_tokens:
                abstract_text_elems.append(position_tokens[i])
            else:
                # sometimes tokens are omitted, just continue
                continue
        abstract_text = " ".join(abstract_text_elems)
        
    # Not all publications have abstracts, just use title in this case
    if abstract_text:
        texts.append("{} {}".format(title, abstract_text))
    else:
        texts.append(title)

# Geoparse texts
Use the Edinburgh Geoparser to find locations mentioned in the title & abstract.

See [https://www.ltg.ed.ac.uk/software/geoparser/](https://www.ltg.ed.ac.uk/software/geoparser/)

You must download the package and place in the working directory in a folder called geoparser-1.2 (or update the code in the cell below to locate the binary)

In [40]:
# use a dictionary to store results so we can easily resume in case of error or restart
geo_records = defaultdict(list)

Warning - This code will take about 5-6 hours to run over the sample dataset!

In [48]:
import subprocess
from lxml import etree

for doc_id in tqdm(range(0, num_docs)):
    if doc_id in geo_records:
        # already processed, continue
        continue

    text = texts[doc_id]

    r = subprocess.run(['./geoparser-1.2/scripts/run','-t', 'plain', '-g', 'geonames'], capture_output=True, input=text.encode('utf-8'))
    xml_data = r.stdout

    if xml_data:
        tree = etree.fromstring(xml_data)    
        word_map = {}
        word_order = []
        for w_elem in tree.xpath("//text/p/s/w"):
            wid = w_elem.get('id')
            word = w_elem.text
            word_map[wid] = word
            word_order.append(wid)     

        for ent_elem in tree.xpath(".//ents[@source='ner-rb']/ent[@type='location']"):
            gazref = ent_elem.get('gazref')
            in_country = ent_elem.get('in-country')
            population = ent_elem.get('pop-size')
            parts = ent_elem.xpath("./parts/part/text()")
            start = ent_elem.xpath("./parts/part/@sw")[0]
            end = ent_elem.xpath("./parts/part/@ew")[0]
            s_i = word_order.index(start)
            e_i = word_order.index(end)
            surrounding_words = word_order[s_i-10:s_i+10]
            frag = " ".join([word_map[wid] for wid in surrounding_words])

            geo_records[doc_id].append({
                'title': publications[doc_id]['display_name'],
                'gazref': gazref,
                'population': population,
                'in_country': in_country,
                'match': " ".join(parts),
                'fragment': frag,
            })
        
    if doc_id not in geo_records:
        geo_records[doc_id] = []

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12706/12706 [4:58:50<00:00,  1.41s/it]


In [57]:
# write to file using csv library, faster than converting into dataframe then writing
import csv

with open("./data/geoparsing.csv", "w") as f_out:
    fieldnames = ['doc_id', 'title', 'gazref', 'population', 'in_country', 'match', 'fragment']
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()
    
    for doc_id in geo_records:
        for record in geo_records[doc_id]:
            record['doc_id'] = doc_id
            writer.writerow(record)

In [58]:
pd.read_csv("./data/geoparsing.csv").head(10)

Unnamed: 0,doc_id,title,gazref,population,in_country,match,fragment
0,0,A fractional calculus based model for the simu...,geonames:3374766,418224.0,CV,Cape Verde,statistics from the 2009 outbreak of the disea...
1,1,"Autochthonous Dengue Fever, Tokyo, Japan, 2014",geonames:1850147,8336599.0,JP,Tokyo,
2,1,"Autochthonous Dengue Fever, Tokyo, Japan, 2014",geonames:1861060,127417200.0,JP,Japan,
3,1,"Autochthonous Dengue Fever, Tokyo, Japan, 2014",geonames:1861060,127417200.0,JP,Japan,years with no confirmed autochthonous cases of...
4,1,"Autochthonous Dengue Fever, Tokyo, Japan, 2014",geonames:1861060,127417200.0,JP,Japan,the strain from the first patient ( 2014 ) in ...
5,3,A new fractional modelling and control strateg...,geonames:3374766,,CV,Cape Verde Islands,are proved . According to a real outbreak on t...
6,5,"Dengue Fever: Causes, Complications, and Vacci...",geonames:3996063,106202900.0,MX,Mexico,'s chimeric live-attenuated dengue vaccine can...
7,5,"Dengue Fever: Causes, Complications, and Vacci...",geonames:3469034,186112800.0,BR,Brazil,live-attenuated dengue vaccine candidate has b...
8,5,"Dengue Fever: Causes, Complications, and Vacci...",geonames:1694008,87857470.0,PH,Philippines,"candidate has been approved in Mexico , Brazil..."
9,8,Dengue fever in China,geonames:1814991,1306314000.0,CN,China,Dengue fever in China


For the best results, curate the exported file and remove false positives. The fragment column will give you a clue as to whether a correct match was made.