In [1]:
import pandas as pd
from mordecai import Geoparser
import re

In [2]:
# Load the per-document relevance predictions (mean, std, lower and upper bounds)
relevance_csv = '../data/1_document_relevance.csv'
relevance = pd.read_csv(relevance_csv)
relevance.head()

Unnamed: 0,id,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,740010,0.009046,0.001371,0.007675,0.010417
1,3300415,0.055892,0.013545,0.042347,0.069436
2,3821128,0.269951,0.026715,0.243236,0.296665
3,711341,0.465122,0.068707,0.396415,0.533828
4,1474274,0.509145,0.117428,0.391717,0.626573


In [3]:
# Build one dataframe holding both labelled ("seen") and unlabelled ("unseen")
# documents, restricted to the columns the two files are read with.
cols = ["id", "content", "title", "relevant"]
seen_df = pd.read_csv('../data/0_labelled_documents.csv')[cols]
unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')[cols]

# Stack the two sets, then shuffle reproducibly: sorting by id first means the
# seeded shuffle does not depend on the order the files were concatenated in.
stacked_docs = pd.concat([seen_df, unseen_df])
shuffled_docs = stacked_docs.sort_values('id').sample(frac=1, random_state=1)
shuffled_docs = shuffled_docs.reset_index(drop=True)

# Left-join the predictions so documents without a prediction are kept
df = shuffled_docs.merge(relevance, how="left")
print(df.shape)
df.head()

(398739, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,2353767,Paraiba do Sul watershed is one of the most im...,"Biomarkers in an invasive fish species, Oreoch...",0.0,0.145651,0.027436,0.118216,0.173087
1,1371352,The incidences and reproduction of the ectopar...,NON-REPRODUCTION OF VARROA-JACOBSONI IN APIS-M...,0.0,0.626608,0.0366,0.590008,0.663208
2,1403811,A German Shepherd Dog was treated initially fo...,DISSEMINATED ASPERGILLOSIS IN A DOG WITH DISKO...,0.0,0.437768,0.053597,0.384171,0.491364
3,1436591,Against the backdrop of warming of the Norther...,Marine Ecosystem Response to the Atlantic Mult...,0.0,0.826912,0.033873,0.793039,0.860785
4,1611849,"A review of the analytical, computational, and...",Improved performance of latent heat energy sto...,0.0,0.115313,0.018673,0.09664,0.133986


In [4]:
# Keep only documents that are potentially relevant (upper prediction bound of
# at least 0.5) or that have been labelled relevant
predicted_maybe_relevant = df["0 - relevance - upper_pred"].ge(0.5)
labelled_relevant = df["relevant"].eq(1)
df = df.loc[predicted_maybe_relevant | labelled_relevant]
print(df.shape)
df.head()

(94943, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
1,1371352,The incidences and reproduction of the ectopar...,NON-REPRODUCTION OF VARROA-JACOBSONI IN APIS-M...,0.0,0.626608,0.0366,0.590008,0.663208
3,1436591,Against the backdrop of warming of the Norther...,Marine Ecosystem Response to the Atlantic Mult...,0.0,0.826912,0.033873,0.793039,0.860785
6,746986,Individuals often considerably differ in the t...,Migration phenology and breeding success are p...,0.0,0.616048,0.064163,0.551885,0.680211
11,701743,We quantified intergenerational above- and bel...,Intergenerational above- and belowground respo...,0.0,0.639691,0.049413,0.590278,0.689105
15,606826,The linkage between climate change and increas...,Amplified subtropical stationary waves in bore...,0.0,0.778671,0.036504,0.742167,0.815175


In [5]:
# Resume from a previous run when its output exists, otherwise start from scratch.
# Only "no usable cache file" errors fall back to an empty frame: any other failure
# (e.g. a places.csv without a doc_id column) must surface, because an empty
# processed_places would make the final cell overwrite places.csv with only the
# newly parsed rows.
try:
    processed_places = pd.read_csv('../data/places.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_places = pd.DataFrame()
else:
    # Skip documents that already have geoparsed places on disk
    df = df[~df['id'].isin(processed_places['doc_id'])]

df.shape

(14035, 8)

In [6]:
# Create the geoparser and try it out on a sentence containing place names
geo = Geoparser()
sample_sentence = "I took the tube from Oxford Circus to London Bridge, via Bank"
geo.geoparse(sample_sentence)

Models path: /home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/models/


[{'word': 'Oxford Circus',
  'spans': [{'start': 21, 'end': 34}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.51517',
   'lon': '-0.14181',
   'country_code3': 'GBR',
   'geonameid': '2640727',
   'place_name': 'Oxford Circus Underground Station',
   'feature_class': 'S',
   'feature_code': 'MTRO'}},
 {'word': 'London Bridge',
  'spans': [{'start': 38, 'end': 51}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.50821',
   'lon': '-0.08763',
   'country_code3': 'GBR',
   'geonameid': '6619889',
   'place_name': 'London Bridge',
   'feature_class': 'S',
   'feature_code': 'BDG'}}]

In [7]:
%%capture 
places = []
geos = []

# NOTE(review): country_alpha3_to_country_alpha2 and country_alpha2_to_continent_code
# are not imported in any cell visible here (they look like pycountry_convert
# helpers - confirm and add the import to the imports cell). While they are
# undefined, the resulting NameError is swallowed in _continent_code and every
# document's continent stays None.


def _strip_copyright(text):
    """Return `text` truncated at the first copyright / publisher notice.

    The patterns are applied in sequence and each keeps only what precedes its
    first match. The regexes are raw strings, and the Elsevier pattern uses
    [Cc]: the range [C-c] also matches D-Z, a, b and several punctuation
    characters.
    """
    text = text.split("Copyright (C)")[0]
    text = re.split(r"\([Cc]\) [1-2][0-9]{3} Elsevier", text)[0]
    text = text.split("Published by Elsevier")[0]
    text = text.split("Copyright. (C)")[0]
    text = re.split(r"\. \(C\) [1-2][0-9]{3} ", text)[0]
    text = re.split(r"\. \(C\) Copyright", text)[0]
    return text


def _continent_code(alpha3):
    """Return the continent code for an alpha-3 country code, or None if it cannot be resolved."""
    try:
        alpha2 = country_alpha3_to_country_alpha2(alpha3)
        return country_alpha2_to_continent_code(alpha2)
    except Exception:
        # Best effort: a missing or unknown code leaves the continent undetermined.
        # Exception (not a bare except) so Ctrl-C can still stop the loop.
        return None


# Go through the rows of the dataframe
for i, row in df.iterrows():

    # Join title and abstract, skipping missing (NaN) parts instead of failing
    # on str + float, then get rid of copyright stuff
    text_parts = [row['title'], row['content']]
    t = " ".join(str(part) for part in text_parts if pd.notna(part))
    t = _strip_copyright(t)

    # geoparse (nothing to parse when no text is left)
    gp = geo.geoparse(t) if t.strip() else []

    rplaces = []
    continent = None
    for p in gp:
        # The last place whose country resolves decides the document's continent;
        # the gazetteer country, when present, takes precedence over the prediction.
        continent = _continent_code(p.get("country_predicted")) or continent

        # Flatten the nested gazetteer entry into the place record itself
        geo_entry = p.pop("geo", None)
        if geo_entry is not None:
            continent = _continent_code(geo_entry.get("country_code3")) or continent
            p.update(geo_entry)

        p["doc_id"] = row["id"]

        rplaces.append(p)
        places.append(p)
    df.loc[i,"continent"] = continent
    df.loc[i,"places"] = len(rplaces)

In [8]:
# Merge the places found in this run with those from earlier runs and save them.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
new_places = pd.DataFrame.from_dict(places)
combined_place_df = pd.concat([processed_places, new_places])
print(combined_place_df.shape)
combined_place_df.to_csv("../data/places.csv", index=False)
combined_place_df.tail()

(403843, 13)


Unnamed: 0,word,spans,country_predicted,country_conf,doc_id,admin1,lat,lon,country_code3,geonameid,place_name,feature_class,feature_code
403838,Brac,"[{'start': 928, 'end': 932}]",HRV,0.883954,3435262,Splitsko-Dalmatinska,43.32,16.63722,HRV,3203594.0,Otok Brač,T,ISL
403839,Balkanostenasellus,"[{'start': 1714, 'end': 1732}]",HRV,0.239805,3435262,,,,,,,,
403840,Croatia,"[{'start': 2055, 'end': 2062}]",HRV,0.948191,3435262,,45.16667,15.5,HRV,3202326.0,Republic of Croatia,A,PCLI
403841,Quebec,"[{'start': 514, 'end': 520}]",CAN,0.906452,3806786,Quebec,52.00017,-71.99907,CAN,6115047.0,Québec,A,ADM1
403842,Canada,"[{'start': 522, 'end': 528}]",CAN,0.951695,3806786,,60.10867,-113.64258,CAN,6251999.0,Canada,A,PCLI
