In [1]:
import pandas as pd
from mordecai import Geoparser
import re

In [2]:
# Load the relevance predictions for the unseen documents and preview them
relevance_csv = '../data/1_document_relevance.csv'
relevance = pd.read_csv(relevance_csv)
relevance.head()

Unnamed: 0,id,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,3921033,0.262942,0.019405,0.243536,0.282347
1,2351389,0.065476,0.009918,0.055558,0.075394
2,1686287,0.040167,0.010916,0.02925,0.051083
3,1310127,0.320405,0.035234,0.285171,0.355639
4,1890786,0.774876,0.019936,0.75494,0.794812


In [3]:
# Get a dataframe of seen and unseen documents and their titles and abstracts,
# then attach the relevance predictions to each document.

cols = ["id","content","title","relevant"]
seen_df = pd.read_csv('../data/0_labelled_documents.csv')[cols]
unseen_df = pd.read_csv('../data/0_unlabelled_documents.csv')[cols]

df = (pd.concat([seen_df,unseen_df])
      # Sort first so the seeded shuffle below does not depend on file order
      .sort_values('id')
      .sample(frac=1, random_state=1)
      .reset_index(drop=True)
      # Join on the document id explicitly instead of relying on whichever
      # column names the two frames happen to share. Documents without a
      # prediction row keep NaN in the prediction columns (left join).
      .merge(relevance, how="left", on="id")
)
print(df.shape)
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(600690, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,3921033,Recovery of phytosterol from roselle (Hibiscus...,Optimization of supercritical fluid extraction...,0.0,0.262942,0.019405,0.243536,0.282347
1,2351389,Science denialism retards evidenced-based poli...,On allegations of invasive species denialism,0.0,0.065476,0.009918,0.055558,0.075394
2,1686287,The QuikSCAT mission provided valuable daily i...,Wind characteristics in the North and Baltic S...,0.0,0.040167,0.010916,0.02925,0.051083
3,1310127,This paper is to promote a further understandi...,The interdecadal variations of the summer mons...,0.0,0.320405,0.035234,0.285171,0.355639
4,1890786,Instrumental climate records are too short to ...,Corals as climate archive,0.0,0.774876,0.019936,0.75494,0.794812


In [4]:
# Keep only the documents that are potentially relevant (upper prediction
# bound of at least 0.5) or that were labelled relevant.
predicted_relevant = df["0 - relevance - upper_pred"].ge(0.5)
labelled_relevant = df["relevant"].eq(1)
df = df[predicted_relevant | labelled_relevant]
print(df.shape)
df.head()

(119292, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
4,1890786,Instrumental climate records are too short to ...,Corals as climate archive,0.0,0.774876,0.019936,0.75494,0.794812
16,715550,Environmentally induced change appears to be i...,Recruitment in a changing environment: the 200...,0.0,0.774296,0.040719,0.733576,0.815015
22,3827381,How human activities have altered hydrological...,Contrasting Influences of Human Activities on ...,0.0,0.693427,0.050493,0.642935,0.74392
35,680654,We tested for competition between pink salmon ...,Survival of Puget Sound chinook salmon (Oncorh...,0.0,0.386028,0.121225,0.264803,0.507253
37,214441,We conducted a night-time warming and drought ...,Effects of long-term experimental night-time w...,0.0,0.90565,0.008319,0.897332,0.913969


In [5]:
# Resume from an earlier run if its output exists: load the places that were
# already extracted and drop the documents they belong to, so that only
# unprocessed documents are geoparsed. Otherwise start with an empty frame.
try:
    processed_places = pd.read_csv('../data/places.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No previous output (or an empty file): nothing has been processed yet
    processed_places = pd.DataFrame()
else:
    # Kept outside the `try` so that a malformed file (e.g. no 'doc_id' column)
    # raises instead of silently discarding the data that was just loaded
    df = df[~df['id'].isin(processed_places['doc_id'])]

df.shape

(46935, 8)

In [6]:
# Instantiate the geoparser and try it on a sample sentence
geo = Geoparser()
sample_sentence = "I took the tube from Oxford Circus to London Bridge, via Bank"
geo.geoparse(sample_sentence)

Models path: /home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/models/


[{'word': 'Oxford Circus',
  'spans': [{'start': 21, 'end': 34}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.51517',
   'lon': '-0.14181',
   'country_code3': 'GBR',
   'geonameid': '2640727',
   'place_name': 'Oxford Circus Underground Station',
   'feature_class': 'S',
   'feature_code': 'MTRO'}},
 {'word': 'London Bridge',
  'spans': [{'start': 38, 'end': 51}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.50821',
   'lon': '-0.08763',
   'country_code3': 'GBR',
   'geonameid': '6619889',
   'place_name': 'London Bridge',
   'feature_class': 'S',
   'feature_code': 'BDG'}}]

In [7]:
%%capture 
places = []
geos = []  # never filled in this cell; kept so the name stays defined

# Boilerplate that marks the start of a publisher copyright notice. The markers
# are applied in this order, each one truncating the text at its first match.
COPYRIGHT_MARKERS = [
    re.compile(re.escape("Copyright (C)")),
    # "[Cc]", not "[C-c]": the latter is a character *range* that also matches
    # D-Z, a-b and several punctuation characters.
    re.compile(r"\([Cc]\) [1-2][0-9]{3} Elsevier"),
    re.compile(re.escape("Published by Elsevier")),
    re.compile(re.escape("Copyright. (C)")),
    re.compile(r"\. \(C\) [1-2][0-9]{3} "),
    re.compile(r"\. \(C\) Copyright"),
]


def _strip_copyright(text):
    """Return `text` cut off before the first match of each copyright marker."""
    for marker in COPYRIGHT_MARKERS:
        text = marker.split(text, maxsplit=1)[0]
    return text


def _continent_code(alpha3):
    """Best-effort lookup of a continent code from a three-letter country code.

    Returns None when the code cannot be converted.
    """
    # NOTE(review): country_alpha3_to_country_alpha2 and
    # country_alpha2_to_continent_code are not imported in any visible cell
    # (presumably they come from pycountry_convert - TODO confirm and import).
    # Until they are, the NameError is swallowed here and every continent
    # stays None.
    try:
        alpha2 = country_alpha3_to_country_alpha2(alpha3)
        return country_alpha2_to_continent_code(alpha2)
    except Exception:  # not a bare except: Ctrl-C must still stop the loop
        return None


continents = []
place_counts = []

# Go through the rows of the dataframe
for _, row in df.iterrows():

    # Text to geoparse: title and abstract joined, minus the copyright notice.
    # Missing (NaN) fields are skipped instead of raising a TypeError.
    text = " ".join(
        part for part in (row["title"], row["content"]) if isinstance(part, str)
    )
    text = _strip_copyright(text)

    continent = None
    n_places = 0

    # geoparse (skipped when there is no text at all)
    for place in (geo.geoparse(text) if text.strip() else []):
        # Candidate country codes, weakest first: the predicted country, then
        # the country of the resolved 'geo' entry when there is one. The last
        # code that converts successfully decides the document's continent.
        country_codes = [place.get("country_predicted")]
        geo_info = place.pop("geo", None)
        if geo_info is not None:
            country_codes.append(geo_info.get("country_code3"))
            # Flatten the nested 'geo' fields onto the place record itself
            place.update(geo_info)

        for code in country_codes:
            resolved = _continent_code(code)
            if resolved is not None:
                continent = resolved

        place["doc_id"] = row["id"]
        places.append(place)
        n_places += 1

    continents.append(continent)
    place_counts.append(n_places)

# Attach the per-document results in one step. `assign` returns a new frame,
# which avoids row-by-row writes into a filtered frame (chained-assignment
# warnings).
df = df.assign(continent=continents, places=place_counts)

In [8]:
# Merge the places found in this run with those from earlier runs and save
# the result, so the next run can skip the documents already processed.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
new_places = pd.DataFrame(places)
combined_place_df = pd.concat([processed_places, new_places])
print(combined_place_df.shape)
combined_place_df.to_csv("../data/places.csv", index=False)
combined_place_df.tail()

(539267, 13)


Unnamed: 0,word,spans,country_predicted,country_conf,doc_id,admin1,lat,lon,country_code3,geonameid,place_name,feature_class,feature_code
135419,Daphnia,"[{'start': 341, 'end': 348}]",USA,0.904877,3630234,Wyoming,44.95516,-109.47165,USA,5822652.0,Daphnia Lake,H,LK
135420,Vtg1,"[{'start': 881, 'end': 885}]",VNM,0.92213,3630234,Bà Rịa-Vũng Tàu,10.34599,107.08426,VNM,1562414.0,Vũng Tàu,P,PPL
135421,South America,"[{'start': 898, 'end': 911}]",,0.904877,3892528,,-14.60485,-57.65625,,6255150.0,South America,L,CONT
135422,Dolphin Gulls,"[{'start': 1140, 'end': 1153}]",CYM,0.248213,3892528,,,,,,,,
135423,Malvinas,"[{'start': 1186, 'end': 1194}]",ARG,0.283659,3892528,,,,,,,,
