In [1]:
import re
import warnings

import pandas as pd
from mordecai import Geoparser

In [2]:
# Load the per-document relevance predictions for the unseen documents
relevance_path = '../data/1_document_relevance.csv'
relevance = pd.read_csv(relevance_path)
relevance.head()

Unnamed: 0,id,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,1783633,0.034508,0.006457,0.028052,0.040965
1,1455384,0.37696,0.036163,0.340796,0.413123
2,1340101,0.524275,0.063795,0.460479,0.58807
3,1461135,0.231735,0.028298,0.203437,0.260033
4,1344041,0.272186,0.051346,0.22084,0.323532


In [3]:
# Build one dataframe holding every document (seen and unseen) with its title
# and abstract, then attach the relevance predictions to it.

cols = ["id", "content", "title", "relevant"]
seen_df, unseen_df = (
    pd.read_csv(csv_path)[cols]
    for csv_path in (
        '../data/0_labelled_documents.csv',
        '../data/0_unlabelled_documents.csv',
    )
)

# Sort by id first, then shuffle with a fixed seed.
shuffled_docs = (
    pd.concat([seen_df, unseen_df])
    .sort_values('id')
    .sample(frac=1, random_state=1)
)

# Left merge: every document is kept, whether or not it has a prediction row.
df = shuffled_docs.reset_index(drop=True).merge(relevance, how="left")
print(df.shape)
df.head()

(378365, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
0,1783633,Physicochemical properties and chemism of atmo...,Chemism of Atmospheric Precipitation as a Cons...,0.0,0.034508,0.006457,0.028052,0.040965
1,1455384,Wetlands occur where biotic and abiotic condit...,High altitude montane wetland vegetation class...,0.0,0.37696,0.036163,0.340796,0.413123
2,1340101,The atmospheric forcing on the Barents Sea ice...,Atmospheric forcing on the Barents Sea winter ...,0.0,0.524275,0.063795,0.460479,0.58807
3,1461135,An important prerequisite to better understand...,River-aquifer exchange fluxes under monsoonal ...,0.0,0.231735,0.028298,0.203437,0.260033
4,1344041,"Pacific coast, until recent work provided data...",Fishing in Peru between 10000 and 3750 BP,0.0,0.272186,0.051346,0.22084,0.323532


In [4]:
# Keep only the documents that are potentially relevant (upper prediction
# bound above 0.5) or that were labelled as relevant
upper_bound_over_half = df["0 - relevance - upper_pred"].gt(0.5)
labelled_relevant = df["relevant"].eq(1)

df = df[upper_bound_over_half | labelled_relevant]
print(df.shape)
df.head()

(90194, 8)


Unnamed: 0,id,content,title,relevant,0 - relevance - mean_prediction,0 - relevance - std_prediction,0 - relevance - lower_pred,0 - relevance - upper_pred
2,1340101,The atmospheric forcing on the Barents Sea ice...,Atmospheric forcing on the Barents Sea winter ...,0.0,0.524275,0.063795,0.460479,0.58807
12,222401,Sea-level rise and frequent intense hurricanes...,Purple Pitcher Plant (Sarracenia rosea) Diebac...,0.0,0.665822,0.062294,0.603528,0.728116
13,689500,This study analyses long-term water quality da...,Long-term changes in hydrological pathways in ...,0.0,0.829573,0.018043,0.81153,0.847616
21,3307272,Rationale: There is significant evidence of in...,Increase in Pediatric Respiratory Visits Assoc...,0.0,0.650128,0.033529,0.616599,0.683657
28,508734,Range limits of many plant species are expecte...,"LAND-USE PATTERN, FOREST MIGRATION, AND GLOBAL...",0.0,0.46749,0.047726,0.419764,0.515216


In [5]:
# Resume from a previous run if one exists: load the places that were already
# extracted and drop their documents from the work list. Otherwise start with
# an empty dataframe.
try:
    processed_places = pd.read_csv('../data/places.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty results file means "nothing processed yet".
    # Any other failure (malformed file, interrupt, ...) must surface: silently
    # resetting to an empty frame here would let the final cell overwrite
    # places.csv and discard the earlier results.
    processed_places = pd.DataFrame()
else:
    # A results file without a 'doc_id' column raises KeyError here on purpose.
    df = df[~df['id'].isin(processed_places['doc_id'])]

df.shape

(90189, 8)

In [6]:
# Smoke-test the geoparser on a short sentence containing place names
geo = Geoparser()
sample_sentence = "I took the tube from Oxford Circus to London Bridge, via Bank"
geo.geoparse(sample_sentence)

Models path: /home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/models/


[{'word': 'Oxford Circus',
  'spans': [{'start': 21, 'end': 34}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.51517',
   'lon': '-0.14181',
   'country_code3': 'GBR',
   'geonameid': '2640727',
   'place_name': 'Oxford Circus Underground Station',
   'feature_class': 'S',
   'feature_code': 'MTRO'}},
 {'word': 'London Bridge',
  'spans': [{'start': 38, 'end': 51}],
  'country_predicted': 'GBR',
  'country_conf': 0.96374094,
  'geo': {'admin1': 'England',
   'lat': '51.50821',
   'lon': '-0.08763',
   'country_code3': 'GBR',
   'geonameid': '6619889',
   'place_name': 'London Bridge',
   'feature_class': 'S',
   'feature_code': 'BDG'}}]

In [7]:
places = []
geos = []  # not used in this cell; kept in case another cell refers to it

# Publisher / copyright boilerplate. Each pattern is applied in order and the
# text is cut at its first match, so the boilerplate is not geoparsed.
# The second pattern uses [Cc]: the previous [C-c] was an ASCII *range*
# (C..Z, some punctuation, a..c), not "upper- or lower-case C".
_COPYRIGHT_MARKERS = tuple(
    re.compile(pattern)
    for pattern in (
        r"Copyright \(C\)",
        r"\([Cc]\) [1-2][0-9]{3} Elsevier",
        r"Published by Elsevier",
        r"Copyright\. \(C\)",
        r"\. \(C\) [1-2][0-9]{3} ",
        r"\. \(C\) Copyright",
    )
)


def _document_text(title, content):
    """Join title and abstract and cut off trailing copyright boilerplate.

    Parameters
    ----------
    title, content : str or missing value
        Non-string values (e.g. NaN for a missing abstract) are skipped
        instead of raising TypeError on concatenation.

    Returns
    -------
    str
        "<title> <content>" truncated at the first copyright marker; an empty
        string when neither part is a string.
    """
    text = " ".join(part for part in (title, content) if isinstance(part, str))
    for marker in _COPYRIGHT_MARKERS:
        text = marker.split(text, maxsplit=1)[0]
    return text


def _continent_code(alpha3):
    """Best-effort continent code for an alpha-3 country code.

    Returns None when the code is missing or cannot be resolved.
    """
    if not alpha3:
        return None
    try:
        alpha2 = country_alpha3_to_country_alpha2(alpha3)
        return country_alpha2_to_continent_code(alpha2)
    except NameError:
        # NOTE(review): neither helper is imported anywhere in the visible
        # notebook (they look like pycountry_convert functions - confirm and
        # import them in the first cell). The previous bare `except: pass`
        # hid this, so `continent` was silently always None.
        warnings.warn(
            "country_alpha3_to_country_alpha2 / country_alpha2_to_continent_code "
            "are not defined; continent lookup is skipped. "
            "Import them in the first cell.",
            RuntimeWarning,
        )
        return None
    except Exception:
        # The helpers are not visible here, so the exact exception raised for
        # an unknown code cannot be confirmed; stay best-effort, but unlike a
        # bare `except:` this lets KeyboardInterrupt stop the loop.
        return None


# Work on an independent copy so the per-row writes below do not target a
# boolean-filtered slice of an earlier frame.
df = df.copy()

# Go through the rows of the dataframe. `places` is filled incrementally so
# that the results gathered so far survive an interrupted run.
for i, doc_id, title, content in df[["id", "title", "content"]].itertuples(name=None):

    # Get the text we want to geoparse: title + abstract, without copyright stuff
    t = _document_text(title, content)

    # geoparse (nothing to parse when the document has no usable text)
    gp = geo.geoparse(t) if t.strip() else []

    continent = None
    place_count = 0
    for p in gp:
        # The last place whose country resolves decides the document's
        # continent; the matched gazetteer entry, when present, takes
        # precedence over the predicted country.
        continent = _continent_code(p.get("country_predicted")) or continent
        if "geo" in p:
            geo_info = p["geo"]
            continent = _continent_code(geo_info.get("country_code3")) or continent
            # Flatten the nested gazetteer entry into the place record
            p.update(geo_info)
            del p["geo"]

        p["doc_id"] = doc_id

        places.append(p)
        place_count += 1
    df.loc[i, "continent"] = continent
    df.loc[i, "places"] = place_count

Traceback (most recent call last):
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/geoparse.py", line 726, in infer_country
    prediction = self.country_model.predict(i['matrix']).transpose()[0]
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 130, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1599, in predict
    tmp_batch_outputs = predict_function(iterator)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 814, in _call
    results = self._stateful_fn(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8

None
None
None


Traceback (most recent call last):
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/geoparse.py", line 726, in infer_country
    prediction = self.country_model.predict(i['matrix']).transpose()[0]
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 130, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1599, in predict
    tmp_batch_outputs = predict_function(iterator)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 814, in _call
    results = self._stateful_fn(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8

None
None
None
None


Traceback (most recent call last):
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/geoparse.py", line 726, in infer_country
    prediction = self.country_model.predict(i['matrix']).transpose()[0]
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 130, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1599, in predict
    tmp_batch_outputs = predict_function(iterator)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 814, in _call
    results = self._stateful_fn(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8

None


Traceback (most recent call last):
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/geoparse.py", line 726, in infer_country
    prediction = self.country_model.predict(i['matrix']).transpose()[0]
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 130, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1599, in predict
    tmp_batch_outputs = predict_function(iterator)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 814, in _call
    results = self._stateful_fn(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8

None
None


Traceback (most recent call last):
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/mordecai/geoparse.py", line 726, in infer_country
    prediction = self.country_model.predict(i['matrix']).transpose()[0]
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 130, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py", line 1599, in predict
    tmp_batch_outputs = predict_function(iterator)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 780, in __call__
    result = self._call(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 814, in _call
    results = self._stateful_fn(*args, **kwds)
  File "/home/galm/software/mordecai-env/lib/python3.8

None


KeyboardInterrupt: 

In [None]:
# Merge the places from earlier runs with the ones extracted in this session
# and write the combined table back to disk.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
new_places_df = pd.DataFrame(places)

# Skip empty frames (e.g. a first run without previous results) so they do not
# take part in the concatenation.
frames = [frame for frame in (processed_places, new_places_df) if not frame.empty]
combined_place_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(combined_place_df.shape)
combined_place_df.to_csv("../data/places.csv", index=False)
combined_place_df