### imports

In [5]:
import pandas as pd
import re
import csv
from IPython.display import display, clear_output
import nltk 
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string
from string import capwords
from sklearn.feature_extraction.text import CountVectorizer
import geograpy3
from cliff.api import Cliff
import json 


### loops through abstracts and matches checks if words are a country or a state name according to a csv file containing country and state names

In [6]:
def match_countries(text):
    countries_in_abstract = []
    for word in text.split():
        if word in countries_list and word not in countries_in_abstract:
            countries_in_abstract.append(word)
#         if word in abbr_country_dict and abbr_country_dict[word] not in countries_in_abstract:
#             countries_in_abstract.append(abbr_country_dict[word])
        if word in province_to_country and province_to_country[word] not in countries_in_abstract:
            countries_in_abstract.append(province_to_country[word])
        if word in demonym_dict and demonym_dict[word] not in countries_in_abstract:
            countries_in_abstract.append(demonym_dict[word])
    return countries_in_abstract


### uses geograpy3 API to attempt to assign countries to a text input

In [7]:
def geograpy_countries(input):
    places = geograpy3.get_place_context(text = input)
    return places.countries

### fills all the "countries" column for abstracts that were not assigned a column with "NA" values

In [8]:
def fill_na(array):
    if(len(array) < 1):
        return "NA"
    else:
        return array

In [9]:
df = pd.read_csv('../results/cleaned_tokenized.csv')

In [10]:
display(df.head())

Unnamed: 0,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm
0,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...
1,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...
2,pollution of the rhine and water supply dutch ...,"['pollution', 'of', 'the', 'rhine', 'and', 'wa...","['pollution', 'rhine', 'water', 'supply', 'dut...",pollution rhine water supply dutch water suppl...,activities almost approximately article ...
3,development and in place leaching of mountain ...,"['development', 'and', 'in', 'place', 'leachin...","['development', 'place', 'leaching', 'mountain...",development place leaching mountain city chalc...,aid analyzed annualized approximately b...
4,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...


### matches words in the abstract to countries by searching for country names, province names, and demonyms

In [12]:
abbr_country_dict = {}
with open("../data/iso_list_of_countries.csv") as countries:
    reader = csv.reader(countries, delimiter=',',quotechar= '"',quoting= csv.QUOTE_MINIMAL)
    for row in reader:
        name = row[0]
        if name == "name":
            continue
        abbr = row[1]
        countries_list.append(name.lower())
        abbr_country_dict[abbr] = name.lower()

In [14]:
demonym_dict = {"hawaiian": "hawaii", "alaskan": "alaska", "indian": "india", "chinese": "china", "american": "america", "russian": "russia", "indonesian": "indonesia", "pakistani": "pakistan", "nigerian": "nigeria"}


In [15]:
province_to_country = {}
with open('../data/provinces.json') as json_file: 
        provinces = json.load(json_file) 
for p_dict in provinces:
    if "english" in p_dict:
        province_name = p_dict['english'].lower()
    else:
        province_name = p_dict['name'].lower()
    province_to_country[province_name] = abbr_country_dict[p_dict['country']]
    
    

In [16]:

df['countries'] = df['text'].apply(lambda x: match_countries(x))
df['countries'] = df['countries'].apply(lambda x: fill_na(x))

In [17]:
display(df.head())


Unnamed: 0,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries
0,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...,[united states of america]
1,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...,
2,pollution of the rhine and water supply dutch ...,"['pollution', 'of', 'the', 'rhine', 'and', 'wa...","['pollution', 'rhine', 'water', 'supply', 'dut...",pollution rhine water supply dutch water suppl...,activities almost approximately article ...,[united kingdom of great britain and northern ...
3,development and in place leaching of mountain ...,"['development', 'and', 'in', 'place', 'leachin...","['development', 'place', 'leaching', 'mountain...",development place leaching mountain city chalc...,aid analyzed annualized approximately b...,[united states of america]
4,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...,[spain]


### check what  % of values in the "countries" column have been filled

In [18]:
country_list = df['countries'].tolist()
count = 0 
for country in country_list:
    if len(country) > 0:
        count +=1
print(count)
print(len(df.index))
print(count/len(df.index))
        

65452
65452
1.0


In [19]:
print(geograpy_countries("I like China and India"))

['British Indian Ocean Territory', 'China', 'United States', 'Russia']


In [20]:
display(df.head())

Unnamed: 0,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries
0,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...,[united states of america]
1,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...,
2,pollution of the rhine and water supply dutch ...,"['pollution', 'of', 'the', 'rhine', 'and', 'wa...","['pollution', 'rhine', 'water', 'supply', 'dut...",pollution rhine water supply dutch water suppl...,activities almost approximately article ...,[united kingdom of great britain and northern ...
3,development and in place leaching of mountain ...,"['development', 'and', 'in', 'place', 'leachin...","['development', 'place', 'leaching', 'mountain...",development place leaching mountain city chalc...,aid analyzed annualized approximately b...,[united states of america]
4,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...,[spain]


In [23]:
print(df.loc[4, 'text'])

the study of groundwater movement in boreholes performed at the site for the future beninar dam spain using radioactive tracers in the study of sites for the construction of new dams it is necessary to know among other characteristics of the soil the following onesi ground impermeability at site to be sure of the resistance and stability of the dam ii basin impermeability for the stored waters avoiding uncontrollated losses a valuable information on this subject may be obtained through the study of waters movement at the saturated zone the method here described is based on marking the water column with a radioactive isotope and has been investigated and employed by the authors in more than  boreholes the purpose of thos e experiences was in most of the cases to know the behaviour of the groundwater aquifers as well as their recharge conditions exploitability and more importants parameters in similar manner has been successfully used at boreholes performed for the geological research of

In [24]:
df.to_csv("../results/abstracts_cleaned_tokenized_geo.csv",index=False)