In [1]:
import pandas as pd
import re
import csv
from IPython.display import display, clear_output
import geopandas



In [2]:
def filter_aquifer_name_capitals(tokens, index):
    lowerIndex = higherIndex = index
    afterPreps = ["in", "on", "at", "of", "under", "over", "through", "for"]
    beginningArticles = ["the", "a", "an"]
    
    for i in range(max(0, index-4), index):
        word = tokens[i]
        if word[0].isupper():
            if (lowerIndex == index and index - i <= 2) or (lowerIndex == i+1):
                lowerIndex = i 
    
    for i in range(index, min(index+4, len(tokens))):
        word = tokens[i]
        if (i == index + 1) and (word.lower() == "the" or (word.lower() not in afterPreps)):
            break
        if word[0].isupper():
            if (higherIndex == index and i - index <= 2) or (higherIndex == i-1):
                higherIndex = i
                
    if len(tokens[lowerIndex:higherIndex + 1]) <= 1 or tokens[higherIndex].lower() in afterPreps or ((higherIndex-lowerIndex)<=1 and tokens[lowerIndex].lower() in beginningArticles):
        return None 
    
    return " ".join(tokens[lowerIndex:higherIndex + 1]) 
    

In [3]:
def match_aquifer_name_regex(text):
    aquifer_match_plural = re.findall('(?:[^ ]+ ){0,3}aquifers(?: [^ ]+){0,3}', str(text))
    basin_match_plural = re.findall('(?:[^ ]+ ){0,3}basins(?: [^ ]+){0,3}', str(text)) 
    aquifer_match = re.findall('(?:[^ ]+ ){0,3}aquifer(?: [^ ]+){0,3}', str(text))
    basin_match = re.findall('(?:[^ ]+ ){0,3}basin(?: [^ ]+){0,3}', str(text)) 
    all_matches = aquifer_match + basin_match + basin_match_plural  + basin_match_plural
    if len(all_matches) < 1:
        return float('NaN')
    return all_matches

In [4]:
def match_aquifer_names_capitals(text):
    tokens = text.split()
    keywords = ['aquifer', 'Aquifer', 'basin', 'Basin', 'aquifers', 'Aquifers', 'basins', 'Basins']
    indexes = [i for i in range(len(tokens)) if tokens[i] in keywords]
    all_matches = [filter_aquifer_name_capitals(tokens, index) for index in indexes]
    all_matches_cleaned = [x for x in all_matches if x is not None]
            
    if all_matches_cleaned == []:
        return float('NaN')
    return all_matches_cleaned
    
    

In [5]:
def match_usgs_list(arr):
    if(type(arr)) is float:
        return float('NaN')
    matches = []
    for element in arr:
        with open('../list_of_aquifer_names_usgs.txt', 'r') as aquifer_names:
            for line in aquifer_names:
                print(line.lower())
                print(element.lower())
                if(line.lower() in element.lower()):
                     matches.append(line)
    if(len(matches) < 1):
        return float('NaN')
    return matches

In [6]:
df = pd.read_csv('../results/abstracts_cleaned_tokenized_geo_new_oct_5.csv')
display(df.head())

Unnamed: 0,original_text,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries
0,FRACTURING AND SUBSIDENCE OF THE LAND SURFACE ...,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...,
1,An analysis of instabilities caused by salinit...,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...,
2,Pollution of the Rhine and water supply DutchA...,pollution of the rhine and water supply dutcha...,"['pollution', 'of', 'the', 'rhine', 'and', 'wa...","['pollution', 'rhine', 'water', 'supply', 'dut...",pollution rhine water supply dutcha water supp...,activities almost approximately article ...,['united kingdom of great britain and northern...
3,DEVELOPMENT AND IN PLACE LEACHING OF MOUNTAIN ...,development and in place leaching of mountain ...,"['development', 'and', 'in', 'place', 'leachin...","['development', 'place', 'leaching', 'mountain...",development place leaching mountain city chalc...,aid analyzed annualized approximately b...,['united states of america']
4,The study of groundwater movement in boreholes...,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...,['spain']


In [7]:
display(df.tail())

Unnamed: 0,original_text,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries
65429,Conceptual uncertainties in solubility calcula...,conceptual uncertainties in solubility calcula...,"['conceptual', 'uncertainties', 'in', 'solubil...","['conceptual', 'uncertainties', 'solubility', ...",conceptual uncertainties solubility calculatio...,account actinide also analysis answers ...,
65430,From political to environmental conflict in th...,from political to environmental conflict in th...,"['from', 'political', 'to', 'environmental', '...","['political', 'environmental', 'conflict', 'de...",political environmental conflict development t...,achieve activity along among aquifers ...,['israel']
65431,Fissure behavior in the chihuahuan desert and ...,fissure behavior in the chihuahuan desert and ...,"['fissure', 'behavior', 'in', 'the', 'chihuahu...","['fissure', 'behavior', 'chihuahuan', 'desert'...",fissure behavior chihuahuan desert depth estim...,aquifers authors base become behavior ...,['united states of america']
65432,Integrated study of the Judy Field Block a an...,integrated study of the judy field block a an...,"['integrated', 'study', 'of', 'the', 'judy', '...","['integrated', 'study', 'judy', 'field', 'bloc...",integrated study judy field block overpressure...,accumulation across ago allows almost ...,
65433,Land farm design and management in Bolivia A c...,land farm design and management in bolivia a c...,"['land', 'farm', 'design', 'and', 'management'...","['land', 'farm', 'design', 'management', 'boli...",land farm design management bolivia casestudya...,abstract acceptable accomplished additio...,"['argentina', 'norway']"


In [8]:
df['aquifer_mentions_capital_algorithm'] = df['original_text'].apply(lambda x: match_aquifer_names_capitals(x))
df['aquifer_mentions_regex_algorithm'] = df['text'].apply(lambda x: match_aquifer_name_regex(x))

In [9]:
display(df.head())

Unnamed: 0,original_text,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries,aquifer_mentions_capital_algorithm,aquifer_mentions_regex_algorithm
0,FRACTURING AND SUBSIDENCE OF THE LAND SURFACE ...,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...,,,[the lower artesian aquifer]
1,An analysis of instabilities caused by salinit...,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...,,,[transport through the aquifer is considerably...
2,Pollution of the Rhine and water supply DutchA...,pollution of the rhine and water supply dutcha...,"['pollution', 'of', 'the', 'rhine', 'and', 'wa...","['pollution', 'rhine', 'water', 'supply', 'dut...",pollution rhine water supply dutcha water supp...,activities almost approximately article ...,['united kingdom of great britain and northern...,,
3,DEVELOPMENT AND IN PLACE LEACHING OF MOUNTAIN ...,development and in place leaching of mountain ...,"['development', 'and', 'in', 'place', 'leachin...","['development', 'place', 'leaching', 'mountain...",development place leaching mountain city chalc...,aid analyzed annualized approximately b...,['united states of america'],,
4,The study of groundwater movement in boreholes...,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...,['spain'],,"[of the groundwater aquifer, the dam ii basin ..."


In [10]:
counter = 0
matched = 0; 
for row in df.itertuples():
    #nan is type float
  if type(row[8])is not float:
#         print(row[8])
#         counter +=1
        matched += 1 
print(matched)
print(len(df.index))
print(matched/len(df.index))
    

10538
65434
0.16104777332885045


In [11]:
print(df.loc[0, 'original_text'])

FRACTURING AND SUBSIDENCE OF THE LAND SURFACE CAUSED BY THE WITHDRAWAL OF GROUND WATER IN THE MILFORD AREA UTAHLand subsidence in the Milford area is demonstrated by three lines of evidence  collapse structures  well casings that protrude higher above the land surface than when first placed in the borehole and  lower elevations at National Ocean Survey formerly U S Coast and Geodetic Survey bench marks in  than in  This evidence shows that land subsidence in the Milford area is of two types each having a different origin One type has a nearsurface origin in the claysilt zone in the upper part of the principal groundwater reservoir and the other is in the lower artesian aquifers of the principal groundwater reservoir The amount of observed subsidence ranges from   ft   m at the bench mark at Read to about  ft   m at collapse structures in the Hay Springs area


In [12]:
aquiferDf_regex= df.dropna(subset=["aquifer_mentions_regex_algorithm"])
aquiferDf_regex["aquifer_mentions_regex_algorithm"].to_csv("../results/aquifer_names_regex_algorithm.csv")    
display(aquiferDf_regex.head(5))

  


Unnamed: 0,original_text,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries,aquifer_mentions_capital_algorithm,aquifer_mentions_regex_algorithm
0,FRACTURING AND SUBSIDENCE OF THE LAND SURFACE ...,fracturing and subsidence of the land surface ...,"['fracturing', 'and', 'subsidence', 'of', 'the...","['fracturing', 'subsidence', 'land', 'surface'...",fracturing subsidence land surface caused with...,amount aquifers area artesian bench bo...,,,[the lower artesian aquifer]
1,An analysis of instabilities caused by salinit...,an analysis of instabilities caused by salinit...,"['an', 'analysis', 'of', 'instabilities', 'cau...","['analysis', 'instabilities', 'caused', 'salin...",analysis instabilities caused salinity gradien...,always amplitude analyse analysis aquif...,,,[transport through the aquifer is considerably...
4,The study of groundwater movement in boreholes...,the study of groundwater movement in boreholes...,"['the', 'study', 'of', 'groundwater', 'movemen...","['study', 'groundwater', 'movement', 'borehole...",study groundwater movement boreholes performed...,among aquifers authors avoiding based ...,['spain'],,"[of the groundwater aquifer, the dam ii basin ..."
5,Groundwater study of a volcanic area near Band...,groundwater study of a volcanic area near band...,"['groundwater', 'study', 'of', 'a', 'volcanic'...","['groundwater', 'study', 'volcanic', 'area', '...",groundwater study volcanic area near bandung j...,abundant activities analysis aquifers a...,['indonesia'],,[concerning rainfall infiltration aquifer]
8,AVAILABILITY OF WATER FOR COAL CONVERSIONCoal ...,availability of water for coal conversioncoal ...,"['availability', 'of', 'water', 'for', 'coal',...","['availability', 'water', 'coal', 'conversionc...",availability water coal conversioncoal abundan...,abundant allocates alternative availabil...,['united states of america'],"[Missouri River Basin, Fort UnionPowder Basin]",[upper missouri river basin and tributaries ye...


In [13]:
aquiferDf_capitals = df.dropna(subset=["aquifer_mentions_capital_algorithm"])
aquiferDf_capitals["aquifer_mentions_capital_algorithm"].to_csv("../results/aquifer_names_capital_algorithm.csv")    
display(aquiferDf_capitals.head(5))

  


Unnamed: 0,original_text,text,tokenized_text,no_stopwords_tokenized_text,no_stopwords_text,tdm,countries,aquifer_mentions_capital_algorithm,aquifer_mentions_regex_algorithm
8,AVAILABILITY OF WATER FOR COAL CONVERSIONCoal ...,availability of water for coal conversioncoal ...,"['availability', 'of', 'water', 'for', 'coal',...","['availability', 'water', 'coal', 'conversionc...",availability water coal conversioncoal abundan...,abundant allocates alternative availabil...,['united states of america'],"[Missouri River Basin, Fort UnionPowder Basin]",[upper missouri river basin and tributaries ye...
25,COMPARATIVE ECONOMICS AND ENERGY REQUIREMENTS ...,comparative economics and energy requirements ...,"['comparative', 'economics', 'and', 'energy', ...","['comparative', 'economics', 'energy', 'requir...",comparative economics energy requirements vari...,alone ana annual aquifers assumed avai...,"['peru', 'united states of america']",[River groundwater basin],"[recharge of highquality aquifer, ana river gr..."
27,Comparative economics and enrgy requirements o...,comparative economics and enrgy requirements o...,"['comparative', 'economics', 'and', 'enrgy', '...","['comparative', 'economics', 'enrgy', 'require...",comparative economics enrgy requirements vario...,alone ana annual aquifers assumed avai...,"['peru', 'united states of america']",[River groundwater basin],"[recharge of highquality aquifer, ana river gr..."
49,A Seismic Ground Water Survey in New Hampshir...,a seismic ground water survey in new hampshir...,"['a', 'seismic', 'ground', 'water', 'survey', ...","['seismic', 'ground', 'water', 'survey', 'new'...",seismic ground water survey new hampshirethe d...,according along anomalies apparently at...,['united kingdom of great britain and northern...,[HampshireThe drainage basin of Barbadoes Pond],[new hampshirethe drainage basin of barbadoes ...
58,STRATIGRAPHIC AND HYDROLOGIC RELATIONSHIP OF T...,stratigraphic and hydrologic relationship of t...,"['stratigraphic', 'and', 'hydrologic', 'relati...","['stratigraphic', 'hydrologic', 'relationship'...",stratigraphic hydrologic relationship piney po...,acts alloway altitude analyses aquifer ...,"['jersey', 'united states of america']","[Piney Point aquifer, Piney Point aquifer in C...","[the piney point aquifer and the alloway, the ..."


In [14]:
# df['usgs_aquifer_matches'] = df['aquifer_mentions'].apply(lambda x: match_usgs_list(x)) 

In [15]:
# matchDf = df.dropna(subset=["usgs_aquifer_matches"])
# display(matchDf.head())

In [16]:
geo_df = geopandas.read_file("../data/WHYMAP_shapefiles/whymap_GW_aquifers_v1_poly.shp")

In [17]:
pd.set_option('display.max_columns', 999)
display(geo_df.head(30))

Unnamed: 0,HYGEO2,ICE,CONTINENT,geometry
0,33,88,99,"POLYGON ((-27.61889 81.47500, -27.84575 81.464..."
1,33,88,99,"POLYGON ((-39.03056 83.28638, -39.17389 83.286..."
2,33,88,99,"POLYGON ((-38.84028 83.10748, -38.89918 83.108..."
3,33,88,99,"POLYGON ((-41.25250 83.28526, -41.39028 83.292..."
4,33,88,99,"POLYGON ((-42.14888 83.24136, -42.38194 83.245..."
5,33,88,99,"POLYGON ((-41.15944 83.20665, -41.31555 83.207..."
6,33,88,99,"POLYGON ((-38.35888 83.13275, -38.46112 83.133..."
7,33,88,99,"POLYGON ((-39.87528 82.97861, -39.88861 82.979..."
8,33,88,99,"POLYGON ((-39.61389 82.99444, -39.70306 82.996..."
9,33,88,1,"POLYGON ((-76.23083 82.44470, -76.18417 82.453..."


In [18]:
for col in geo_df.columns: 
    print(col) 

HYGEO2
ICE
CONTINENT
geometry


In [19]:
print(geo_df.loc[2,'geometry'])

POLYGON ((-38.84027676799997 83.10748143300003, -38.89917674299994 83.10833672400008, -38.93666417999998 83.11137962200007, -38.94833094199998 83.11359548600007, -39.05777464899995 83.12637666600006, -39.10083445399994 83.12996980900004, -39.21250018099994 83.13304415500005, -39.49306219599998 83.14804379600008, -39.58306518599994 83.15611016600008, -39.82583369999998 83.19385843600008, -39.88416262699997 83.20972242900007, -39.88861544599996 83.21582632500008, -39.92471538299998 83.23275245600007, -40.01722503199994 83.25167074300003, -40.05193714599994 83.25610650000004, -40.08417082199998 83.25750524600005, -40.11194751099998 83.25665750900004, -40.15166920299998 83.25417102900008, -40.19971413399998 83.25277252400008, -40.25444472099997 83.25277252400008, -40.46944536099994 83.25525882500006, -40.53444362499994 83.25832476000005, -40.62694768299997 83.26611186700006, -40.66221357099994 83.27053678900006, -40.66833604799996 83.27443947500007, -40.67333732099996 83.28028083200007, -4

In [20]:
geo_df.plot(column= 'HYGEO2')

<matplotlib.axes._subplots.AxesSubplot at 0x11cdd9410>

In [21]:
df.drop("aquifer_mentions_regex_algorithm", axis=1, inplace=True)
df.to_csv("../dataframes/12_2_capital_algorithms.csv")