## This notebook downloads search-result pages from the journals' search engines, containing information about seismology papers published between 2010 and 2021

In [None]:
# import packages

import re

from bs4 import BeautifulSoup

import selenium
from selenium import webdriver

import time

import codecs

import os

#### Configure local paths and create directories

In [None]:
# Use the current working directory as the project root.
# os.getcwd() replaces the IPython-only `! pwd` shell escape, so this cell also
# runs as plain Python and on systems that have no `pwd` command.
root = os.getcwd()

print("using root directory:", root)

# Location of the chromedriver binary, relative to the project root.
CHROME_DRIVER_PATH = root + "/deps/chromedriver"

In [None]:
# Create the folders this notebook writes into:
#   pages/ receives the scraped result pages,
#   test/  receives the single-page dumps written by the "run test" cells
#          (those cells fail with FileNotFoundError if the folder is missing).
for folder in ("pages", "test"):
    os.makedirs(os.path.join(root, folder), exist_ok=True)

#### Initialize webdriver

In [None]:
import inspect

# Run Chrome in incognito mode.
option = webdriver.ChromeOptions()
option.add_argument("--incognito")

# Newer Selenium releases take the driver location through a Service object and
# no longer accept `executable_path`; older releases only know `executable_path`.
# Pick whichever form the installed version supports.
if "service" in inspect.signature(webdriver.Chrome.__init__).parameters:
    from selenium.webdriver.chrome.service import Service

    browser = webdriver.Chrome(
        service=Service(executable_path=CHROME_DRIVER_PATH),
        options=option,
    )
else:
    browser = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=option)

#### Define years/months/days of interest for collecting data 

In [None]:
# Publication years to query, as strings ready for the URL templates.
# 2021 is currently excluded; use range(2010, 2022) to include it.
years = [str(value) for value in range(2010, 2021)]

# Zero-padded month numbers "01" .. "12".
months = ["{:02d}".format(value) for value in range(1, 13)]

# Number of days in each month, January first.
# February is fixed at 28 (no leap-year handling in this table).
days_in_month = "31 28 31 30 31 30 31 31 30 31 30 31".split()

### AGU Journals: GRL, JGR: Solid Earth, G3

#### Define journals to scrape and url template for search engine

In [None]:
# Search URL template for the AGU journals. Results are restricted to papers
# whose abstract contains "seism" or "earthquake".
# Placeholders: {publication} (journal id), {month} and {year} (used for both
# the start and the end of the date range), {start_page} (result page number).
agu_search_template = "https://agupubs.onlinelibrary.wiley.com/action/doSearch?field1=Abstract&text1=seism+OR+earthquake&field2=AllField&text2=&field3=AllField&text3=&publication[]={publication}&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest&"

# [journal name (used in the output file names), publication id for {publication}]
journal_templates = [
    ["GRL","19448007"],
    ["JGRSolidEarth","21699356"],
    ["G3","15252027"],
]

# Be careful: the page number starts from 0 for these journals (last checked Aug. 2023)

#### Define scrape function adapted to journal and run a test

In [None]:
def fetch_page(browser, url):
    """Load an AGU search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``publication_title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "publication_title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for publication 15252027 (May 2015)
# and keep a copy of the HTML for manual inspection.
url = agu_search_template.format(
    publication="15252027",
    year="2015",
    month="05",
    start_page=0,
)
html, fingerprint = fetch_page(browser, url)

with codecs.open("test/test_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for journal_name, journal_id in journal_templates:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for month in months:
            print("scraping month:", month)
            for page in range(100):
                url = agu_search_template.format(
                    publication=journal_id,
                    year=year,
                    month=month,
                    start_page=page,
                )
                html, fingerprint = fetch_page(browser, url)

                if not html:
                    # nothing came back for this URL; try the next page
                    print("nothing to save for", url)
                    continue

                if not fingerprint or fingerprint == previous_fingerprint:
                    # no titles, or the same titles as the last saved page:
                    # move to the next month
                    print("done on page", page)
                    break

                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(
                    name=journal_name, year=year, month=month, page=page
                )
                with codecs.open("pages/" + filename, "w", "utf8") as outfile:
                    outfile.write(html)

                # pause between requests
                time.sleep(2)
            

### Geophysical Journal International 

#### Define url template for search engine

In [None]:
# URL templates for Geophysical Journal International (last checked Aug. 2023)

# Alternative template for the journal's own search page on academic.oup.com
# (not active; kept for reference):
#gji_template = "https://academic.oup.com/gji/search-results?sort=Date+%e2%80%93+Newest+First&f_TocHeadingTitle=Seismology&f_ArticleTypeDisplayName=Review+ArticleANDResearch+Article&fl_SiteID=5282&rg_ArticleDate={month}/01/{year}%20TO%20{month}/{last_day}/{year}&dateFilterType=range&noDateTypes=true&qb=%7b%22q%22%3a%22%22%7d&page={start_page}"

# Active template: search on ui.adsabs.harvard.edu for abstracts containing
# "seism" or "earthquake", restricted to bibstem GeoJI.
# Placeholders: {year} and {page}.
gji_template = "https://ui.adsabs.harvard.edu/search/q=%20abs%3A(seism%20OR%20earthquake)%20year%3A{year}%20bibstem%3AGeoJI&sort=date%20desc%2C%20bibcode%20desc&p_={page}"

#### Define scrape function adapted to journal and run a test

In [None]:
def fetch_gji_page(browser, url, wait=2):
    """Load a GJI search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.
        wait: Seconds to pause after loading and before reading the titles
            (presumably to let the page finish rendering). Defaults to 2.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``s-results-title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""
    time.sleep(wait)

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "s-results-title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 1 of the 2013 results and keep a copy of the HTML
# for manual inspection.
url = gji_template.format(
    year="2013",
    page=1,
)
html, fingerprint = fetch_gji_page(browser, url)

with codecs.open("test/test_gji_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)
    


#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(151):
        print("page:", page)
        url = gji_template.format(
            year=year,
            page=page,
        )
        html, fingerprint = fetch_gji_page(browser, url)

        if not html:
            # nothing came back for this URL; try the next page
            print("nothing to save for", url)
            continue

        if fingerprint == previous_fingerprint:
            # same titles as the last saved page: move to the next year
            print("done on page", page)
            break

        if not fingerprint:
            # the page lists no titles: move to the next year
            print("no articles found")
            break

        previous_fingerprint = fingerprint

        # results are per year, so the month slot of the file name is "00"
        filename = "GJI_{year}_{month}_{page}.html".format(
            year=year, month="00", page=page
        )
        with codecs.open("pages/" + filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)
                

### GEOPHYSICS

#### Define url template for search engine

In [None]:
# Search URL template for GEOPHYSICS on library.seg.org (last checked Aug. 2023).
# Results are restricted to abstracts matching "seism*" or "earthquake".
# Placeholders: {month} and {year} (used for both the start and the end of the
# date range) and {start_page} (result page number).

geophysics_template = "https://library.seg.org/action/doSearch?field1=Abstract&text1=seism*+OR+earthquake&publication=&publication%5B%5D=gpysa7&Ppub=&AfterMonth={month}&AfterYear={year}&BeforeMonth={month}&BeforeYear={year}&startPage={start_page}&sortBy=Earliest"

# Be careful: the page number starts from 0 for this journal

#### Define scrape function adapted to journal and run a test

In [None]:
def fetch_geophysics_page(browser, url):
    """Load a GEOPHYSICS search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``issue-item__title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "issue-item__title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the first result page for January 2013 and keep a copy of
# the HTML for manual inspection.
url = geophysics_template.format(
    year="2013",
    month="01",
    start_page=0,
)
html, fingerprint = fetch_geophysics_page(browser, url)

with codecs.open("test/test_geophysics_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for month in months:
        print("scraping month:", month)
        for page in range(100):
            url = geophysics_template.format(year=year, month=month, start_page=page)
            html, fingerprint = fetch_geophysics_page(browser, url)

            if not html:
                # nothing came back for this URL; try the next page
                print("nothing to save for", url)
                continue

            if not fingerprint or fingerprint == previous_fingerprint:
                # no titles, or the same titles as the last saved page:
                # move to the next month
                print("done on page", page)
                break

            previous_fingerprint = fingerprint

            filename = "GEOPHYSICS_{year}_{month}_{page}.html".format(
                year=year, month=month, page=page
            )
            with codecs.open("pages/" + filename, "w", "utf8") as outfile:
                outfile.write(html)

            # pause between requests
            time.sleep(2)
                

### GeoScienceWorld journals: BSSA and SRL

#### Define url template for search engine and journals

In [None]:
# Search URL template for pubs.geoscienceworld.org (last checked Aug. 2023).
# Placeholders: {page_number}, {journal_name}, {publication} (journal id), and
# the publication-date range {month}/{day}/{year} TO {month}/{last_day}/{year}.
gsw_template = "https://pubs.geoscienceworld.org/search-results?page={page_number}&f_JournalDisplayName={journal_name}&fl_ContentType=Journal+Article+OR+Journal+OR+Book+OR+Book+Chapter+OR+GeoRef+Record&fl_JournalID={publication}&rg_PublicationDate={month}%2f{day}%2f{year}+TO+{month}%2f{last_day}%2f{year}&restypeid=3&f_ArticleTypeDisplayName=Research+Article" 

# [URL-encoded journal display name for {journal_name}, journal id for {publication}]
gsw_journal_values = [
    ["Bulletin+of+the+Seismological+Society+of+America","66"],
    ["Seismological+Research+Letters","65"]
]

#### Define scrape function adapted to journal and run a test

In [None]:
def fetch_gsw_page(browser, url):
    """Load a GeoScienceWorld search-results page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``al-title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "al-title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 1 of the Seismological Research Letters results for
# March 2016 and keep a copy of the HTML for manual inspection.
url = gsw_template.format(
    page_number=1,
    journal_name="Seismological+Research+Letters",
    publication="65",
    year="2016",
    month="03",
    day="01",
    last_day="31",
)
print(url)
html, fingerprint = fetch_gsw_page(browser, url)

with codecs.open("test/test_gsw_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
import calendar

# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for journal_name, journal_id in gsw_journal_values:
    print("scraping journal:", journal_name)
    for year in years:
        print("scraping year:", year)
        for month in months:
            print("scraping month:", month)
            # Use the real length of the month so that 29 February is included
            # in leap years (a fixed "28" would leave that day out of the range).
            last_day = str(calendar.monthrange(int(year), int(month))[1])
            for page in range(1, 101):
                url = gsw_template.format(
                    journal_name=journal_name,
                    publication=journal_id,
                    year=year,
                    month=month,
                    day="01",
                    last_day=last_day,
                    page_number=page)
                html, fingerprint = fetch_gsw_page(browser, url)
                if len(html) == 0:
                    # nothing came back for this URL; try the next page
                    print("nothing to save for", url)
                    continue
                if fingerprint == previous_fingerprint:
                    # page already seen, move to the next month
                    print("done on page", page)
                    break
                if len(fingerprint) == 0:
                    # the page lists no titles, move to the next month
                    print("done on page", page)
                    break
                previous_fingerprint = fingerprint

                filename = "{name}_{year}_{month}_{page}.html".format(name=journal_name, year=year, month=month, page=page)
                with codecs.open("pages/"+filename, "w", "utf8") as outfile:
                    outfile.write(html)

                # long pause between requests for this site
                time.sleep(20)
                

### Nature geoscience

#### Define url template and scrape function adapted to journal and run a test

In [None]:
# Search URL template for Nature Geoscience (journal=ngeo) on nature.com,
# restricted to research and review articles with subject "seismology"
# (last checked Aug. 2023). Placeholders: {year} (used for both ends of the
# date range) and {page}.

ngs_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=ngeo&subject=seismology&order=relevance&page={page}" 


# scrape function

def fetch_ngs_page(browser, url):
    """Load a Nature Geoscience search-results page and fingerprint its titles.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``h3`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    # NOTE(review): this matches elements whose CSS class is "h3", not <h3>
    # tags -- confirm that is what the site uses for titles.
    elements = browser.find_elements("class name", "h3")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 1 of the 2015 results and keep a copy of the HTML
# for manual inspection.
url = ngs_template.format(
    year="2015",
    page=1,
)
html, fingerprint = fetch_ngs_page(browser, url)

with codecs.open("test/test_ngs_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1, 101):
        url = ngs_template.format(
            year=year,
            page=page,
        )
        html, fingerprint = fetch_ngs_page(browser, url)

        if not html:
            # nothing came back for this URL; try the next page
            print("nothing to save for", url)
            continue

        if not fingerprint or fingerprint == previous_fingerprint:
            # no titles, or the same titles as the last saved page:
            # move to the next year
            print("done on page", page)
            break

        previous_fingerprint = fingerprint

        # results are per year, so the month slot of the file name is "0"
        filename = "{name}_{year}_{month}_{page}.html".format(
            name="NatureGeoscience", year=year, month="0", page=page
        )
        with codecs.open("pages/" + filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


### Nature

#### Define url template and scrape function adapted to journal and run a test

In [None]:
# Search URL template for Nature (journal=nature) on nature.com, restricted to
# research and review articles with subject "seismology"
# (last checked Aug. 2023). Placeholders: {year} (used for both ends of the
# date range) and {page}.

nature_template = "https://www.nature.com/search?article_type=research%2Creviews&date_range={year}-{year}&journal=nature&subject=seismology&order=relevance&page={page}" 


# scrape function

def fetch_nature_page(browser, url):
    """Load a Nature search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``h3`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    # NOTE(review): this matches elements whose CSS class is "h3", not <h3>
    # tags -- confirm that is what the site uses for titles.
    elements = browser.find_elements("class name", "h3")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 1 of the 2015 results and keep a copy of the HTML
# for manual inspection.
url = nature_template.format(
    year="2015",
    page=1,
)
html, fingerprint = fetch_nature_page(browser, url)

with codecs.open("test/test_nature_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1, 101):
        url = nature_template.format(
            year=year,
            page=page,
        )
        html, fingerprint = fetch_nature_page(browser, url)

        if not html:
            # nothing came back for this URL; try the next page
            print("nothing to save for", url)
            continue

        if not fingerprint or fingerprint == previous_fingerprint:
            # no titles, or the same titles as the last saved page:
            # move to the next year
            print("done on page", page)
            break

        previous_fingerprint = fingerprint

        # results are per year, so the month slot of the file name is "0"
        filename = "{name}_{year}_{month}_{page}.html".format(
            name="Nature", year=year, month="0", page=page
        )
        with codecs.open("pages/" + filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


### Solid Earth

#### Define url template and scrape function adapted to journal and run a test

In [None]:
# Search URL template for Solid Earth on editor.copernicus.org
# (last checked Aug. 2023). The abstract search terms are "earthquake" and
# "seism". Placeholders: {year} (used as both start and end year) and {page}.

se_template = "https://editor.copernicus.org/search.php?abstract=earthquake+seism&startYear={year}&endYear={year}&paperVersion=final&journal=431&page={page}"


# scrape function

def fetch_se_page(browser, url):
    """Load a Solid Earth search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``article-title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "article-title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 2 of the 2015 results and keep a copy of the HTML
# for manual inspection.
url = se_template.format(
    year="2015",
    page=2,
)
html, fingerprint = fetch_se_page(browser, url)

with codecs.open("test/test_se_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for year in years:
    print("scraping year:", year)
    for page in range(1, 101):
        url = se_template.format(
            year=year,
            page=page,
        )
        html, fingerprint = fetch_se_page(browser, url)

        if not html:
            # nothing came back for this URL; try the next page
            print("nothing to save for", url)
            continue

        if not fingerprint or fingerprint == previous_fingerprint:
            # no titles, or the same titles as the last saved page:
            # move to the next year
            print("done on page", page)
            break

        previous_fingerprint = fingerprint

        # results are per year, so the month slot of the file name is "0"
        filename = "{name}_{year}_{month}_{page}.html".format(
            name="SolidEarth", year=year, month="0", page=page
        )
        with codecs.open("pages/" + filename, "w", "utf8") as outfile:
            outfile.write(html)

        # pause between requests
        time.sleep(2)


### Science AAAS

#### Define url template 

In [None]:
# Search URL template that was used for our publication (classic.sciencemag.org).
# It searches title/abstract for "seism" or "earthquake" from Jan to Dec of
# {year}, with up to 125 hits per request.

science_template = "http://classic.sciencemag.org/search?journalcode=sci&volume=&firstpage=&submit=yes&doi=&submit=yes&fulltext=&andorexactfulltext=or&titleabstract=seism%2C+earthquake&andorexacttitleabs=or&title=&andorexacttitle=and&author1=&author2=&fmonth=Jan&fyear={year}&tmonth=Dec&tyear={year}&hits=125&sortspec=date&submit=yes&resourcetype=HWCIT&tocsectionid=Original+Research&submit=yes&submit.x=46&submit.y=8"

# The site appears to have changed since then. If the template above no longer
# works, try the following one instead (last checked Aug. 2023):

#science_template = "https://www.science.org/action/doSearch?field1=AllField&text1=seism+OR+earthquake&ConceptID%5B%5D=505154&ConceptID=&publication%5B%5D=science&publication=&Ppub=&AfterMonth=1&AfterYear={year}&BeforeMonth=12&BeforeYear={year}&startPage=0&adobe_mc=MCMID%3D50468266442497235571139804379786180598%7CMCORGID%3D242B6472541199F70A4C98A6%2540AdobeOrg%7CTS%3D1693394584&adobe_mc=MCMID%3D50468266442497235571139804379786180598%7CMCORGID%3D242B6472541199F70A4C98A6%2540AdobeOrg%7CTS%3D1693394738"



#### Define scrape function adapted to journal and run a test

In [None]:
def fetch_science_page(browser, url, wait=10):
    """Load a Science search-results page and fingerprint the titles on it.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.
        wait: Seconds to pause after loading and before reading the titles
            (presumably to let the page finish rendering). Defaults to 10.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``cit-first-element`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""
    time.sleep(wait)

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "cit-first-element")

    # The joined titles act as a fingerprint for this page.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch the 2018 results and keep a copy of the HTML for manual
# inspection.
url = science_template.format(
    year="2018",
)
html, fingerprint = fetch_science_page(browser, url)

with codecs.open("test/test_science_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# One request per year: the template has no page placeholder.
for year in years:
    print("scraping year:", year)
    url = science_template.format(year=year)
    html, fingerprint = fetch_science_page(browser, url)
    if len(html) == 0:
        # nothing came back for this URL; go on with the next year
        print("nothing to save for", url)
        continue
    if len(fingerprint) == 0:
        # A year without titles must not stop the remaining years: there is no
        # inner page loop here, so `break` would end the whole collection.
        print("no articles found")
        continue
    filename = "{name}_{year}_{month}_{page}.html".format(name="Science", year=year, month="0", page="1")
    with codecs.open("pages/"+filename, "w", "utf8") as outfile:
        outfile.write(html)

    # pause between requests
    time.sleep(2)

### Science Direct

#### Define url template and scrape function adapted to journal and run a test

In [None]:
# Search URL template (last checked Aug. 2023). It queries ui.adsabs.harvard.edu
# for abstracts containing "seism" or "earthquake".
# Placeholders: {journal} (bibstem), {year} and {page}.

science_direct_template = 'https://ui.adsabs.harvard.edu/search/q=bibstem%3A%22{journal}%22%20year%3A{year}%20%20abs%3A(seism%20OR%20earthquake)&sort=date%20desc%2C%20bibcode%20desc&p_={page}'
# the page number starts from 0

sd_journal_values = ["E%26PSL","Tectp","PEPI"] # journals of interest (URL-encoded bibstems; %26 is "&")


# scrape function

def fetch_sd_page(browser, url, wait=4):
    """Load a search-results page for the Science Direct journals and fingerprint its titles.

    Args:
        browser: Selenium WebDriver used to load the page.
        url: Address of the search-results page.
        wait: Seconds to pause after loading and before reading the titles
            (presumably to let the page finish rendering). Defaults to 4.

    Returns:
        A ``(html, fingerprint)`` tuple: the page source and the texts of all
        elements with class ``s-results-title`` joined by single spaces.
        ``("", "")`` is returned, after printing a warning, when the browser
        ends up on a different URL than the one requested.
    """
    browser.get(url)
    landed_url = browser.current_url
    if landed_url != url:
        print("unexpected page url.\n current: {} \n expected: {}".format(landed_url, url))
        return "", ""
    time.sleep(wait)

    # "class name" is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_elements() call works in Selenium 3 and 4, whereas
    # find_elements_by_class_name() was removed in Selenium 4.3.
    elements = browser.find_elements("class name", "s-results-title")

    # The joined titles act as a fingerprint for detecting repeated pages.
    fingerprint = " ".join(element.text for element in elements)

    return browser.page_source, fingerprint

# Smoke test: fetch page 0 of the 2011 results for the first journal in
# sd_journal_values and keep a copy of the HTML for manual inspection.
journal = sd_journal_values[0]
url = science_direct_template.format(
    journal=journal,
    year="2011",
    page=0,
)
html, fingerprint = fetch_sd_page(browser, url)

with codecs.open("test/test_sd_page.html", mode="w", encoding="utf8") as outfile:
    outfile.write(html)

#### Collect pages with publication info

In [None]:
# Fingerprint of the last page that was saved; a repeat means no new results.
previous_fingerprint = ""

for journal in sd_journal_values:
    print("scraping journal:", journal)
    for year in years:
        print("scraping year:", year)
        for page in range(101):
            print("page:", page)
            url = science_direct_template.format(
                journal=journal,
                year=year,
                page=page,
            )
            html, fingerprint = fetch_sd_page(browser, url)

            if not html:
                # nothing came back for this URL; try the next page
                print("nothing to save for", url)
                continue

            if not fingerprint or fingerprint == previous_fingerprint:
                # no titles, or the same titles as the last saved page:
                # move to the next year
                print("done on page", page)
                break

            previous_fingerprint = fingerprint

            # results are per year, so the month slot of the file name is "0"
            filename = "{name}_{year}_{month}_{page}.html".format(
                name=journal, year=year, month="0", page=page
            )
            with codecs.open("pages/" + filename, "w", "utf8") as outfile:
                outfile.write(html)

            # pause between requests
            time.sleep(2)