In [None]:
# get the links to relevant PDF documents for the issues previously selected
# relevant PDF means PDF with potential information about the editorial board
# the selection is based on a list of keywords

# imports
import pandas as pd
from bs4 import BeautifulSoup
import time
import numpy as np

# Notebook-wide pandas display settings, applied in one pass:
# short row previews, every column shown, wide output, centered headers,
# and three digits of precision.
display_options = {
    'display.max_rows': 10,
    'display.max_columns': None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# --- configuration -----------------------------------------------------------
publishers = ["Wiley", "Springer", "Taylor", "Elsevier"]
# publishers = ["Wiley"]  # uncomment for a quick single-publisher run

start_year = 2017
end_year = 2021

# Keywords searched (case-insensitively) in the title cell of each article row.
key_words_wiley = ["issue information", "editorial board", "editorial-board", "publication information"]
key_words_elsevier = ["issue information", "editorial board", "editorial-board", "editorial advisory board", "editors", "publication information"]
key_words_springer = key_words_wiley
key_words_taylor = key_words_wiley

key_words_publishers = {"Wiley": key_words_wiley, "Elsevier": key_words_elsevier, "Taylor": key_words_taylor, "Springer": key_words_springer}

# earlier keyword lists, kept for reference:
# key_words = ["issue information", "editors", "editor", "editorial", "éditorial"]
# key_words = ["issue information", "editorial board"]

pdf_columns = ["Journal_Id", "Year", "Authors", "Title", "DOI", "Link1", "Link2", "Keyword"]
year_columns = [str(year) for year in range(start_year, end_year + 1)]


def _extract_pdf_records(soup, journal_id, year, key_words):
    """Collect one record per table row whose title cell contains a keyword.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed content of one journal/year listing file.
    journal_id, year
        Copied verbatim into every returned record.
    key_words : list of str
        Lower-case keywords; the first one found in the title cell is stored
        under "Keyword", so each row yields at most one record.

    Returns
    -------
    list of dict
        Records keyed by `pdf_columns`; empty when the page has no <tbody>
        or no row matches.
    """
    body = soup.find("tbody")
    if body is None:
        return []

    records = []
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        # Cells 0 (authors), 1 (title + DOI) and 4 (links) are read below;
        # skip rows that are too short instead of aborting the whole run.
        if len(cells) < 5:
            continue

        title_cell_text = cells[1].text
        lowered = title_cell_text.lower()
        key_word = next((kw for kw in key_words if kw in lowered), None)
        if key_word is None:
            continue

        # The title cell has the form "<title>DOI: <doi>"; tolerate a missing DOI.
        parts = title_cell_text.split("DOI: ")
        anchors = cells[4].find_all("a")
        records.append({
            "Journal_Id": journal_id,
            "Year": year,
            "Authors": cells[0].text,
            "Title": parts[0].strip(),
            "DOI": parts[1] if len(parts) > 1 else "",
            # The first and third anchors of the links cell are kept;
            # None is stored when the cell has fewer anchors than that.
            "Link1": anchors[0].get("href") if len(anchors) > 0 else None,
            "Link2": anchors[2].get("href") if len(anchors) > 2 else None,
            "Keyword": key_word,
        })
    return records


# --- main loop: one pass per publisher ---------------------------------------
for publisher in publishers:
    key_words = key_words_publishers[publisher]
    print("Parsing publisher", publisher, "...")
    df = pd.read_csv("Journals" + publisher + ".csv")
    df_null = df.isnull()
    journal_ids = df["Journal_Id"].to_numpy()

    # Rows are accumulated in plain lists and turned into DataFrames once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    pdf_records = []
    got_link_records = []

    for i, journal_id in enumerate(journal_ids):  # for each journal
        got_links = {"Journal_Id": journal_id}
        for year in range(start_year, end_year + 1):  # for each year
            found = []

            # only parse when the journal has an issue listing for this year
            if not df_null.loc[i, "links_" + str(year)]:
                path = publisher.lower() + "/journal" + str(journal_id) + "/year" + str(year) + ".txt"
                # `with` guarantees the handle is closed even if parsing fails
                with open(path, "r") as content:
                    soup = BeautifulSoup(content, 'html.parser')
                found = _extract_pdf_records(soup, journal_id, year, key_words)
                pdf_records.extend(found)

            got_links[str(year)] = bool(found)

        # True when at least one year produced a link. Only the year flags are
        # inspected: Journal_Id must be excluded because an id of 1 compares
        # equal to True and would wrongly mark the journal as covered.
        got_links["All_Years"] = any(got_links[column] for column in year_columns)
        got_link_records.append(got_links)

    df_pdf = pd.DataFrame(pdf_records, columns=pdf_columns)
    df_got_link_to_pdf = pd.DataFrame(got_link_records, columns=["Journal_Id"] + year_columns + ["All_Years"])

    # save
    df_pdf.to_csv("Journals" + publisher + "PDFs.csv", index=False)
    df_got_link_to_pdf.to_csv("Journals" + publisher + "GotPDFs.csv", index=False)

In [None]:
# Optional: export the list of Elsevier info pages (for Michael).
# Loads the Elsevier PDF table from disk, keeps only its DOI column,
# and writes that single column to its own CSV file.

df = pd.read_csv("JournalsElsevierPDFs.csv").loc[:, ["DOI"]]
df.to_csv("ElsevierDOIs.csv", index=False)