In [1]:
# analysis of results from previous steps
# this data analysis helped us decide to:
# - only consider Wiley and Elsevier for the analysis
# - not consider years 2021 and 2022

# imports
import pandas as pd
from bs4 import BeautifulSoup
import time
import numpy as np

# pandas display configuration used by every cell below: truncate long frames
# to 10 rows, never hide columns, and use a wide, centred, 3-decimal layout.
for _option_name, _option_value in (
    ('display.max_rows', 10),
    ('display.max_columns', None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option_name, _option_value)

In [2]:
# selection with list of keywords
# First overview per publisher: how many PDF links were collected and how many
# journals have no editorial information at all.
# The dict values are the journal counts printed as "Number of journals".
publishers = {"Wiley": 1330, "Springer": 1632, "Taylor": 1294, "Elsevier": 2423}
for publisher, nb_journals in publishers.items():
    print("Publisher: " + publisher)
    df = pd.read_csv("Journals" + publisher + "PDFs.csv")
    print("Number of journals:", nb_journals)
    # Denominator: one potential link per journal and per analysed year (5 years).
    percentage1 = len(df) / (nb_journals * 5) * 100
    print("Number of links to PDFs:", len(df), f"({percentage1:.2f}%)")
    # File name spelled "GotPDFs" exactly as in the other cells of this notebook;
    # the former lower-case "gotPDFs" only resolves on case-insensitive filesystems.
    df_bool = pd.read_csv("Journals" + publisher + "GotPDFs.csv")
    # Rows whose "All_Years" flag is False count as journals without editor info.
    nb_journals_no_info = len(df_bool) - df_bool["All_Years"].sum()
    # Not printed at the moment; kept for the optional percentage suffix.
    percentage2 = nb_journals_no_info / len(df_bool) * 100
    print("Number of journals with NO info about editors:", nb_journals_no_info)

Publisher: Wiley
Number of journals: 1330
Number of links to PDFs: 3624 (54.50%)
Number of journals with NO info about editors: 394
Publisher: Springer
Number of journals: 1632
Number of links to PDFs: 80 (0.98%)
Number of journals with NO info about editors: 1604
Publisher: Taylor
Number of journals: 1294
Number of links to PDFs: 62 (0.96%)
Number of journals with NO info about editors: 1255
Publisher: Elsevier
Number of journals: 2423
Number of links to PDFs: 4799 (39.61%)
Number of journals with NO info about editors: 1094


In [3]:
# Wiley, after the PDF download step: rows of JournalsWileyPDFs.csv whose
# "LinksPDF" value is empty are reported as links that did not work.

df = pd.read_csv("JournalsWileyPDFs.csv")
is_link_missing = df["LinksPDF"].isnull()
count_null_values = is_link_missing.sum()
nb_pdf_downloaded = df.shape[0] - count_null_values
print("Number of PDF downloaded:", nb_pdf_downloaded)
print("Number of links to PDF that didn't work:", count_null_values)

Number of PDF downloaded: 3592
Number of links to PDF that didn't work: 32


In [4]:
# More checks for Wiley: load the three Wiley tables used by the next cells.
#   df       <- JournalsWileyPDFs.csv
#   df_bool  <- JournalsWileyGotPDFs.csv
#   df_wiley <- JournalsWiley.csv

df, df_bool, df_wiley = (
    pd.read_csv("JournalsWiley" + file_suffix + ".csv")
    for file_suffix in ("PDFs", "GotPDFs", "")
)
display(df_bool)

Unnamed: 0,Journal_Id,2017,2018,2019,2020,2021,All_Years
0,23,0.0,1.0,1.0,1.0,0.0,True
1,30,0.0,0.0,0.0,0.0,0.0,False
2,37,1.0,1.0,1.0,1.0,0.0,True
3,56,1.0,1.0,1.0,1.0,0.0,True
4,193,1.0,1.0,1.0,1.0,1.0,True
...,...,...,...,...,...,...,...
1325,25815,1.0,1.0,1.0,1.0,0.0,True
1326,25978,0.0,1.0,1.0,1.0,0.0,True
1327,25982,0.0,1.0,1.0,1.0,0.0,True
1328,25985,0.0,0.0,0.0,0.0,0.0,False


In [5]:
# Wiley: compare, journal by journal, the number of years that have an issue
# link (df_wiley) with the number of years for which a PDF was obtained
# (df_bool), to find out where editorial information is missing.
df_wiley_tmp = df_wiley[['links_2017', 'links_2018', 'links_2019', 'links_2020', 'links_2021']]
nb_years = df_wiley_tmp.shape[1]  # we analyze over 5 years

# Journal ids are read by position (second column of df_wiley).
# NOTE(review): presumably this column is the same id as df_bool's "Journal_Id" - confirm.
journal_ids = df_wiley.iloc[:, 1].tolist()
# Number of years without any issue link, per journal. Uses isna() like the
# summary cell, not a comparison of str(value) with 'nan'.
nb_missing_per_journal = df_wiley_tmp.isna().sum(axis=1).tolist()

# journal id -> number of years with an issue link
list_nb_issues = {
    journal_id: nb_years - nb_missing
    for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
}
# journals with no issue over the 5 years (not even 1)
list_journal_no_issue = [
    journal_id
    for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
    if nb_missing == nb_years
]
nb_journal_no_issue = len(list_journal_no_issue)

print(nb_journal_no_issue)
df_list_journal_no_issue = pd.DataFrame(list_journal_no_issue, columns = ["Journal_Id"])

print(list_journal_no_issue)

# find the journals that still have issues but no editorial information:
# anti-join of "no PDF at all" journals against "no issue at all" journals
df_bool_tmp = df_bool[['Journal_Id', 'All_Years']]
list_journal_no_pdf = df_bool_tmp[~df_bool_tmp.All_Years]

df_merge = list_journal_no_pdf.merge(df_list_journal_no_issue, how='left', indicator=True)
df_merge = df_merge[df_merge['_merge'] == 'left_only'].drop(columns='_merge')
list_journals_with_issue_but_no_link = df_merge[['Journal_Id']].to_numpy().flatten()
print(list_journals_with_issue_but_no_link)
print(len(list_journals_with_issue_but_no_link))

# count number of journals who have more than 0 editorial info but not 5
# Positional layout of df_bool, as in the original loop: column 0 is the journal
# id, columns 1..5 are the yearly flags, and column 5 is the 2021 flag.
# NOTE(review): a missing flag is counted as 0 here (sum skips NaN) - confirm
# that the flag columns never contain NaN.
bool_journal_ids = df_bool.iloc[:, 0].tolist()
nb_links_per_journal = df_bool.iloc[:, 1:1 + nb_years].sum(axis=1).astype(int).tolist()
got_2021_per_journal = (df_bool.iloc[:, nb_years] == 1).tolist()
list_nb_links = dict(zip(bool_journal_ids, nb_links_per_journal))  # journal id -> number of years with a PDF
list_got_2021 = dict(zip(bool_journal_ids, got_2021_per_journal))  # journal id -> True if got the 2021 PDF
print(list_got_2021)

list_journals_got_pdf_for_some_years = []
list_journals_didnot_get_pdf_2021 = []
for journal, nb_issues in list_nb_issues.items():
    # Raises KeyError if a journal of df_wiley is absent from df_bool.
    nb_links = list_nb_links[journal]
    # Some, but not all, of the journal's issues led to a PDF.
    if nb_issues != nb_links and 0 < nb_links < nb_years:
        list_journals_got_pdf_for_some_years.append(journal)
        if not list_got_2021[journal]:
            list_journals_didnot_get_pdf_2021.append(journal)
print(list_journals_got_pdf_for_some_years)
print(len(list_journals_got_pdf_for_some_years))
print(list_journals_didnot_get_pdf_2021)
print(len(list_journals_didnot_get_pdf_2021))

# TODO: check how many journals only have 2021 missing
# TODO: check why didn't manage to convert pdf to txt

177
[213, 264, 288, 317, 358, 363, 681, 935, 977, 1083, 1404, 1497, 1524, 1662, 1727, 1758, 1961, 2083, 2138, 2397, 2405, 2425, 2436, 2810, 2973, 3007, 3008, 3009, 3287, 3749, 3884, 3924, 4049, 4054, 4185, 4346, 4492, 4581, 4827, 4902, 4993, 4999, 5068, 5125, 5684, 5781, 5869, 6218, 6275, 6429, 6443, 6444, 6445, 6584, 6760, 7059, 7126, 7279, 7305, 7499, 7515, 7533, 7548, 7602, 7648, 7718, 7720, 7796, 8069, 8095, 8458, 8560, 9392, 9441, 9751, 10000, 10532, 10640, 10882, 10988, 11889, 12325, 12380, 12449, 12461, 13073, 13310, 13418, 13503, 14385, 14478, 14544, 14962, 15078, 15103, 15182, 15303, 15304, 15415, 15464, 15500, 15501, 15605, 15814, 16022, 16390, 16545, 16561, 16627, 17098, 17136, 17603, 17694, 17793, 17857, 17891, 18131, 18164, 18168, 18223, 18546, 18679, 18849, 18991, 19399, 19412, 19434, 19515, 19528, 19632, 19921, 19927, 20218, 20255, 20269, 20272, 20548, 20650, 20719, 21115, 21160, 21610, 21831, 22079, 22105, 22205, 22228, 22439, 22440, 22643, 22667, 23142, 23150, 23284, 2

In [6]:
# Wiley summary. Relies on variables from the previous cells: df (Wiley PDF
# links), df_bool, df_wiley_tmp, nb_journal_no_issue and the two journal lists.
print("Publisher: Wiley\n")

# Journal count and issue totals are derived from the data, not from a
# hard-coded 1330, so they stay correct if the input file changes.
nb_journals = len(df_wiley_tmp)
print("Number of journals:", nb_journals)
print("Number of journals with no issue in:")
nb_missing_per_year = df_wiley_tmp.isna().sum()
for link_column, nb_missing in nb_missing_per_year.items():
    # Column names look like "links_2017"; only the year is printed.
    print("- " + link_column.replace("links_", "") + ":", nb_missing)
print("Total number of missing issues over 5 years:", nb_missing_per_year.sum())
total_nb_issues = df_wiley_tmp.notna().sum().sum()
print("Total number of issues over 5 years:", total_nb_issues)

percentage1 = len(df)/total_nb_issues*100
print("Number of links to PDFs:", len(df), f"({percentage1:.2f}%)\n")

nb_journals_no_info = len(df_bool) - df_bool["All_Years"].sum()
# Not printed at the moment; kept for the optional percentage suffix.
percentage2 = nb_journals_no_info/len(df_bool)*100
print("Number of journals with NO info about editors (but still have issues):", nb_journals_no_info - nb_journal_no_issue)
# have the list of those journals in list_journals_with_issue_but_no_link

print("\nNumber of journals with info missing for some years:", len(list_journals_got_pdf_for_some_years))

print("\nNumber of journals with info missing for 2021:", len(list_journals_didnot_get_pdf_2021))

Publisher: Wiley

Number of journals: 1330
Number of journals with no issue in:
- 2017: 185
- 2018: 184
- 2019: 187
- 2020: 198
- 2021: 950
Total number of missing issues over 5 years: 1704
Total number of issues over 5 years: 4946
Number of links to PDFs: 3624 (73.27%)

Number of journals with NO info about editors (but still have issues): 217

Number of journals with info missing for some years: 468

Number of journals with info missing for 2021: 442


In [17]:
# check for all publishers
# Same analysis as the Wiley cells above, repeated for each publisher over the
# 5 years 2017-2021. NOTE: this cell rebinds df and df_bool; after it has run
# they hold the frames of the last publisher in the dict.

publishers = {"Wiley": 1330, "Springer": 1632, "Taylor": 1294, "Elsevier": 2423}

for publisher in publishers:
    print("\nPublisher: ", publisher)
    df_pdf = pd.read_csv("Journals" + publisher + "PDFs.csv")
    df_bool = pd.read_csv("Journals" + publisher + "GotPDFs.csv")
    df = pd.read_csv("Journals" + publisher + ".csv")

    df_tmp = df[['links_2017', 'links_2018', 'links_2019', 'links_2020', 'links_2021']]
    nb_years = df_tmp.shape[1]  # we analyze over 5 years

    # Journal ids are read by position (second column of the journals file).
    # NOTE(review): presumably the same id as df_bool's "Journal_Id" - confirm.
    journal_ids = df.iloc[:, 1].tolist()
    # Number of years without any issue link, per journal.
    nb_missing_per_journal = df_tmp.isna().sum(axis=1).tolist()

    # journal id -> number of years with an issue link
    list_nb_issues = {
        journal_id: nb_years - nb_missing
        for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
    }
    # journals with no issue over the 5 years (not even 1)
    list_journal_no_issue = [
        journal_id
        for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
        if nb_missing == nb_years
    ]
    nb_journal_no_issue = len(list_journal_no_issue)
    df_list_journal_no_issue = pd.DataFrame(list_journal_no_issue, columns = ["Journal_Id"])

    # find the journals that still have issues but no editorial information:
    # anti-join of "no PDF at all" journals against "no issue at all" journals
    df_bool_tmp = df_bool[['Journal_Id', 'All_Years']]
    list_journal_no_pdf = df_bool_tmp[~df_bool_tmp.All_Years]

    df_merge = list_journal_no_pdf.merge(df_list_journal_no_issue, how='left', indicator=True)
    df_merge = df_merge[df_merge['_merge'] == 'left_only'].drop(columns='_merge')
    list_journals_with_issue_but_no_link = df_merge[['Journal_Id']].to_numpy().flatten()

    # count number of journals who have more than 0 editorial info but not 5
    # Positional layout of df_bool, as in the original loop: column 0 is the
    # journal id, columns 1..5 are the yearly flags, column 5 is the 2021 flag.
    # NOTE(review): a missing flag is counted as 0 here (sum skips NaN) -
    # confirm that the flag columns never contain NaN.
    bool_journal_ids = df_bool.iloc[:, 0].tolist()
    nb_links_per_journal = df_bool.iloc[:, 1:1 + nb_years].sum(axis=1).astype(int).tolist()
    got_2021_per_journal = (df_bool.iloc[:, nb_years] == 1).tolist()
    list_nb_links = dict(zip(bool_journal_ids, nb_links_per_journal))  # for each journal, number of links to pdf
    list_got_2021 = dict(zip(bool_journal_ids, got_2021_per_journal))  # for each journal, true if got link for 2021

    list_journals_got_pdf_for_some_years = []
    list_journals_didnot_get_pdf_2021 = []
    list_journals_did_only_not_get_pdf_2021 = []
    for journal, nb_issues in list_nb_issues.items():
        # Raises KeyError if a journal of the journals file is absent from df_bool.
        nb_links = list_nb_links[journal]
        # Some, but not all, of the journal's issues led to a PDF.
        if nb_issues != nb_links and 0 < nb_links < nb_years:
            list_journals_got_pdf_for_some_years.append(journal)
            if not list_got_2021[journal]:
                list_journals_didnot_get_pdf_2021.append(journal)
                # 2021 is the only missing year when the other 4 were obtained.
                if nb_links == nb_years - 1:
                    list_journals_did_only_not_get_pdf_2021.append(journal)

    print("Number of journals:", publishers[publisher])
    print("Number of journals with no issue in:")
    nb_missing_per_year = df_tmp.isna().sum()
    for link_column, nb_missing in nb_missing_per_year.items():
        # Column names look like "links_2017"; only the year is printed.
        print("- " + link_column.replace("links_", "") + ":", nb_missing)
    print("Total number of missing issues over 5 years:", nb_missing_per_year.sum())
    # Counted from the data, so the total stays right even if the file does not
    # contain exactly publishers[publisher] rows.
    total_nb_issues = df_tmp.notna().sum().sum()
    print("Total number of issues over 5 years:", total_nb_issues)

    percentage1 = len(df_pdf)/total_nb_issues*100
    print("Number of links to PDFs:", len(df_pdf), f"({percentage1:.2f}%)\n")

    nb_journals_no_info = len(df_bool) - df_bool["All_Years"].sum()
    # Not printed at the moment; kept for the optional percentage suffix.
    percentage2 = nb_journals_no_info/len(df_bool)*100
    print("Number of journals with NO info about editors (but still have issues):", nb_journals_no_info - nb_journal_no_issue)
    # have the list of those journals in list_journals_with_issue_but_no_link

    print("\nNumber of journals with info missing for some years:", len(list_journals_got_pdf_for_some_years))

    print("\nNumber of journals with info missing for 2021:", len(list_journals_didnot_get_pdf_2021))

    print("\nNumber of journals with info missing for 2021 ONLY:", len(list_journals_did_only_not_get_pdf_2021))
    
    


Publisher:  Wiley
Number of journals: 1330
Number of journals with no issue in:
- 2017: 185
- 2018: 184
- 2019: 187
- 2020: 198
- 2021: 950
Total number of missing issues over 5 years: 1704
Total number of issues over 5 years: 4946
Number of links to PDFs: 3624 (73.27%)

Number of journals with NO info about editors (but still have issues): 217

Number of journals with info missing for some years: 468

Number of journals with info missing for 2021: 442

Number of journals with info missing for 2021 ONLY: 143

Publisher:  Springer
Number of journals: 1632
Number of journals with no issue in:
- 2017: 213
- 2018: 243
- 2019: 256
- 2020: 273
- 2021: 1493
Total number of missing issues over 5 years: 2478
Total number of issues over 5 years: 5682
Number of links to PDFs: 80 (1.41%)

Number of journals with NO info about editors (but still have issues): 1398

Number of journals with info missing for some years: 19

Number of journals with info missing for 2021: 19

Number of journals with in

In [18]:
# check for all publishers without 2021
# Same analysis as the previous cell, restricted to the 4 years 2017-2020.
# NOTE: this cell rebinds df and df_bool; after it has run they hold the frames
# of the last publisher in the dict.

publishers = {"Wiley": 1330, "Springer": 1632, "Taylor": 1294, "Elsevier": 2423}

for publisher in publishers:
    print("\nPublisher: ", publisher)
    df_pdf = pd.read_csv("Journals" + publisher + "PDFs.csv")
    df_bool = pd.read_csv("Journals" + publisher + "GotPDFs.csv")
    df = pd.read_csv("Journals" + publisher + ".csv")

    df_tmp = df[['links_2017', 'links_2018', 'links_2019', 'links_2020']]
    nb_years = df_tmp.shape[1]  # we analyze over 4 years

    # Journal ids are read by position (second column of the journals file).
    # NOTE(review): presumably the same id as df_bool's "Journal_Id" - confirm.
    journal_ids = df.iloc[:, 1].tolist()
    # Number of years without any issue link, per journal.
    nb_missing_per_journal = df_tmp.isna().sum(axis=1).tolist()

    # journal id -> number of years with an issue link
    list_nb_issues = {
        journal_id: nb_years - nb_missing
        for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
    }
    # journals with no issue over the 4 years (not even 1)
    list_journal_no_issue = [
        journal_id
        for journal_id, nb_missing in zip(journal_ids, nb_missing_per_journal)
        if nb_missing == nb_years
    ]
    nb_journal_no_issue = len(list_journal_no_issue)
    df_list_journal_no_issue = pd.DataFrame(list_journal_no_issue, columns = ["Journal_Id"])

    # find the journals that still have issues but no editorial information:
    # anti-join of "no PDF at all" journals against "no issue at all" journals
    # NOTE(review): "All_Years" comes from the file as is, so it presumably
    # still takes 2021 into account - confirm.
    df_bool_tmp = df_bool[['Journal_Id', 'All_Years']]
    list_journal_no_pdf = df_bool_tmp[~df_bool_tmp.All_Years]

    df_merge = list_journal_no_pdf.merge(df_list_journal_no_issue, how='left', indicator=True)
    df_merge = df_merge[df_merge['_merge'] == 'left_only'].drop(columns='_merge')
    list_journals_with_issue_but_no_link = df_merge[['Journal_Id']].to_numpy().flatten()

    # count number of journals who have more than 0 editorial info but not 4
    # Positional layout of df_bool, as in the original loop: column 0 is the
    # journal id and columns 1..4 are the yearly flags that are summed.
    # NOTE(review): a missing flag is counted as 0 here (sum skips NaN) -
    # confirm that the flag columns never contain NaN.
    bool_journal_ids = df_bool.iloc[:, 0].tolist()
    nb_links_per_journal = df_bool.iloc[:, 1:1 + nb_years].sum(axis=1).astype(int).tolist()
    list_nb_links = dict(zip(bool_journal_ids, nb_links_per_journal))  # for each journal, number of links to pdf

    # Journals for which some, but not all, of the issues led to a PDF.
    # Computed but not reported by this cell.
    list_journals_got_pdf_for_some_years = []
    for journal, nb_issues in list_nb_issues.items():
        # Raises KeyError if a journal of the journals file is absent from df_bool.
        nb_links = list_nb_links[journal]
        if nb_issues != nb_links and 0 < nb_links < nb_years:
            list_journals_got_pdf_for_some_years.append(journal)

    print("Number of journals:", publishers[publisher])
    print("Total number of missing issues over 4 years:", df_tmp.isna().sum().sum())
    # Counted from the data, so the total stays right even if the file does not
    # contain exactly publishers[publisher] rows.
    total_nb_issues = df_tmp.notna().sum().sum()
    print("Total number of issues over 4 years:", total_nb_issues)

    # NOTE(review): len(df_pdf) counts every row of the PDFs file, presumably
    # including 2021 links, while total_nb_issues only covers 2017-2020, so this
    # percentage may be overstated - confirm and filter df_pdf by year if so.
    percentage1 = len(df_pdf)/total_nb_issues*100
    print("Number of links to PDFs:", len(df_pdf), f"({percentage1:.2f}%)\n")




Publisher:  Wiley
Number of journals: 1330
Total number of missing issues over 4 years: 754
Total number of issues over 4 years: 4566
Number of links to PDFs: 3624 (79.37%)


Publisher:  Springer
Number of journals: 1632
Total number of missing issues over 4 years: 985
Total number of issues over 4 years: 5543
Number of links to PDFs: 80 (1.44%)


Publisher:  Taylor
Number of journals: 1294
Total number of missing issues over 4 years: 1033
Total number of issues over 4 years: 4143
Number of links to PDFs: 62 (1.50%)


Publisher:  Elsevier
Number of journals: 2423
Total number of missing issues over 4 years: 3220
Total number of issues over 4 years: 6472
Number of links to PDFs: 4799 (74.15%)

