In [1]:
import os
import re
import time
import urllib.request
from pathlib import Path
from xml.dom.minidom import parseString
from xml.sax.saxutils import escape

import PyPDF2
import requests
from bs4 import BeautifulSoup
from dicttoxml import dicttoxml
from slugify import slugify
from tika import parser

In [2]:
# Project root: the parent of the directory the notebook is executed from.
# Kept as a plain string because later cells interpolate it into f-string paths.
ROOT_DIR = str(Path.cwd().parent)
ROOT_DIR

'C:\\Users\\sgmcart3\\Documents\\Projects\\NLP\\isif_journal_pages'

In [3]:
# NOTE(review): security trade-off. This swaps the factory that the standard
# library uses to build its default HTTPS context for the *unverified* one, so
# stdlib HTTPS clients (e.g. the urllib.request.urlopen PDF downloads below) no
# longer validate server certificates. Presumably added to work around a
# certificate problem on the source site -- confirm it is still needed, and
# prefer fixing the CA bundle over disabling verification process-wide.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Scrape web pages

In [4]:
# Fetch the index page that lists every journal issue.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed and fail later with an unrelated-looking AttributeError.
page = requests.get("https://isif.org/journals/all", timeout=60)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
# Locate the container that holds the issue listing, then collect one element
# per journal issue from it.
journal_listing = soup.find(class_="view-isif-journals-listing")
if journal_listing is None:
    # find() returns None when the class is absent (site layout change, error
    # page, ...). Fail with an explicit message rather than an AttributeError.
    raise RuntimeError(
        "Could not find the 'view-isif-journals-listing' element on the index page"
    )
journal_items = journal_listing.find_all(class_="field-name-field-journal-articles")

In [6]:
# Depositor details written into the <depositor> element of each generated
# XML file. Both are blank here and must be set before running the export.
depositor_name, depositor_email = "", ""

In [7]:
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT_S = 60


def _field_items_text(node, field_class):
    """Return the text of the ``field-items`` element nested inside ``field_class``.

    Raises AttributeError if either element is missing (``find`` returns None).
    """
    return node.find(class_=field_class).find(class_="field-items").get_text()


def _month_number(month_text):
    """Convert a month name ("June" / "Jun") to a two-digit number ("06").

    Crossref's <month> element is numeric, so the scraped month name is
    converted for the XML output. If the text is not a recognisable month name
    it is returned unchanged, so unexpected formats are not silently lost.
    """
    cleaned = month_text.strip()
    for fmt in ("%B", "%b"):
        try:
            return f"{time.strptime(cleaned, fmt).tm_mon:02d}"
        except ValueError:
            continue
    return month_text


def _extract_references(text_path, entry_authors):
    """Pull the reference list out of an extracted-text file.

    Scanning starts at the first line containing "REFERENCES". A line that
    contains a "[...]" marker starts a new reference (its text is whatever
    follows the closing bracket); other lines are appended to the reference
    currently being recorded. Scanning stops at the first line that contains
    one of the entry's author names.
    NOTE(review): the author-name stop is presumably meant to detect the author
    biographies that follow the reference list -- a self-citation spelling the
    name out in full would also stop the scan. Confirm against real papers.

    Returns a dict mapping 1-based reference number -> whitespace-normalised
    reference text. Empty references are not recorded.
    """
    # Normalise author names; drop empty ones, since "" is a substring of
    # every line and would stop the scan immediately.
    author_names = [" ".join(author.split()) for author in entry_authors]
    author_names = [name for name in author_names if name]

    references = {}
    current_parts = None  # None until the first "[n]" marker has been seen

    def _flush():
        if current_parts is None:
            return
        text = " ".join(" ".join(current_parts).split())
        if text:
            references[len(references) + 1] = text

    in_reference_section = False
    with open(text_path, "r", encoding="utf-8") as f:
        for line in f:
            if "REFERENCES" in line:
                in_reference_section = True
            if not in_reference_section:
                continue

            if any(name in line for name in author_names):
                break

            open_idx = line.find("[")
            close_idx = line.find("]", open_idx + 1) if open_idx != -1 else -1
            if close_idx != -1:
                # New reference: store the previous one and keep the text that
                # follows the marker on this same line.
                _flush()
                current_parts = [line[close_idx + 1:]]
            elif current_parts is not None:
                current_parts.append(line)
    _flush()
    return references


def _split_pages(pages_text):
    """Split scraped page text such as "pg 12-34" into (first_page, last_page).

    Accepts a hyphen, en dash or em dash as separator. For a single page the
    same value is returned for both ends; empty input yields ("", "").
    """
    cleaned = pages_text.replace("pg ", "").strip()
    parts = [part for part in re.split(r"\s*[-\u2013\u2014]\s*", cleaned) if part]
    if not parts:
        return "", ""
    return parts[0], (parts[1] if len(parts) > 1 else parts[0])


def _format_author_list(authors):
    """Join author names as "A", "A and B" or "A, B and C" ("" when empty)."""
    if len(authors) > 2:
        return ", ".join(authors[:-1]) + " and " + authors[-1]
    return " and ".join(authors)


def _crossref_xml(entry_title, entry, issue, depositor_name, depositor_email):
    """Build the Crossref deposit XML for one article and return it as a string.

    ``entry`` is the per-article dict (authors, pages, references) and ``issue``
    holds the issue-level fields (month, year, volume, number, issn). All
    scraped text is XML-escaped before being written.
    NOTE(review): <doi_batch_id>, <day> and <ORCID> are emitted empty, as
    before -- confirm they are filled in before the file is deposited.
    """
    first_page, last_page = _split_pages(entry["pages"])
    publication_date = [
        '<publication_date media_type="print">',
        f"<month>{escape(_month_number(issue['month']))}</month>",
        "<day></day>",
        f"<year>{escape(issue['year'])}</year>",
        "</publication_date>",
    ]

    lines = [
        '<doi_batch xmlns="http://www.crossref.org/schema/4.3.7" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" version="4.3.7" xsi:schemaLocation="http://www.crossref.org/schema/4.3.7 http://www.crossref.org/schema/deposit/crossref4.3.7.xsd">',
        "<head>",
        "<doi_batch_id></doi_batch_id>",
        f"<timestamp>{time.time()}</timestamp>",
        "<depositor>",
        f"<depositor_name>{escape(depositor_name)}</depositor_name>",
        f"<email_address>{escape(depositor_email)}</email_address>",
        "</depositor>",
        "<registrant>WEB-FORM</registrant>",
        "</head>",
        "<body>",
        "<journal>",
        "<journal_metadata>",
        "<full_title>Journal of Advances in Information Fusion</full_title>",
        "<abbrev_title>JAIF</abbrev_title>",
        f"<issn media_type=\"print\">{escape(issue['issn'])}</issn>",
        "</journal_metadata>",
        "<journal_issue>",
        *publication_date,
        "<journal_volume>",
        f"<volume>{escape(issue['volume'])}</volume>",
        "</journal_volume>",
        f"<issue>{escape(issue['number'])}</issue>",
        "</journal_issue>",
        "<journal_article publication_type=\"full_text\">",
        "<titles>",
        f"<title>{escape(entry_title)}</title>",
        "</titles>",
        "<contributors>",
    ]

    # Split each name into given names + surname (last token); skip blank names.
    name_parts = [author.split() for author in entry["authors"]]
    name_parts = [parts for parts in name_parts if parts]
    for position, parts in enumerate(name_parts):
        # Only the first contributor is "first"; the rest are "additional".
        sequence = "first" if position == 0 else "additional"
        lines.append(f'<person_name sequence="{sequence}" contributor_role="author">')
        given_names = " ".join(parts[:-1])
        if given_names:
            lines.append(f"<given_name>{escape(given_names)}</given_name>")
        lines.append(f"<surname>{escape(parts[-1])}</surname>")
        lines.append("<ORCID></ORCID>")
        lines.append("</person_name>")
    lines.append("</contributors>")

    lines.extend(publication_date)
    lines.extend([
        "<pages>",
        f"<first_page>{escape(first_page)}</first_page>",
        f"<last_page>{escape(last_page)}</last_page>",
        "</pages>",
        "<citation_list>",
    ])
    for ref_id, ref_text in entry["references"].items():
        lines.append(f'<citation key="ref-{ref_id}">')
        lines.append(f"<unstructured_citation>{escape(' '.join(ref_text.split()))}</unstructured_citation>")
        lines.append("</citation>")
    lines.extend([
        "</citation_list>",
        "</journal_article>",
        "</journal>",
        "</body>",
        "</doi_batch>",
    ])
    return "\n".join(lines)


def _article_html(entry_title, entry, issue):
    """Build the HTML landing-page fragment for one article and return it.

    Scraped text is escaped so "&" / "<" in titles or references cannot break
    the markup. The DOI is a blank placeholder, as in the original output.
    """
    DOI = ""
    first_page, last_page = _split_pages(entry["pages"])
    lines = [
        "<html>",
        "<body>",
        "<div class=\"article-parents\">",
        "<ul class=\"article-journal-name\">",
        "<li>Journal of Advances in Information Fusion (JAIF)</li>",
        f"<li>Volume {escape(issue['volume'])}</li>",
        f"<li>Issue {escape(issue['number'])}</li>",
        f"<li>Pages {escape(first_page)} - {escape(last_page)}</li>",
        f"<li>{escape(issue['month'])}, {escape(issue['year'])}</li>",
        f"<li class=\"article-doi\"><a href=\"https://doi.org/{DOI}\">https://doi.org/{DOI}</a></li>",
        "</ul>",
        "</div>",
        "",
        "<div class=\"article-header\">",
        f"<h1 class=\"article-title\">{escape(entry_title)}</h1>",
        f"<p class=\"article-authors\">{escape(_format_author_list(entry['authors']))}</p>",
        "</div>",
        "",
        "<div class=\"references\">",
        "<h2>References</h2>",
        "<ul class=\"unstructured-references\">",
    ]
    for ref_id, ref_text in entry["references"].items():
        lines.append(f"<li class=\"ref-{ref_id}\">{escape(' '.join(ref_text.split()))}</li>")
    lines.extend([
        "</ul>",
        "</div>",
        "",
        "</body>",
        "</html>",
    ])
    return "\n".join(lines)


# For every journal issue: scrape its metadata, download and text-extract each
# article PDF (cached on disk), parse the reference list, then write one
# Crossref XML file and one HTML file per article.
for journal_item in journal_items:
    journal_url = journal_item.find(class_="field-name-field-journal-full-print").find("a", href=True)["href"]

    journal_page = requests.get(journal_url, timeout=REQUEST_TIMEOUT_S)
    journal_page.raise_for_status()
    journal_soup = BeautifulSoup(journal_page.content, 'html.parser')

    # The date is split on the comma: text before it is the month, after it the year.
    journal_date = journal_soup.find(class_="field-name-field-journal-date").find(class_="field-items").find(class_="date-display-single").get_text()
    journal_date_month = journal_date.split(",")[0]
    journal_date_year = journal_date.split(",")[1].replace(" ", "")
    journal_vol_num = _field_items_text(journal_soup, "field-name-field-journal-vol-num")
    journal_issue_num = _field_items_text(journal_soup, "field-name-field-journal-issue-num")
    journal_issn = _field_items_text(journal_soup, "field-name-field-journal-issn")
    journal_entries = journal_soup.find(class_="field-name-field-journal-entry-ref").find(class_="field-items").findChildren(class_="field-item", recursive=False)

    issue = {
        "month": journal_date_month,
        "year": journal_date_year,
        "volume": journal_vol_num,
        "number": journal_issue_num,
        "issn": journal_issn,
    }

    xml_output_dir = f"{ROOT_DIR}/output/xml/{journal_vol_num}/{journal_issue_num}/"
    html_output_dir = f"{ROOT_DIR}/output/html/{journal_vol_num}/{journal_issue_num}/"
    paper_dir = f"{ROOT_DIR}/data/papers/{journal_vol_num}/{journal_issue_num}/"
    preprocessed_paper_dir = f"{ROOT_DIR}/data/preprocessed_papers/{journal_vol_num}/{journal_issue_num}/"
    meta_paper_dir = f"{ROOT_DIR}/data/meta_papers/{journal_vol_num}/{journal_issue_num}/"
    for directory in (xml_output_dir, html_output_dir, paper_dir, preprocessed_paper_dir, meta_paper_dir):
        Path(directory).mkdir(parents=True, exist_ok=True)

    entries = {}
    for journal_entry in journal_entries:
        try:
            file_link = journal_entry.find(class_="field-name-field-journal-entry-file").find("a", href=True)
            entry_title = file_link.get_text()
            entry_pdf_url = file_link["href"]
            entry_authors = [
                author.get_text()
                for author in journal_entry.find_all(class_="field-name-je-fc-author-displayname")
            ]
            entry_pages = journal_entry.find(class_="field-name-field-journal-entry-pages").get_text()

            slug = slugify(entry_title)
            pdf_path = os.path.join(paper_dir, f"{slug}.pdf")
            text_path = os.path.join(preprocessed_paper_dir, f"{slug}.txt")
            meta_path = os.path.join(meta_paper_dir, f"{slug}.txt")

            if not os.path.exists(text_path):
                # Download only when the PDF is not already cached. The cache
                # lives in paper_dir, which is where the PDF is written.
                if not os.path.exists(pdf_path):
                    with urllib.request.urlopen(entry_pdf_url, timeout=REQUEST_TIMEOUT_S) as response:
                        pdf_bytes = response.read()
                    with open(pdf_path, "wb") as pdf_file:
                        pdf_file.write(pdf_bytes)

                raw = parser.from_file(pdf_path)
                # Tika may return no content; store an empty file, not the text "None".
                content = str(raw["content"] or "")
                meta = str(raw["metadata"])
                with open(text_path, "w", encoding="utf-8") as fid:
                    fid.write(content)
                with open(meta_path, "w", encoding="utf-8") as fid:
                    fid.write(meta)
        except Exception as e:
            # Best effort: entries without a PDF link / pages field (or whose
            # download or extraction fails) are reported and skipped.
            print(f"Skipping entry in volume {journal_vol_num}, issue {journal_issue_num}: {e}")
            continue

        entries[entry_title] = {
            "pdf_url": entry_pdf_url,
            "authors": entry_authors,
            "pages": entry_pages,
            "references": _extract_references(text_path, entry_authors),
        }

    for entry_title, entry in entries.items():
        slug = slugify(entry_title)

        # Write XML
        with open(os.path.join(xml_output_dir, f"{slug}.xml"), "w", encoding="utf-8") as f:
            f.write(_crossref_xml(entry_title, entry, issue, depositor_name, depositor_email))

        # Write HTML
        with open(os.path.join(html_output_dir, f"{slug}.html"), "w", encoding="utf-8") as f:
            f.write(_article_html(entry_title, entry, issue))

'NoneType' object has no attribute 'find'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'get_text'
'NoneType' object has no attribute 'find'
