## **Use GROBID to convert PDF to TEI-xml**

- https://grobid.readthedocs.io/en/latest/Grobid-service/
- local grobid server http://localhost:8070/
- run grobid: "./gradlew run"
- search for "virtual" in the .ocr file
- -> if successful -> convert the PDF with GROBID

In [None]:
import pandas as pd
import os
from shutil import rmtree
import time
import json

import requests
import bs4 as bs
import zipfile

from tqdm.notebook import tqdm

In [None]:
from grobid_client.grobid_client import GrobidClient

# Create the GROBID client from its JSON config file.
client = GrobidClient(config_path="../../grobid-0.7.2/grobid_client_python/config.json")
#client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20)

# Override selected entries of the loaded configuration.
for option, value in (("batch_size", 20000), ("timeout", 100), ("sleep_time", 10)):
    client.config[option] = value

# Last expression of the cell: shows the effective configuration.
client.config

In [None]:
def find_target_words(path, target_words=("virtual",)):
    """Return True if the OCR text at ``path`` contains any target word.

    The file is read as UTF-8 (undecodable bytes are ignored), words that
    were hyphenated across a line break are re-joined, and the comparison
    is case-insensitive substring matching.

    Args:
        path: Path of the OCR text file to scan.
        target_words: Words to look for. Defaults to ``("virtual",)``.
            An empty collection never matches.

    Returns:
        True if at least one target word occurs in the text, else False.
    """
    with open(path, "r", encoding='utf-8', errors='ignore') as f:
        text = f.read()
    # Remove "-\n" so a word split over two lines ("vir-\ntual") still matches.
    text = text.replace("-\n", "").lower()
    return any(word.lower() in text for word in target_words)

# Find the article folders whose OCR text contains the relevant words.
# PDFs whose OCR text does not contain them are deleted.
def get_pdf_paths(df, journal):
    """Collect article folders whose OCR text matches and drop the other PDFs.

    Rows are visited by ascending volume, then ascending issue. For each
    row the article name is ``doi[8:]`` (presumably the DOI without a
    prefix such as "10.1103/" -- confirm against the data) and the files
    ``<name>.pdf`` / ``<name>.ocr`` are expected in
    ``../../data/fulltexts/<journal>/<volume>/<issue>/<name>/``.

    Articles whose folder, PDF or OCR file is missing are skipped without
    touching anything. If both files exist, ``find_target_words`` decides:
    on a match the folder is returned, otherwise the PDF is deleted.

    Args:
        df: DataFrame indexed by DOI with ``volume`` and ``issue`` columns.
        journal: Name of the journal folder below ``../../data/fulltexts/``.

    Returns:
        List of article folder paths (each ending in "/") to convert.
    """
    basepath = f"../../data/fulltexts/{journal}/"
    pdf_path_list = []
    for volume in df.volume.sort_values().unique():
        in_volume = df.loc[df.volume == volume]
        for issue in in_volume.issue.sort_values().unique():
            for doi in in_volume.loc[in_volume.issue == issue].index:
                name = doi[8:]
                article_dir = f"{basepath}{volume}/{issue}/{name}/"
                pdf_file = f"{article_dir}{name}.pdf"
                ocr_file = f"{article_dir}{name}.ocr"
                # isfile() is False for a missing folder, so incomplete
                # downloads are skipped instead of raising.
                if not (os.path.isfile(pdf_file) and os.path.isfile(ocr_file)):
                    continue
                if find_target_words(ocr_file):
                    pdf_path_list.append(article_dir)
                else:
                    os.remove(pdf_file)
    return pdf_path_list

# convert pdf to tei
def pdf2tei(path):
    """Run GROBID's "processFulltextDocument" service on ``path``.

    Uses the module-level ``client``; ``path`` is passed both as the input
    location and as the ``output`` location.
    """
    options = {
        "output": path,
        "n": 15,
        "consolidate_header": False,
        "consolidate_citations": True,
        "include_raw_citations": False,
        "include_raw_affiliations": False,
        "segment_sentences": False,
        "tei_coordinates": False,
    }
    client.process("processFulltextDocument", path, **options)

In [None]:
pdf_path_list = get_pdf_paths(df, journal)

for pdf_path in tqdm(pdf_path_list):
    pdf2tei(pdf_path)

    # Second-to-last "/"-separated component of the path; used as the
    # base name of both the PDF and the TEI file.
    article = pdf_path.split('/')[-2]

    if not os.path.isfile(f"{pdf_path}{article}.tei.xml"):
        # No TEI file was produced: report the folder and keep the PDF.
        print(pdf_path)
        continue

    # The TEI file exists, so the PDF is no longer needed.
    os.remove(f"{pdf_path}{article}.pdf")

## **Extract text from TEI-XML and APS-XML**

In [None]:
import shutil
from bs4 import BeautifulSoup, Tag
import lxml
import re

import multiprocessing as mp
# Leave four cores free, but never go below one worker: mp.Pool raises
# ValueError for a process count smaller than 1 (machines with <= 4 cores).
cpu_count = max(1, mp.cpu_count() - 4)

In [None]:
# get paths
def get_paths(journal):
    """List every file ending in ".xml" below the journal's fulltext folder.

    The tree under ``../../data/fulltexts/<journal>/`` is walked
    recursively; each result is the containing folder joined to the file
    name with "/".
    """
    root = f"../../data/fulltexts/{journal}/"
    return [
        folder + "/" + name
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if name.endswith(".xml")
    ]

# Check if text contains target words
def find_target_words(path):
    """Return ``path`` if the XML's paragraph text mentions "virtual".

    Only the text of ``<p>`` elements is searched, case-insensitively,
    after removing every "- " sequence (presumably remnants of words
    hyphenated across lines -- confirm against the data).

    Args:
        path: Path of the XML file to scan.

    Returns:
        ``path`` when "virtual" occurs in the paragraph text, else None.
    """
    # Read bytes so the parser decodes with the encoding declared in the
    # XML itself instead of the platform's default text encoding.
    with open(path, "rb") as f:
        xml = f.read()
    soup = BeautifulSoup(xml, 'lxml-xml')
    text = "\n".join(p.text for p in soup.find_all("p"))
    text = text.replace("- ", "")
    return path if "virtual" in text.lower() else None

def clean_tei(path):
    """Convert a TEI-XML file to plain text and delete the XML file.

    The text is the first ``<title>`` followed by " \\n", then every
    ``<p>`` (with its ``<ref>`` elements removed) and every figure
    description, joined by " \\n". Every "- " sequence is removed from the
    result. The output is written as UTF-8 next to the input, with the
    last seven characters of ``path`` ("tei.xml") replaced by "txt".

    A missing ``<title>`` yields an empty title and figures without a
    ``<figDesc>`` are skipped instead of raising.

    Args:
        path: Path of the TEI file; expected to end in "tei.xml".
    """
    # Read bytes so the parser honours the encoding declared in the XML.
    with open(path, "rb") as f:
        soup = BeautifulSoup(f.read(), 'lxml-xml')

    # get title
    title = soup.find("title")
    text = (title.text if title is not None else "") + " \n"

    # add paragraphs, without their <ref> elements
    paragraphs = soup.find_all("p")
    for par in paragraphs:
        ref = par.find("ref")
        while ref is not None:
            ref.decompose()
            ref = par.find("ref")
    chunks = [p.text for p in paragraphs]

    # add figure descriptions
    for figure in soup.find_all("figure"):
        description = figure.find("figDesc")
        if description is not None:
            chunks.append(description.text)

    # One join over paragraphs and descriptions keeps the last paragraph
    # and the first description separated.
    text += " \n".join(chunks)
    text = text.replace("- ", "")

    with open(path[:-7] + "txt", "w", encoding="utf-8") as f:
        f.write(text)
    os.remove(path)
    
def clean_xml(path):
    """Convert a publisher XML file to plain text and delete the XML file.

    The text is the first ``<title>`` inside ``<body>`` followed by
    " \\n", then every ``<p>`` of the body joined by " \\n". Before the
    paragraph text is taken, all ``<xref>``, ``<inline-formula>``,
    ``<disp-formula>`` and ``<disp-formula-group>`` elements are removed
    from each paragraph. The output is written as UTF-8 next to the input,
    with the last three characters of ``path`` ("xml") replaced by "txt".

    If the document has no ``<body>``, the whole document is searched
    instead; a missing ``<title>`` yields an empty title.

    Args:
        path: Path of the XML file; expected to end in "xml".
    """
    # Read bytes so the parser honours the encoding declared in the XML.
    with open(path, "rb") as f:
        soup = BeautifulSoup(f.read(), 'lxml-xml')
    body = soup.find("body")
    if body is None:
        # Best effort for files without a body instead of AttributeError.
        body = soup

    # get title
    title = body.find("title")
    text = (title.text if title is not None else "") + " \n"

    # add paragraphs, stripped of cross-references and formulas
    removed_tags = ("xref", "inline-formula", "disp-formula", "disp-formula-group")
    paragraphs = body.find_all("p")
    for par in paragraphs:
        for tag_name in removed_tags:
            # Look the element up again after each removal so nested
            # elements that vanished with their parent are never touched.
            element = par.find(tag_name)
            while element is not None:
                element.decompose()
                element = par.find(tag_name)
    text += " \n".join([p.text for p in paragraphs])

    with open(path[:-3] + "txt", "w", encoding="utf-8") as f:
        f.write(text)
    os.remove(path)

In [None]:
# Look for texts containing "virtual", copy them to the upload folder and
# convert the copies to plain text.
def _timestamp():
    """Return the current UTC time formatted as HH:MM:SS for progress output."""
    return time.strftime("%H:%M:%S", time.gmtime())


# One guard around the whole pipeline: the steps after each pool depend on
# its results, so none of them may run where the pools are skipped.
if __name__ == "__main__":
    # mp.Pool needs at least one worker process.
    workers = max(1, cpu_count)

    print("get_paths", _timestamp())
    path_list = get_paths(journal)
    print(len(path_list), "found", _timestamp())
    print("find_target_words", _timestamp())
    with mp.Pool(workers) as pool:
        found_list = pool.map(find_target_words, path_list)
    # find_target_words returns None for files without a match.
    found_list = [x for x in found_list if x]

    # copy to upload folder
    print("copy_files", _timestamp())
    target_path = f"../../data/target_texts/{journal}/"
    # Also creates missing parent folders and tolerates an existing folder.
    os.makedirs(target_path, exist_ok=True)
    for path in found_list:
        shutil.copyfile(path, f"{target_path}{os.path.basename(path)}")

    # clean TEI
    print("clean_tei", _timestamp())
    tei_list = [target_path+file for file in os.listdir(target_path) if file.endswith("tei.xml")]
    with mp.Pool(workers) as pool:
        # Run for the side effects only; found_list keeps the matched paths.
        pool.map(clean_tei, tei_list)

    # clean XML
    # clean_tei deletes each TEI file it converts, so the folder is listed
    # again here to pick up only the XML files that are left.
    print("clean_xml", _timestamp())
    xml_list = [target_path+file for file in os.listdir(target_path) if file.endswith("xml")]
    with mp.Pool(workers) as pool:
        pool.map(clean_xml, xml_list)

    print("done", _timestamp())