## **Further Cleaning of the Fulltexts**

In [None]:
import pandas as pd
import os
import re
from tqdm.notebook import tqdm
from datetime import datetime
from collections import Counter
from itertools import chain

import scispacy
import spacy
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize

import multiprocessing as mp
# Leave two cores free for the rest of the system, but never go below one
# worker: mp.Pool() rejects a process count smaller than 1.
cpu_count = max(1, mp.cpu_count() - 2)

In [None]:
# Returns the paths of the texts (those in which "virtual" was already found)
# that have not been cleaned yet

def get_paths(journal, basepath, outpath):
    """Return the input files of one journal that have no cleaned counterpart.

    A file counts as done when a file with the same name already exists in
    the journal's output folder.

    Args:
        journal: Name of the journal sub-folder.
        basepath: Input root; paths are built by plain concatenation, so it
            has to end with "/".
        outpath: Output root, concatenated the same way.

    Returns:
        List of "<basepath><journal>/<filename>" strings in arbitrary order.
    """
    available = set(os.listdir(basepath + journal))
    already_cleaned = set(os.listdir(outpath + journal))
    source_dir = basepath + journal + "/"

    return [source_dir + filename for filename in available.difference(already_cleaned)]

def rejoin_sents(cleaned_sents):
    """Merge sentence fragments until every entry ends with a full stop.

    A fragment ending in a letter gets a "." appended and closes the current
    sentence; a fragment ending in "." closes it as well. Any other fragment
    is buffered (followed by a space) and prepended to the next one.

    Args:
        cleaned_sents: Iterable of sentence strings. Empty strings are
            skipped.

    Returns:
        List of joined sentences. A buffered fragment that is never closed
        is appended unchanged (including its trailing space) so no text is
        lost. For input without any fragment the result is [""].
    """
    joined_sents = []
    pending = ""
    for sent in cleaned_sents:
        if not sent:
            # Nothing to inspect; indexing sent[-1] would raise IndexError.
            continue
        if sent[-1].isalpha():
            sent += "."
        if sent[-1] == ".":
            joined_sents.append((pending + sent).replace("  ", " "))
            pending = ""
        else:
            pending += sent + " "

    # Flush the unterminated tail. The empty-result case keeps the historic
    # behaviour of returning the (possibly empty) buffer as the only entry.
    if pending or not joined_sents:
        joined_sents.append(pending)

    return joined_sents

def clean_paragraphs(paragraphs):
    """Remove bracketed text and OCR noise from paragraphs, sentence by sentence.

    Sentence segmentation and tokenisation come from the module-level spaCy
    pipeline ``nlp``. Within each sentence only pure-digit tokens, purely
    alphabetic tokens and the punctuation marks , ; . : are kept.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        List of cleaned paragraphs with one sentence per line. Paragraphs
        that keep no sentence longer than 10 characters are dropped.
    """
    kept_punctuation = {",", ";", ".", ":"}
    cleaned_paragraphs = []

    for p in paragraphs:

        p = p.replace("{", "(")  # braces are often OCR misreadings of parentheses
        p = p.replace("}", ")")
        # Raw string literal: "\(" in a plain literal is an invalid escape sequence.
        p = re.sub(r"\(.*?\)", "", p)  # drop everything inside parentheses

        # sentence segmentation
        doc = nlp(p)

        # cleaning
        cleaned_sents = []
        for sent in doc.sents:
            # Non-alphabetic, non-digit tokens are dropped, which also removes
            # most badly recognised OCR tokens.
            sent_text = "".join(
                token.text_with_ws
                for token in sent
                if token.is_digit or token.is_alpha or token.text in kept_punctuation
            )
            if len(sent_text) > 10:
                cleaned_sents.append(sent_text)

        if not cleaned_sents:
            continue

        out_p = "\n".join(cleaned_sents)
        out_p = out_p.replace("  ", " ")
        if out_p[0] == " ":
            out_p = out_p[1:]
        # The replacements below are order-dependent (a later pattern can be
        # produced by an earlier replacement), so the sequence is kept as is.
        out_p = out_p.replace(" . ", ". ")
        out_p = out_p.replace(" , ", " ")
        out_p = out_p.replace(" , ", ", ")
        out_p = out_p.replace("..", ".")
        out_p = out_p.replace(". .", "")
        out_p = out_p.replace(" . ", "")
        out_p = out_p.replace(", ,", "")
        out_p = out_p.replace(" , ", "")
        out_p = out_p.replace(" ; ", "")
        out_p = out_p.replace(" ; ", ";")
        out_p = out_p.replace(" ;", "; ")

        if not out_p:
            # Everything was stripped; indexing out_p[-1] would raise IndexError.
            continue
        if out_p[-1].isalpha():
            out_p += "."

        cleaned_paragraphs.append(out_p)

    return cleaned_paragraphs

# init clean
# save to disc
def innit(path):
    """Clean one text file and write the result below the global ``outpath``.

    The file is split into paragraphs at single newlines and passed through
    clean_paragraphs(). The output keeps the last two components of ``path``
    (journal folder and file name).

    Args:
        path: "/"-separated path to the input text file.
    """
    with open(path, "r") as source:
        raw_paragraphs = source.read().split("\n")

    cleaned = clean_paragraphs(raw_paragraphs)

    parts = path.split("/")
    target = outpath + parts[-2] + "/" + parts[-1]

    # paragraphs separated by double newline
    with open(target, "w") as sink:
        sink.write("\n\n".join(cleaned))

In [None]:
# spaCy pipeline used by clean_paragraphs(), which reads it as the global `nlp`.
nlp = spacy.load("en_core_sci_lg")

# Input and output roots; both end in "/" because all paths in this notebook
# are built by string concatenation. innit() reads `outpath` as a global.
basepath = "../../data/target_texts/"
outpath = "../../data/cleaned_texts/"

journals = ["rmp", "pr", "pra", "prb", "prc", "prd", "pre", "prl"]

for journal in journals:
    
    # Progress log: journal name and current wall-clock time.
    print(journal, " - ", datetime.now().strftime("%H:%M:%S"))
    
    # The per-journal output folder must exist before get_paths() lists it.
    if not os.path.isdir(outpath + journal):
        os.mkdir(outpath + journal)
    
    # Only files without a cleaned counterpart are returned.
    paths = get_paths(journal, basepath, outpath)
    
    # Clean the remaining files in parallel, one innit() call per file.
    # NOTE(review): the workers need `nlp` and `outpath`; presumably they
    # inherit them from this process - confirm for the start method in use.
    if __name__ == "__main__":
        with mp.Pool(cpu_count) as pool:
            pool.map(innit, paths)

### **Connect and Store Data in DataFrame**

In [None]:
# Journals whose combined metadata should be loaded
journals = ["rmp", "pr", "pra", "prb", "prc", "prd", "pre", "prl"]

# One metadata frame per journal; journals without a pickle on disk are skipped
dfs = [
    pd.read_pickle(pkl_path)
    for pkl_path in (
        f"../../data/combined_metadata/{journal}_metadata.pkl" for journal in journals
    )
    if os.path.isfile(pkl_path)
]

# Stack the frames, keeping each frame's own index
df = pd.concat(dfs, ignore_index=False)

In [None]:
## check if cleaned texts still contain "virtual"
outpath = "../../data/cleaned_texts/"
target_dois = {}  # DOI -> cleaned full text
counter = 0       # number of texts without "virtual" after cleaning

for journal in tqdm(journals):
    journal_dir = outpath + journal
    for filename in os.listdir(journal_dir):
        with open(journal_dir + "/" + filename, "r") as handle:
            text = handle.read()
        # Case-insensitive search; the stored text keeps its original casing
        if "virtual" not in text.lower():
            counter += 1
            continue
        # DOI = fixed prefix + file name without its last four characters
        target_dois["10.1103/" + filename[:-4]] = text
# German message: reports how many texts no longer qualify
print(counter, "Texte fallen noch raus.")

# merge together
target_df = df.drop(
    columns=["volume", "issue", "bibcode", "aff", "arxiv_handle", "first_author", "database", "section"]
).copy()
# The Series is aligned on the frame's index, so rows without a matching DOI get NaN
target_df = target_df.assign(text=pd.Series(dict(target_dois)))
# keep only articles that have a full text (i.e. one containing "virtual")
target_df = target_df.loc[target_df["text"].notna()]
target_df.to_pickle("../../data/cleaned_texts/cleaned_texts_df.pkl")

In [None]:
# Paragraph-level table; the cells below read its columns `virtual_deps`,
# `lemmas` and `doi` and later overwrite this same pickle file.
para_df = pd.read_pickle("../../data/cleaned_texts/filtered_paragraphs.pkl")

In [None]:
# Dependency lists of all paragraphs that have at least one dependency
vdeps = list(filter(None, para_df.virtual_deps))
# Ten most frequent dependencies that contain "virtually"
Counter(dep for dep in chain.from_iterable(vdeps) if "virtually" in dep).most_common(10)

In [None]:
def filter_adverbs(deps):
    """Remove dependencies containing the target word unless whitelisted.

    The word to screen for is read from the module-level ``target_word``.

    Args:
        deps: Collection of dependency strings, or None/empty.

    Returns:
        List of the dependencies that either do not contain ``target_word``
        or are in the whitelist; None if ``deps`` is empty or nothing is left.
    """
    keep = {
        "virtually_excited_state",
        "virtually_excited",
        "virtually_energy",
        "virtually_effect",
        "virtually_state",
        "virtually_bound_state",
    }

    if not deps:
        return None

    surviving = [dep for dep in deps if target_word not in dep or dep in keep]
    return surviving if surviving else None
    
# Merge dependencies into cleaned_texts df
def get_list(llist):
    """Flatten an iterable of lists by one level, skipping falsy entries.

    Args:
        llist: Iterable whose items are lists, empty lists or None.

    Returns:
        A single list with the elements of all non-empty items in order.
    """
    return list(chain.from_iterable(group for group in llist if group))

In [None]:
# Word that filter_adverbs() screens for; it is read there as a global
target_word = "virtually"
para_df["virtual_deps"] = para_df["virtual_deps"].apply(filter_adverbs)

# Drop paragraphs that are left without any dependency
para_df = para_df.loc[para_df["virtual_deps"].notna()].copy()
para_df = para_df.reset_index(drop=True)

# Paragraph length = number of items after flattening `lemmas` by one level
para_df["length"] = para_df["lemmas"].apply(
    lambda nested: sum(1 for _ in chain.from_iterable(nested))
)
# Keep paragraphs with more than 4 and fewer than 1500 items
para_df = para_df.loc[(para_df["length"] > 4) & (para_df["length"] < 1500)].copy()
para_df = para_df.reset_index(drop=True)

# The fresh 0..n-1 index doubles as the paragraph identifier
para_df["paragraph_id"] = para_df.index

para_df.to_pickle("../../data/cleaned_texts/filtered_paragraphs.pkl")

In [None]:
# Reload the article-level table and replace its dependency column with the
# filtered, paragraph-derived one.
df = pd.read_pickle("../../data/cleaned_texts/cleaned_texts_df.pkl")
df = df.drop(["virtual_deps"], axis=1)

# One flat dependency list per DOI, collected over all of its paragraphs
p_df = para_df.groupby("doi").agg(
    {"virtual_deps" : get_list})


# Left join on the index: articles without any paragraph get NaN
df = df.merge(p_df, how="left", left_index=True, right_index=True)

# Normalise missing (NaN) and empty lists to None. isinstance() also covers
# float subclasses such as numpy.float64, which an exact type comparison would
# let through to len() and fail on.
df.virtual_deps = df.virtual_deps.apply(
    lambda x: x if not isinstance(x, float) and len(x) > 0 else None
)

df.to_pickle("../../data/cleaned_texts/cleaned_texts_df.pkl")

## **Dependency Parsing and fixing of common OCR errors**

In [None]:
# Reload the article table and keep only rows that still have dependencies;
# the dependency column itself is dropped afterwards.
df = pd.read_pickle("../../data/cleaned_texts/cleaned_texts_df.pkl")
df = df.loc[df["virtual_deps"].notna()].copy()
df = df.drop(columns=["virtual_deps"])

In [None]:
# Fix errors where the OCR did not recognise the spaces, e.g. "virtualparticle"
def fix_ocr(text):
    """Lower-case a text and split words that OCR fused with "virtual...".

    Tokens that contain one of the stems "virtualities", "virtuality",
    "virtually" or "virtual" without being exactly that stem are split around
    the first occurrence of the longest matching stem, e.g.
    "virtualparticle" -> "virtual particle". Text before the stem and
    everything after it is kept as separate tokens; empty parts are dropped.

    Args:
        text: Full text of one article.

    Returns:
        The lower-cased, re-joined text. Tokens are joined with single spaces
        and the space in front of . , ; : is removed again.
    """
    # Longest stem first, so "virtualities" is not matched as "virtual".
    stems = ("virtualities", "virtuality", "virtually", "virtual")

    text = text.lower()
    new_sents = []
    for sent in sent_tokenize(text):
        new_words = []
        for word in word_tokenize(sent):
            if "virtual" not in word or word in stems:
                new_words.append(word)
                continue
            stem = next(s for s in stems if s in word)
            # partition() keeps both sides of the stem; taking split(stem)[1]
            # would lose the prefix and anything after a second occurrence.
            prefix, _, suffix = word.partition(stem)
            new_words.extend(part for part in (prefix, stem, suffix) if part)
        new_sents.append(" ".join(new_words))

    new_text = " ".join(new_sents)
    # Joining with spaces detaches punctuation tokens; re-attach them.
    for mark in (".", ",", ";", ":"):
        new_text = new_text.replace(" " + mark, mark)
    return new_text
    
def depparsing(doi):
    """Collect the dependency context of every "virtual..." token of one text.

    The text is read from the global frame ``df`` (row ``doi``, column
    "text") and parsed with the global spaCy pipeline ``nlp``.

    Args:
        doi: Index label of the row in ``df``.

    Returns:
        List of word lists, one per token whose text contains "virtual".
        For "virtuality"/"virtualities" the list holds the token followed by
        its children. Otherwise it holds the token followed by its heads up
        to and including the first NOUN; when the root is reached without
        finding a NOUN, the root appears twice at the end.
    """
    doc = nlp(df.at[doi, "text"])

    deps = []
    for sent in doc.sents:
        for token in sent:
            if "virtual" not in token.text:
                continue

            words = [token.text]
            if token.text in ("virtuality", "virtualities"):
                words.extend(child.text for child in token.children)
            else:
                head = token.head
                while head.pos_ != "NOUN":
                    words.append(head.text)
                    if head == head.head:  # the root is its own head: stop here
                        break
                    head = head.head
                words.append(head.text)
            deps.append(words)
    return deps
        
    
def join_deps(deps):
    """Join each dependency word list into one "_"-separated string.

    Args:
        deps: List of word lists.

    Returns:
        List of joined strings, or None if no non-empty word list exists.
    """
    joined = []
    for dep in deps:
        # A duplicated final word (produced for adverbs) is collapsed first
        if len(dep) >= 2 and dep[-1] == dep[-2]:
            dep = dep[:-1]
        if dep:
            joined.append("_".join(dep))
    return joined if joined else None
    
    
# cleans dependencies to merge plural / singular
def clean_a_bit(deps):
    """Crudely singularise the words of each dependency list.

    Rules, checked in this order for words ending in "s":
    words in the skip set and words ending in "ss" stay unchanged,
    "...ies" becomes "...y", "...sses" loses its final "es",
    every other word loses its final "s". All other words (including the
    empty string) are passed through unchanged.

    Args:
        deps: List of word lists.

    Returns:
        New list of word lists with the same shape as ``deps``.
    """
    skip_set = {"nucleus", "axis", "indices", "hypothesis"}
    return_deps = []
    for dep in deps:
        temp_dep = []
        for word in dep:
            # endswith() is safe for empty strings, unlike indexing word[-1]
            if not word.endswith("s") or word in skip_set or word.endswith("ss"):
                temp_dep.append(word)
            elif word.endswith("ies"):
                temp_dep.append(word[:-3] + "y")
            elif word.endswith("sses"):
                temp_dep.append(word[:-2])
            else:
                temp_dep.append(word[:-1])

        return_deps.append(temp_dep)
    return return_deps

In [None]:
# Repair fused "virtual..." tokens in every full text, one row at a time

for doi in tqdm(df.index):

    df.at[doi, "text"] = fix_ocr(df.at[doi, "text"])

# Stored as a pickle despite the ".df" file extension
df.to_pickle("../../data/cleaned_texts/virtual_fulltexts.df")

In [None]:
# Dependency Parsing

# Reload the OCR-corrected full texts from the pickle written after fix_ocr().
df = pd.read_pickle("../../data/cleaned_texts/virtual_fulltexts.df")

# depparsing() reads both `nlp` and `df` as module-level globals, so they are
# set up before the worker pool is created.
nlp = spacy.load("en_core_sci_lg")

# Placeholder columns that are filled row by row below.
df["deps"] = None
df["cleaned_deps"] = None

# One depparsing() call per index label (presumably the DOI - confirm);
# pool.map returns the results in input order.
if __name__ == "__main__":
    with mp.Pool(cpu_count) as pool:
        dep_list = pool.map(depparsing, df.index)

# NOTE(review): dep_list only exists if the guarded block above has run.
for doi, deps in zip(df.index, dep_list):
    
    # "deps" stores the joined raw word lists, "cleaned_deps" the joined
    # lists after clean_a_bit(); join_deps() yields None for an empty result.
    cleaned_deps = clean_a_bit(deps)
    df.at[doi, "deps"] = join_deps(deps)
    df.at[doi, "cleaned_deps"] = join_deps(cleaned_deps)
    
df.to_pickle("../../data/cleaned_texts/virtual_fulltexts.df")