# PREPROCESSING

## Imports

In [35]:
import json
import nltk
import pickle
import re
import spacy
import string

from csv import reader
from ewiser.spacy.disambiguate import Disambiguator
from pathlib import Path


## Constant Paths

In [4]:
# PATHS: INFILES

# Path to the EWISER checkpoint file that is passed to the Disambiguator
EWISER_CHECKPOINT = Path('./ewiser.semcor+wngt.pt')

# Path to directory containing COCA text files
COCA_DIR = Path("./COCA_text/text_2012-2015_ksr")

# Directory containing the Elsevier OA article files (<id>.json)
ELSEVIER_OA_DIR = Path("./elsevier-oa/json")

# Path to file containing names of all files in ELSEVIER_OA_DIR
ELSEVIER_OA_INDEX = Path("./elsevier-oa/os-ccby-40k-ids.csv")

# PATHS: OUTFILES

# Path to directory where the preprocessed COCA data will be saved.
# Kept under its own name so that it is not clobbered by the Elsevier
# output directory assigned to PREPROC_DIR further down.
COCA_PREPROC_DIR = Path("./coca-preproc-spacy/")

# Path to outfile containing the filtered version of the Elsevier corpus
ELSEVIER_DATA_PATH = Path("./raw_data.json")

# Path to directory where the preprocessed Elsevier data will be saved
PREPROC_DIR = Path("./elsevier-preproc-spacy/")

# Path to file where all unique subject areas will be saved
SUBJAREAS = Path("./subjareas.txt")

## Loading the preprocessing pipeline

In [5]:
# Load the small English spaCy model with its 'parser' and 'ner' components disabled.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Build the EWISER disambiguator from the checkpoint file, configured for English.
wsd = Disambiguator(EWISER_CHECKPOINT, lang="en")
# Hook the disambiguator into the spaCy pipeline under the name 'wsd'.
# NOTE(review): the exact effect of `enable` is defined by the ewiser package - confirm there.
wsd.enable(nlp, 'wsd')

2023-06-13 16:08:50 | INFO | pytorch_pretrained_bert.modeling | Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
2023-06-13 16:08:51 | INFO | pytorch_pretrained_bert.tokenization | loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt from cache at /Users/norahahr/.pytorch_pretrained_bert/cee054f6aafe5e2cf816d2228704e326446785f940f5451a5b26033516a4ac3d.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
2023-06-13 16:08:51 | INFO | pytorch_pretrained_bert.modeling | loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz from cache at /Users/norahahr/.pytorch_pretrained_bert/7fb0534b83c42daee7d3ddb0ebaa81387925b71665d6ea195c5447f1077454cd.eea60d9ebb03c75bb36302aa9d241d3b7a04bba39c360cf035e8bf8140816233
2023-06-13 16:08:51 | INFO | pytorch_pretrained_bert.modeling | extracting archive file /Users/norahahr/.pytorch_pretrained_bert/7fb0534b83c42d

# PREPROCESSING: COCA

The preprocessing of the COCA is fairly straightforward. Particular sequences of noise are removed from the corpus and the texts are separated from each other.

In [36]:
def filter_text(doc):
    """
    Read one COCA file that holds several texts and split it into the individual texts.

    Every "#" character and every run of ten space-separated "@" characters is deleted.
    Each text marker ("@@" followed by seven digits) is then reduced to a bare "@@",
    which serves as the separator the content is split on.

    doc: path of the COCA text file to read.
    Returns the list of strings produced by the split (in file order).
    """
    with open(doc, "r") as handle:
        raw = handle.read()

    noise_pattern = re.compile(r"\#|\@ \@ \@ \@ \@ \@ \@ \@ \@ \@")
    marker_pattern = re.compile(r"\@\@\d{7}")

    # Strip the noise first, then normalise the numbered markers to a plain separator.
    without_noise = noise_pattern.sub("", raw)
    with_plain_markers = marker_pattern.sub("@@", without_noise)

    return with_plain_markers.split("@@")

def coca_preprocess_doc(doc):
    """
    Preprocess a single COCA text.

    The text is split into sentences with NLTK, all characters listed in
    `string.punctuation` are removed from every sentence, and the resulting
    sentences are run through the module-level `nlp` pipeline.

    doc: the text to preprocess, as a string.
    Returns a dict with the single key 'text_docs', holding the list of
    objects that `nlp.pipe` produced (one per sentence).
    """
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned = []
    for sentence in nltk.sent_tokenize(doc):
        cleaned.append(sentence.translate(strip_punctuation))

    return {'text_docs': list(nlp.pipe(cleaned))}

In [7]:
# Preprocess the 2015 COCA files of every genre and store the result in one pickle.
genres = ['mag', 'fic', 'news', 'spok']

# PREPROC_DIR is bound to the Elsevier output directory, so the COCA output
# directory is named explicitly here to keep the two corpora apart.
coca_out_dir = Path("./coca-preproc-spacy/")
# Opening a file for writing does not create missing parent directories.
coca_out_dir.mkdir(parents=True, exist_ok=True)

all_texts = []

for genre in genres:
    filtered_texts = filter_text(COCA_DIR / f"2015_{genre}.txt")

    for text in filtered_texts:
        # Pass the text of the current iteration; the name `doc` is not defined
        # by this cell and only exists if another cell happened to leak it.
        pp_text = coca_preprocess_doc(text)
        all_texts.append(pp_text)

with open(coca_out_dir / '2015.pickle', 'wb') as f:
    pickle.dump(all_texts, f)
            

# PREPROCESSING: ELSEVIER OA CC-BY

Only the subject areas and the textual data are relevant for the analysis, so all other data is filtered from the corpus. The full body text is also reconstructed in the original sentence order. Each article is stored with its document ID, a list of subject areas, and its full text as one entry in a JSON file.

In [None]:
# Reduce every article listed in the index to its ID, subject areas and body text,
# and store all of them in a single JSON file.
json_obj = []

# newline="" is the way the csv module expects its input files to be opened.
with open(ELSEVIER_OA_INDEX, newline="") as id_file:
    for row in reader(id_file):
        # A blank line in the index yields an empty row with no ID to look up.
        if not row:
            continue

        article_path = ELSEVIER_OA_DIR / f"{row[0]}.json"
        with open(article_path, encoding="utf-8") as file:
            data = json.load(file)

        # Restore the original sentence order before joining the body text.
        sentences = sorted(data["body_text"], key=lambda x: x['startOffset'])
        body_text = " ".join(entry["sentence"] for entry in sentences)

        json_obj.append({
            "docID": data.get("docId", "null"),
            # Subject areas are consumed as a collection later on; an empty list
            # keeps a missing value from being iterated character by character.
            "subjareas": data["metadata"].get("subjareas", []),
            "body_text": body_text,
        })

with open(ELSEVIER_DATA_PATH, "w") as outfile:
    json.dump(json_obj, outfile)

Next, we find all academic disciplines that are contained within the corpus, and thereafter we preprocess the documents.

In [1]:
# Collect every subject area that occurs in the filtered Elsevier corpus.
unique_subjareas = set()

with open(ELSEVIER_DATA_PATH, "r") as f:
    elsevier_data = json.load(f)

for article in elsevier_data:
    areas = article.get("subjareas", [])
    # set.update() on a plain string (e.g. a "null" placeholder) would add its
    # individual characters as subject areas, so such values are skipped.
    if isinstance(areas, str):
        continue
    unique_subjareas.update(areas)

# Print the unique subjareas
print(unique_subjareas)

# Write the areas in sorted order so the file is identical between runs
# (set iteration order of strings is not stable across interpreter sessions).
with open(SUBJAREAS, "w") as f:
    for subjarea in sorted(unique_subjareas):
        f.write("%s\n" % subjarea)

{'ECON', 'DENT', 'ENGI', 'MULT', 'EART', 'ENVI', 'NEUR', 'NURS', 'MEDI', 'ENER', 'VETE', 'MATE', 'PHYS', 'MATH', 'SOCI', 'IMMU', 'DECI', 'PSYC', 'ARTS', 'AGRI', 'COMP', 'PHAR', 'CENG', 'HEAL', 'CHEM', 'BIOC', 'BUSI'}


In [46]:
def elsevier_preprocess_document(doc):
    """
    Preprocess one article of the filtered Elsevier OA CC-BY corpus.

    doc: dict with the keys 'docID', 'subjareas' and 'body_text'.

    The body text is split into sentences with NLTK, all characters listed in
    `string.punctuation` are removed from every sentence, and the sentences are
    run through the module-level `nlp` pipeline.

    Returns a dict with the keys 'docID' and 'subjareas' (both copied from the
    input) and 'body_text_docs' (the list of objects produced by `nlp.pipe`,
    one per sentence).
    """
    # Read all three fields up front so a missing key fails before any processing.
    identifier, areas, raw_text = doc['docID'], doc['subjareas'], doc['body_text']

    # Translation table that deletes every punctuation character.
    punctuation_table = str.maketrans('', '', string.punctuation)
    sentences = [
        sentence.translate(punctuation_table)
        for sentence in nltk.sent_tokenize(raw_text)
    ]

    return {
        'docID': identifier,
        'subjareas': areas,
        'body_text_docs': list(nlp.pipe(sentences)),
    }

In [None]:
# Preprocess the articles of each subject area and store one pickle per area.
# Sorted so that the areas are processed in the same order on every run.
subjareas = sorted(unique_subjareas)

# Opening a file for writing does not create missing parent directories.
PREPROC_DIR.mkdir(parents=True, exist_ok=True)

for subject in subjareas:
    print(subject)

    # An article that lists several subject areas is included once per area.
    subject_list = [
        elsevier_preprocess_document(doc)
        for doc in elsevier_data
        if subject in doc['subjareas']
    ]

    with open(PREPROC_DIR / f'{subject}.pickle', 'wb') as f:
        pickle.dump(subject_list, f)