In [1]:
import re
import string
import contractions
from nltk import word_tokenize, pos_tag
from collections import Counter

In [2]:
# Read Abstracts
abstracts_list = []
with open('data/abstracts.txt', 'r', encoding="UTF-8") as abstracts_file:
    # Each line is split on '|--|' and the second field is kept with its
    # newline characters removed. A line that does not contain the delimiter
    # has no second field and raises IndexError.
    abstracts_list.extend(
        raw_line.split('|--|')[1].replace("\n", "")
        for raw_line in abstracts_file
    )

In [16]:
# Non-greedy match of an opening parenthesis up to the first closing one.
# Written as a raw string so that "\(" is not treated as an (invalid) string
# escape sequence by the Python parser.
_ROUND_BRACKETS = re.compile(r'\(.*?\)')


def remove_round_brackets(txt):
    """Return *txt* with every ``(...)`` span removed.

    The match is non-greedy and starts at the first ``(``, so for nested
    brackets the span ends at the first ``)`` encountered; the remainder of
    the outer bracket is left in place. ``.`` does not match a newline, so a
    bracket pair split across lines is not removed.
    """
    return _ROUND_BRACKETS.sub('', txt)

def remove_punctuation_marks(txt):
    """Return *txt* with every character of ``string.punctuation`` deleted."""
    # The third argument of maketrans lists the characters to drop.
    return txt.translate(str.maketrans('', '', string.punctuation))

def fix_white_space(txt):
    """Collapse every run of whitespace in *txt* into a single space.

    Leading and trailing whitespace is removed as well.
    """
    # split() without arguments splits on any whitespace run and discards
    # empty strings at both ends.
    words = txt.split()
    return ' '.join(words)

def convert_to_lowercase(txt):
    """Return a lowercase copy of *txt*."""
    lowered = txt.lower()
    return lowered

def remove_contraction(txt):
    """Return the result of running *txt* through ``contractions.fix``."""
    expanded = contractions.fix(txt)
    return expanded

def tokenize_text(txt):
    """Split *txt* into tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(txt)
    return tokens

def denoise_text(txt):
    """Strip bracketed spans and punctuation from *txt*, then tidy whitespace.

    The steps run in a fixed order: round-bracket removal, punctuation
    removal, whitespace normalisation.
    """
    pipeline = (
        remove_round_brackets,
        remove_punctuation_marks,
        fix_white_space,
    )
    for step in pipeline:
        txt = step(txt)
    return txt

def normalize_text(txt):
    """Lowercase *txt*, expand its contractions and return its tokens.

    Args:
        txt: The text to normalise.

    Returns:
        The list of tokens produced by ``tokenize_text`` for the lowercased,
        contraction-expanded text.
    """
    # Each step consumes the output of the previous one; feeding the raw
    # input to a later step would silently discard the earlier work.
    text = convert_to_lowercase(txt)
    text = remove_contraction(text)
    return tokenize_text(text)

# Universal-tagset part-of-speech classes whose words remove_tags filters out.
_DROPPED_POS_TAGS = frozenset(
    {"ADV", "ADP", "CONJ", "DET", "NUM", "PRT", "PRON"}
)


def remove_tags(tokens):
    """Drop tokens whose word was tagged with one of the filtered POS classes.

    The tokens are tagged with ``pos_tag`` using the ``universal`` tagset.
    Filtering is done per word, not per occurrence: if any occurrence of a
    word receives one of the filtered tags, every occurrence of that word is
    removed from the result.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    tagged = pos_tag(tokens, tagset='universal')
    # A set gives constant-time membership tests in the filter below.
    dropped_words = {word for word, tag in tagged if tag in _DROPPED_POS_TAGS}
    return [item for item in tokens if item not in dropped_words]

def remove_common_words(tokens):
    """Return the tokens that are not one of a fixed set of common verbs.

    The comparison is exact, so differently-cased variants are kept.
    """
    excluded = ('is', 'are', 'can', 'be', 'has', 'have', 'been')
    kept = []
    for token in tokens:
        if token in excluded:
            continue
        kept.append(token)
    return kept

def clean_up(tokens):
    """Filter *tokens* by part-of-speech tag, then remove common words."""
    return remove_common_words(remove_tags(tokens))

# Run every abstract through the pipeline: denoise -> normalize -> clean up.
abstracts_preprocessed = []
for abstract in abstracts_list:
    abstracts_preprocessed.append(
        clean_up(normalize_text(denoise_text(abstract)))
    )
    

In [17]:
# Write Pre-processed Abstracts
with open('data/abstracts_preprocessed.txt', 'w', encoding="UTF-8") as out_file:
    # One record per abstract: tokens joined by ',', records joined by '#'.
    records = [",".join(tokens) for tokens in abstracts_preprocessed]
    out_file.write("#".join(records))