In [None]:
import re
import string
import glob
import gzip
import xml.etree.ElementTree as ET
from multiprocessing import Pool, cpu_count
import numpy as np
from smart_open import smart_open
from tqdm import tqdm


def preprocessing(path, corpus_file='corpus_file.txt'):
    """
    Creates corpus based on articles from annual baseline of Medline/Pubmed Database

    Every ``.gz`` file selected by ``path`` is handed to a worker process
    (``process_article_set``) and the text each worker returns is appended to
    ``corpus_file`` in the order the files were submitted.

    Parameters
    ----------
    path : string
        Path to folder with files .gz from pubmed. The folder may be given
        with or without a trailing separator. A value that is not an existing
        folder is treated as a filename prefix (``<path>*.gz``).
    corpus_file: string
        Name of file, in which corpus will be created
    """
    # Local import so the function does not depend on a module-level
    # ``import os`` being present.
    import os

    if os.path.isdir(path):
        # Plain string formatting would turn "folder" into the pattern
        # "folder*.gz", which looks next to the folder instead of inside it.
        pattern = os.path.join(path, '*.gz')
    else:
        pattern = '{}*.gz'.format(path)
    # glob returns files in arbitrary order; sorting makes the corpus
    # reproducible between runs.
    archives = sorted(glob.glob(pattern))
    with smart_open(corpus_file, mode='w+') as corpus, Pool(cpu_count()) as p:
        # imap yields results in submission order, one string per archive.
        for articles in p.imap(process_article_set, archives):
            corpus.write(articles)


def process_article_set(file):
    """Convert one gzipped XML file into corpus text.

    Parameters
    ----------
    file : string
        Path to a gzip-compressed XML file. Every direct child of the XML
        root is passed to ``process_article``.

    Returns
    -------
    string
        The concatenated results of ``process_article`` for all children of
        the root, or an empty string when the XML cannot be parsed.
    """
    translator = str.maketrans('', '', string.punctuation)
    with gzip.open(file) as xml_file:
        try:
            article_set = ET.parse(xml_file).getroot()
        except ET.ParseError:
            # Best effort: a malformed file contributes nothing to the corpus
            # instead of aborting the whole run.
            return ''
    # The tree is fully loaded at this point, so the file can already be
    # closed. join() builds the result in one pass instead of repeatedly
    # growing a string with +=.
    return ''.join(
        process_article(article, translator) for article in article_set)


def process_article(article, translator):
    """Build one corpus line from a single article element.

    The line consists of the article title, every abstract section and the
    MeSH descriptor names, each lowercased, stripped of punctuation and
    separated by single spaces.

    Parameters
    ----------
    article : xml.etree.ElementTree.Element
        Element that is searched for ``MedlineCitation/Article/ArticleTitle``,
        ``MedlineCitation/Article/Abstract/AbstractText`` and
        ``MedlineCitation/MeshHeadingList``. Missing parts are skipped.
    translator : dict
        Translation table for ``str.translate`` (as built by
        ``str.maketrans``) that removes unwanted characters.

    Returns
    -------
    string
        The cleaned text followed by a newline; only a newline when none of
        the expected parts is present.
    """
    fields = [article.find('MedlineCitation/Article/ArticleTitle')]
    # An abstract can be split into several AbstractText sections;
    # find() alone would keep only the first one.
    fields.extend(
        article.findall('MedlineCitation/Article/Abstract/AbstractText'))
    # process_mesh_heading returns a plain string, which process_raw_text
    # accepts alongside elements.
    fields.append(process_mesh_heading(
        article.find('MedlineCitation/MeshHeadingList')))
    cleaned = (process_raw_text(field, translator) for field in fields)
    # Join the non-empty parts with a space so the last word of one part is
    # never glued to the first word of the next.
    return '{}\n'.format(' '.join(part for part in cleaned if part))


def process_raw_text(data, translator):
    """Removes punctuation and uppercase from given element or string.

    Parameters
    ----------
    data : xml.etree.ElementTree.Element or string or None
        For an element, the text of the element and of all nested elements
        is used, so inline markup does not cut the text short.
    translator : dict
        Translation table for ``str.translate``.

    Returns
    -------
    string
        Lowercased text with whitespace runs collapsed to single spaces, or
        an empty string when ``data`` holds no usable text.
    """
    if data is None:
        return ''
    if isinstance(data, str):
        text = data
    else:
        try:
            text = ''.join(data.itertext())
        except AttributeError:
            # Not an element-like object: treat it as having no text.
            return ''
    return ' '.join(text.lower().translate(translator).split())


def process_mesh_heading(data):
    """Reads meshheadinglist and returns names of descriptors.

    Parameters
    ----------
    data : xml.etree.ElementTree.Element or None
        Element searched for ``MeshHeading/DescriptorName`` children.

    Returns
    -------
    string
        Lowercased descriptor names separated by spaces; an empty string
        when ``data`` is None or has no descriptors.
    """
    if data is None:
        return ''
    descriptors = data.findall('MeshHeading/DescriptorName')
    # Skip descriptors without text instead of failing on None.lower().
    return ' '.join(
        descriptor.text.lower() for descriptor in descriptors
        if descriptor.text)


def length_of_local_context(path):
    """Finds the length of local context window, which fully covers every single article

    Each line of the corpus file is treated as one article and its
    whitespace-separated words are counted in a pool of worker processes.

    Parameters
    ----------
    path : string
        Path to the corpus file.

    Returns
    -------
    string
        Report with the mean, maximum and median number of words per line.

    Raises
    ------
    ValueError
        If the corpus file contains no lines.
    """
    # The corpus is only read, so plain 'r' is enough; 'r+' would fail on
    # files without write permission.
    with open(path, 'r') as corpus, Pool(cpu_count()) as p:
        results = list(p.imap(count_words, corpus))
    if not results:
        # np.max raises an opaque ValueError on an empty sequence; fail with
        # the same exception type but a message that names the cause.
        raise ValueError('corpus file {!r} contains no lines'.format(path))
    return 'Average length: {} \n Max: {} \n Median: {}'.format(
        np.mean(results), np.max(results), np.median(results))

    
def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    words = split_words(text)
    return len(words)


def split_words(text):
    """Return the list of words obtained by splitting ``text`` on whitespace."""
    words = text.split()
    return words

Removing noisy data such as the most common English words and integers

In [None]:
# Words to drop from the vocabulary, one per line in the source file.
# Kept in a frozenset because process_line tests membership once per
# vocabulary line; a list would be scanned from the start on every lookup.
# The file is only read, so it is opened with 'r' instead of 'r+'.
with smart_open('google-10000-english.txt', 'r') as f:
    top_english_words = frozenset(word.strip('\n') for word in f)


def cast_to_integer(word):
    """Convert ``word`` to an integer if possible.

    Parameters
    ----------
    word : object
        Value passed to ``int``.

    Returns
    -------
    int or None
        ``int(word)``, or None when the conversion fails. Note that a
        successful result can be ``0``, so callers should compare against
        None rather than test truthiness.
    """
    try:
        return int(word)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare except would also
        # hide KeyboardInterrupt and SystemExit.
        return None
    
def process_line(line):
    """Decide whether a vocabulary line survives filtering.

    Parameters
    ----------
    line : string
        One vocabulary line. Its first whitespace-separated token is the
        word; any further tokens (such as a counter) are ignored.

    Returns
    -------
    string or None
        ``line`` unchanged (trailing newline included) when the word is
        kept. None when the line is blank, the word is listed in
        ``top_english_words`` or the word is an integer.
    """
    fields = line.split()
    if not fields:
        # Blank line: nothing to keep, and nothing to unpack.
        return None
    word = fields[0]
    if word in top_english_words:
        return None
    # Compare with None explicitly: int("0") is falsy, so a plain truth test
    # would let the word "0" through.
    if cast_to_integer(word) is not None:
        return None
    return line


# Stream the vocabulary through a pool of workers and write out only the
# lines that process_line does not reject. The input is only read, so it is
# opened with 'r' instead of 'r+'.
with smart_open('vocabulary10.txt', 'r') as vocab, \
        smart_open('filtered_vocab_10.txt', 'w+') as output, \
        Pool(cpu_count()) as p:
    for line in tqdm(p.imap(process_line, vocab)):
        if line:
            # Lines read from the file still carry their trailing newline;
            # strip it before adding one, otherwise every entry is followed
            # by a blank line.
            output.write('{}\n'.format(line.rstrip('\n')))