In [1]:
import multiprocessing
from typing import Iterator
import spacy

In [2]:
# Load the spaCy English pipeline once; the processing functions in this
# notebook read this module-level `nlp` object.
# NOTE(review): 'en' looks like a shortcut-link model name -- confirm the
# installed spaCy version still resolves it (newer releases expect the full
# package name).
nlp = spacy.load('en')

In [3]:
# Path constants
# Every corpus file lives in the same directory; edit DATA_DIR alone to
# point the notebook at a different location.
DATA_DIR = '/Volumes/Lexar/plaintext'
# Source text file used as input by the processing cells.
INPUTFILE = DATA_DIR + '/titleabstract.txt'
# Destination for the per-document token strings.
OUTPUTFILE = DATA_DIR + '/titleabstract_tokens.txt'
# Destination for the per-sentence (unigram) token strings.
UNISENT = DATA_DIR + '/titleabstract_uni_sent.txt'

In [4]:
def get_doc(infile: str) -> Iterator[str]:
    '''Lazily read a text file, yielding one cleaned string per line.
    Parameters
        infile: full path string to a UTF-8 text file
    Returns
        an iterator over the file's lines, each with its leading and
        trailing whitespace (including the newline) removed; a blank
        line is yielded as an empty string
    '''
    with open(infile, 'r', encoding='utf-8') as handle:
        # str.strip trims all surrounding whitespace, not only the newline.
        yield from map(str.strip, handle)

In [5]:
def doc2tokens(infile: str) -> Iterator[str]:
    '''Lemmatize each document in a file and drop its punctuation tokens.
    Parameters
        infile: full string path to file with newline terminated strings
    Returns
        an iterator with one string per document: the lemmas of the
        document's non-punctuation tokens joined by single spaces
    '''
    # Documents are streamed through the shared pipeline in batches.
    # NOTE(review): `n_threads` is passed exactly as before -- confirm the
    # installed spaCy version still accepts this argument.
    parsed_docs = nlp.pipe(get_doc(infile),
                           batch_size=10000,
                           n_threads=multiprocessing.cpu_count())
    for parsed in parsed_docs:
        lemmas = [tok.lemma_ for tok in parsed if not tok.is_punct]
        yield ' '.join(lemmas)

In [6]:
def doctokens2file(infile: str, outfile: str) -> str:
    '''Lemmatize every document in `infile` and save the results.
    One processed document (lemmas, punctuation removed) is written per
    line of `outfile`.
    Parameters
        infile: full string path to original strings
        outfile: full string path to file to write processed strings
    Returns
        outfile: the same output path, passed through for the caller
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        # doc2tokens yields strings without a terminator; add one per document.
        sink.writelines(doc_line + '\n' for doc_line in doc2tokens(infile))
    return outfile

In [7]:
# Document-level pass: process INPUTFILE and write the result to OUTPUTFILE.
# The call returns the output path, which is what the cell displays.
doctokens2file(INPUTFILE, OUTPUTFILE)

'/Volumes/Lexar/plaintext/titleabstract_tokens.txt'

In [8]:
def process_sentence(infile: str) -> Iterator[str]:
    '''Split every document in a file into lemmatized sentences using spacy.
    Parameters
        infile: text file containing newline terminated strings
    Returns
        an iterator of newline terminated strings, one per sentence: the
        lemmas of the sentence's non-punctuation tokens joined by spaces
    '''
    # NOTE(review): `n_threads` is passed exactly as before -- confirm the
    # installed spaCy version still accepts this argument.
    parsed_docs = nlp.pipe(get_doc(infile),
                           batch_size=10000,
                           n_threads=multiprocessing.cpu_count())
    for parsed in parsed_docs:
        # Sentence boundaries are whatever the pipeline exposes as doc.sents.
        for span in parsed.sents:
            kept = [tok.lemma_ for tok in span if not tok.is_punct]
            yield ' '.join(kept) + '\n'

In [9]:
def senttokens2file(infile: str, outfile: str) -> str:
    '''Save the lemmatized sentences of `infile` to `outfile`.
       These are unigram sentences used for further processing.
    Parameters
        infile: text file containing newline terminated document strings
        outfile: text file to receive newline terminated sentence strings
    Returns
        outfile: string full path of outfile to pass through
    '''
    with open(outfile, 'w', encoding='utf-8') as sink:
        # Each yielded sentence already ends with a newline.
        sink.writelines(process_sentence(infile))
    return outfile

In [10]:
# Sentence-level pass: process INPUTFILE and write the result to UNISENT.
# The call returns the output path, which is what the cell displays.
senttokens2file(INPUTFILE, UNISENT)

'/Volumes/Lexar/plaintext/titleabstract_uni_sent.txt'