In [1]:
import re
import pandas as pd
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'parser', 'ner'])
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [3]:
stopwordfile = 'stopwords/stopwords.txt'

def get_stopwords():
    "Return a set of stopwords read in from a file."
    with open(stopwordfile) as f:
        stopwords = []
        for line in f:
            stopwords.append(line.strip("\n"))
    # Convert to set for performance
    stopwords_set = set(stopwords)
    return stopwords_set

stopwords = get_stopwords()

In [4]:
inputfile = 'nytimes.tsv'
def read_data(inputfile):
    "Read in a tab-separated file with date, headline and news content"
    df = pd.read_csv(inputfile, sep='\t', header=None,
                     names=['date', 'headline', 'content'])
    df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
    return df

In [5]:
df = read_data(inputfile)
df.head(3)

Unnamed: 0,date,headline,content
0,2016-06-30,washington nationals max scherzer baffles mets...,Stellar pitching kept the Mets afloat in the f...
1,2016-06-30,mayor de blasios counsel to leave next month t...,Mayor Bill de Blasio’s counsel and chief legal...
2,2016-06-30,three men charged in killing of cuomo administ...,In the early morning hours of Labor Day last y...


In [8]:
df.shape

(8821, 4)

In [9]:
limit = 8821

def cleaner(df):
    "Extract relevant text from DataFrame using a regex"
    # Regex pattern for only alphanumeric, hyphenated text with 3 or more chars
    pattern = re.compile(r"[A-Za-z0-9\-]{3,50}")
    df['clean'] = df['content'].str.findall(pattern).str.join(' ')
    if limit > 0:
        return df.iloc[:limit, :].copy()
    else:
        return df

In [10]:
df_preproc = cleaner(df)
df_preproc.head(3)

Unnamed: 0,date,headline,content,clean
0,2016-06-30,washington nationals max scherzer baffles mets...,Stellar pitching kept the Mets afloat in the f...,Stellar pitching kept the Mets afloat the firs...
1,2016-06-30,mayor de blasios counsel to leave next month t...,Mayor Bill de Blasio’s counsel and chief legal...,Mayor Bill Blasio counsel and chief legal advi...
2,2016-06-30,three men charged in killing of cuomo administ...,In the early morning hours of Labor Day last y...,the early morning hours Labor Day last year gr...


In [11]:
def lemmatize(text):
    """Perform lemmatization and stopword removal in the clean text
       Returns a list of lemmas
    """
    doc = nlp(text)
    lemma_list = [str(tok.lemma_).lower() for tok in doc
                  if tok.is_alpha and tok.text.lower() not in stopwords]
    return lemma_list

In [12]:
%%time
df_preproc['preproc'] = df_preproc['clean'].apply(lemmatize)
df_preproc[['date', 'content', 'preproc']].head(3)

CPU times: user 48.4 s, sys: 154 ms, total: 48.6 s
Wall time: 48.6 s


Unnamed: 0,date,content,preproc
0,2016-06-30,Stellar pitching kept the Mets afloat in the f...,"[stellar, pitch, keep, mets, afloat, half, sea..."
1,2016-06-30,Mayor Bill de Blasio’s counsel and chief legal...,"[mayor, bill, blasio, counsel, chief, legal, a..."
2,2016-06-30,In the early morning hours of Labor Day last y...,"[early, labor, group, gunman, street, gang, cr..."


In [13]:
def lemmatize_pipe(doc):
    lemma_list = [str(tok.lemma_).lower() for tok in doc
                  if tok.is_alpha and tok.text.lower() not in stopwords] 
    return lemma_list

def preprocess_pipe(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

In [14]:
%%time
df_preproc['preproc_pipe'] = preprocess_pipe(df_preproc['clean'])
df_preproc[['date', 'content', 'preproc_pipe']].head(3)

CPU times: user 49.8 s, sys: 120 ms, total: 50 s
Wall time: 50 s


Unnamed: 0,date,content,preproc_pipe
0,2016-06-30,Stellar pitching kept the Mets afloat in the f...,"[stellar, pitch, keep, mets, afloat, half, sea..."
1,2016-06-30,Mayor Bill de Blasio’s counsel and chief legal...,"[mayor, bill, blasio, counsel, chief, legal, a..."
2,2016-06-30,In the early morning hours of Labor Day last y...,"[early, labor, group, gunman, street, gang, cr..."


In [15]:
from joblib import Parallel, delayed

def chunker(iterable, total_length, chunksize):
    return (iterable[pos: pos + chunksize] for pos in range(0, total_length, chunksize))

def flatten(list_of_lists):
    "Flatten a list of lists to a combined list"
    return [item for sublist in list_of_lists for item in sublist]

def process_chunk(texts):
    preproc_pipe = []
    for doc in nlp.pipe(texts, batch_size=20):
        preproc_pipe.append(lemmatize_pipe(doc))
    return preproc_pipe

def preprocess_parallel(texts, chunksize=100):
    executor = Parallel(n_jobs=7, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(df_preproc), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)

In [16]:
%%time
df_preproc['preproc_parallel'] = preprocess_parallel(df_preproc['clean'], chunksize=1000)
df_preproc[['date', 'content', 'preproc_parallel']].head(3)

CPU times: user 757 ms, sys: 268 ms, total: 1.03 s
Wall time: 15.4 s


Unnamed: 0,date,content,preproc_parallel
0,2016-06-30,Stellar pitching kept the Mets afloat in the f...,"[stellar, pitch, keep, mets, afloat, half, sea..."
1,2016-06-30,Mayor Bill de Blasio’s counsel and chief legal...,"[mayor, bill, blasio, counsel, chief, legal, a..."
2,2016-06-30,In the early morning hours of Labor Day last y...,"[early, labor, group, gunman, street, gang, cr..."


In [17]:
stopwords = list(stopwords)

In [18]:
%%time
df_preproc['preproc_stopword_list'] = df_preproc['clean'].apply(lemmatize)
df_preproc[['date', 'content', 'preproc_stopword_list']].head(3)

CPU times: user 1min 25s, sys: 184 ms, total: 1min 25s
Wall time: 1min 25s


Unnamed: 0,date,content,preproc_stopword_list
0,2016-06-30,Stellar pitching kept the Mets afloat in the f...,"[stellar, pitch, keep, mets, afloat, half, sea..."
1,2016-06-30,Mayor Bill de Blasio’s counsel and chief legal...,"[mayor, bill, blasio, counsel, chief, legal, a..."
2,2016-06-30,In the early morning hours of Labor Day last y...,"[early, labor, group, gunman, street, gang, cr..."
