In [1]:
import spacy
from nltk.corpus import stopwords
from joblib import Parallel, delayed
import pandas as pd

In [2]:
# Load the small English spaCy model with the tagger, parser and NER
# components switched off at load time.
# NOTE(review): on spaCy v3 the rule-based lemmatizer reads POS annotation
# supplied by the tagger/attribute_ruler -- confirm that lemmas are still
# meaningful with 'tagger' disabled on the installed spaCy version.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['tagger', 'parser', 'ner'],
)

# NLTK's English stop-word list, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def lemmatize_pipe(doc):
    """Return the lower-cased lemmas of the tokens worth keeping in ``doc``.

    A token is kept when ``tok.is_alpha`` is true and its lower-cased text is
    not a member of the module-level ``stop_words`` set. The lemmas are
    returned as a list, in token order.
    """
    kept_lemmas = []
    for token in doc:
        # Skip anything that is not purely alphabetic (numbers, punctuation...).
        if not token.is_alpha:
            continue
        # The stop-word check is done on the surface text, not on the lemma.
        if token.text.lower() in stop_words:
            continue
        kept_lemmas.append(str(token.lemma_).lower())
    return kept_lemmas

def chunker(iterable, total_length, chunksize):
    """Lazily yield consecutive slices of ``iterable``, ``chunksize`` items each.

    ``iterable`` must support slicing. Slice start offsets run from 0 up to
    (but not including) ``total_length`` in steps of ``chunksize``, so the last
    slice may be shorter than the others.
    """
    # Built up front, so an invalid step (e.g. 0) fails at call time.
    offsets = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in offsets)

def flatten(list_of_lists):
    """Concatenate the items of every sub-list into one flat list.

    Only one level of nesting is removed; the items themselves are left as-is.
    """
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined

def process_chunk(texts):
    """Run ``texts`` through ``nlp.pipe`` and lemmatize every resulting doc.

    Returns a list holding one ``lemmatize_pipe`` result per document, in the
    order the pipe yields them.
    """
    docs = nlp.pipe(texts, batch_size=20)
    return [lemmatize_pipe(doc) for doc in docs]

def preprocess_parallel(texts, chunksize=100, n_jobs=7):
    """Lemmatize ``texts`` using a pool of joblib worker processes.

    ``texts`` is cut into consecutive slices of at most ``chunksize`` items by
    ``chunker``; each slice is handed to ``process_chunk`` as one joblib task,
    and the per-chunk results are merged with ``flatten``.

    Parameters
    ----------
    texts
        Sized, sliceable collection of texts (``len`` and slicing are used).
    chunksize : int, default 100
        Maximum number of texts per worker task.
    n_jobs : int, default 7
        Number of worker processes passed to ``joblib.Parallel``. The default
        keeps the value that used to be hard-coded, so existing calls behave
        the same.

    Returns
    -------
    list
        One list of lemmas per processed text.
    """
    # NOTE: joblib treats `prefer` as a soft hint that only applies when no
    # backend is named; it is kept here unchanged alongside the explicit backend.
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)


In [4]:
# Load the news parquet file from the raw-data directory.
df = pd.read_parquet('../data/1-raw/TSLA/tsla-news-master.parquet')

# Work on a 30% random sample of the rows. The fixed seed makes the sample
# (and everything derived from it) identical after Restart Kernel -> Run All.
sample = df.sample(frac=0.3, random_state=42)

In [None]:
# Lemmatize the sampled titles in parallel (up to 3000 titles per worker task)
# and write the resulting lemma lists back over the 'title' column.
# NOTE(review): this overwrites the original title strings, so re-running the
# cell would pass lemma lists instead of text to the pipeline -- consider
# storing the result in a separate column if later cells allow it.
sample['title'] = preprocess_parallel(sample['title'], chunksize=3000)