In [8]:
from typing import List, Union
from tqdm import tqdm
import string
import datetime
from dateutil import parser
import multiprocessing as mp

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin, BaseEstimator


In [9]:
# Read the parquet file from the preprocessing output directory
# ("cleaned_mails_v0" per the file name) into a DataFrame.
# NOTE(review): `df` is not referenced again in the cells visible here —
# confirm it is still needed, or that later cells should use it.
df = pd.read_parquet('../data/preprocessing_output/cleaned_mails_v0.parquet.gzip')

### Text Normalization Utilities

In [10]:
# Loaded spaCy pipelines keyed by model name, so that repeated calls do not
# pay the (slow) model-loading cost again.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Return the spaCy pipeline called `name`, loading it only on first use."""
    if name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[name] = spacy.load(name)
    return _SPACY_MODEL_CACHE[name]


def spacy_normalization_process(text, nlp=None, verbose=True) -> List[str]:
    """Tokenize and lemmatize `text`, then drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    nlp : callable, optional
        spaCy pipeline to use. It must be callable on a string, yield tokens
        exposing `lemma_`, and provide `vocab[word].is_stop`. When omitted,
        'en_core_web_sm' is loaded once and reused on later calls.
    verbose : bool, default True
        When true, print the full lemma list before filtering (the original
        behavior). Pass False when applying the function over many texts.

    Returns
    -------
    list of str
        Lemmas in document order, without stop words and without tokens made
        up only of the characters ``? : ! . , ;`` (empty lemmas are dropped
        as well).
    """
    if nlp is None:
        nlp = _get_spacy_model()

    lemma_list = [token.lemma_ for token in nlp(text)]
    if verbose:
        print("Tokenize+Lemmatize:")
        print(lemma_list)

    punctuations = set("?:!.,;")

    def _is_punctuation(word):
        # True for "" and for any token consisting solely of the listed marks,
        # which also covers runs such as "..." or "?!".
        return all(char in punctuations for char in word)

    # Build a new list instead of removing items from the list being iterated:
    # in-place removal skips the element that follows every removed one, which
    # let consecutive punctuation tokens slip through.
    return [
        word
        for word in lemma_list
        if not _is_punctuation(word) and not nlp.vocab[word].is_stop
    ]

In [11]:
# Module-level resources read by `_normalize` below.
# spaCy English pipeline; `_normalize` calls it to tokenize and lemmatize.
nlp = spacy.load("en_core_web_sm")
# NLTK English stop words, used by `_normalize` for membership tests.
# NOTE(review): assumes the NLTK "stopwords" corpus is already downloaded, and
# that the returned value is a list (linear-time `in`) — confirm before
# converting it to a set, since other cells may rely on list behavior.
stops = stopwords.words("english")

def _normalize(comment, lowercase, remove_stopwords):
    """Lemmatize `comment` and return the lemmas joined by single spaces.

    Uses the module-level `nlp` pipeline and `stops` stop-word collection.

    comment          - text to normalize
    lowercase        - when truthy, lower-case the text before parsing it
    remove_stopwords - when truthy, drop lemmas that appear in `stops`

    Lemmas that are empty after stripping whitespace are always dropped.
    """
    text = comment.lower() if lowercase else comment
    stripped_lemmas = (token.lemma_.strip() for token in nlp(text))
    return " ".join(
        lemma
        for lemma in stripped_lemmas
        # Keep non-empty lemmas; the stop-word lookup only happens when
        # stop-word removal was requested.
        if lemma and not (remove_stopwords and lemma in stops)
    )


# result = test['body'].apply(_normalize, lowercase=True, remove_stopwords=True).to_frame()

In [None]:



# Pipeline bound as the default `nlp` argument of TextPreprocessor.__init__
# below (the default is evaluated once, when the class statement runs).
# NOTE(review): an identical load already happens in an earlier cell; on a
# top-to-bottom run this reload looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes text with a spaCy pipeline.

    Every text is parsed with `nlp`; punctuation-only tokens and stop words
    are discarded and the lemmas of the remaining tokens are joined with
    single spaces.
    """

    # Characters treated as punctuation by `_remove_punct`.
    _PUNCT_CHARS = frozenset(string.punctuation)

    def __init__(self,
                 nlp = nlp,
                 n_jobs=1):
        """
        Text preprocessing transformer includes steps:
            1. Punctuation removal
            2. Stop words removal
            3. Lemmatization

        nlp  - spacy model
        n_jobs - parallel jobs to run: -1 (or lower) uses one worker per CPU
                 core, 0 or 1 runs in the current process, any other value is
                 capped at the number of CPU cores
        """
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, *_):
        """Return a new Series holding the normalized version of every text in `X`.

        X - pandas Series of strings (it is used through `.apply` / `.iloc`);
            the input is not modified.

        With a single worker the texts are processed in the current process.
        Otherwise `X` is cut into contiguous slices, one per worker, which are
        processed in a multiprocessing pool and concatenated in the original
        order.
        """
        n_workers = self._resolve_workers(len(X))
        if n_workers <= 1:
            # No pool for one worker: starting processes and pickling the
            # transformer would cost time without any parallelism.
            return X.apply(self._preprocess_text)

        # Contiguous positional slices. Integer arithmetic keeps the bounds
        # exact, and slicing with `.iloc` avoids `np.array_split`, which goes
        # through the deprecated pandas `swapaxes` method.
        bounds = [len(X) * i // n_workers for i in range(n_workers + 1)]
        parts = [X.iloc[start:stop] for start, stop in zip(bounds[:-1], bounds[1:])]

        # The pool is sized to the resolved worker count (not to every core),
        # and the context manager releases it even when `map` raises.
        # NOTE(review): mapping a bound method pickles this transformer,
        # including `self.nlp`, for the workers — confirm that is acceptable
        # on the target platform / start method.
        with mp.Pool(n_workers) as pool:
            processed = pool.map(self._preprocess_part, parts)

        return pd.concat(processed)

    def _resolve_workers(self, n_items):
        """Translate `n_jobs` into a worker count between 1 and `n_items`."""
        cores = mp.cpu_count()
        requested = cores if self.n_jobs <= -1 else min(self.n_jobs, cores)
        # Never more workers than rows (no empty slices), never fewer than one.
        return max(1, min(requested, n_items))

    def _remove_punct(self, doc):
        """Yield the tokens of `doc` that are not made up solely of punctuation.

        A token is dropped when every character of its text is in
        `string.punctuation`; this also removes multi-character marks such as
        "..." or "--" (and tokens with empty text).
        """
        punct = self._PUNCT_CHARS
        return (t for t in doc if not all(ch in punct for ch in t.text))

    def _remove_stop_words(self, doc):
        """Yield the tokens of `doc` whose `is_stop` flag is false."""
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        """Join the lemmas of the given tokens with single spaces."""
        return ' '.join(t.lemma_ for t in doc)

    def _preprocess_text(self, text):
        """Run one text through the punctuation -> stop words -> lemma steps."""
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _preprocess_part(self, part):
        """Normalize every text of one slice; executed inside a pool worker."""
        return part.apply(self._preprocess_text)

In [None]:

# clf.fit(X_train, y_train)
# clf.predict(X_test)
# NOTE(review): the same model is already loaded in earlier cells; this reload
# looks redundant on a top-to-bottom run.
nlp = spacy.load("en_core_web_sm")
# n_jobs=-1 requests as many partitions as there are CPU cores.
Normalizer = TextPreprocessor(nlp, -1)
# NOTE(review): `test` is not defined in any cell visible here (only `df` is
# loaded above) — confirm where it comes from, otherwise this cell fails on a
# fresh kernel.
Normalizer.transform(test['body'])