In [1]:
from typing import List, Union
from tqdm import tqdm
import string
import datetime
from dateutil import parser
import multiprocessing as mp

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


### Data Cleaning Utilities

In [2]:
def change_date_type(dates: Union[pd.DataFrame, pd.Series]) -> pd.Series:
    """
    Parse a column of date strings into a pandas datetime Series.

    Every value is parsed with ``dateutil.parser.parse``. The timezone
    offset and any sub-second part are discarded, so the result holds the
    naive wall-clock time at second resolution.

    Parameters
    ----------
    dates : pd.Series (or other iterable) of str
        Date strings to parse.

    Returns
    -------
    pd.Series
        Datetime values; when ``dates`` is a Series its index is preserved.
    """
    # Hand the parsed datetimes to pandas directly. Re-serialising them as
    # "%d-%m-%Y ..." text is ambiguous for pd.to_datetime (dayfirst defaults
    # to False), which can swap day and month whenever the day is <= 12.
    parsed = [
        parser.parse(date).replace(tzinfo=None, microsecond=0)
        for date in dates
    ]

    # Keep the caller's index so that assigning the result back to a
    # DataFrame column aligns row by row.
    index = dates.index if isinstance(dates, pd.Series) else None
    return pd.to_datetime(pd.Series(parsed, index=index, dtype="object"))


def str_to_list(row):
    """Return ``row`` as text without its list-style decoration.

    Square brackets are stripped from both ends and every single quote is
    removed. Despite the name, the result is a plain ``str``, not a list.
    """
    as_text = str(row)
    without_brackets = as_text.strip("[]")
    cleaned = without_brackets.replace("'", "")
    return cleaned


def parsed_email_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic email formatting and cleaning.

    Steps:
        1. ``Date`` is converted with ``change_date_type``.
        2. Newlines and tabs are removed from ``body``.
        3. ``To`` and ``From`` are rendered as text, the bytes-literal
           prefix (``b'...'``) is dropped and ``str_to_list`` removes the
           brackets and quotes.

    The columns are rewritten on ``df`` itself and the same object is
    returned.
    """
    df['Date'] = change_date_type(df['Date'])

    df['body'] = (
        df['body']
        .str.replace('\n', '', regex=False)
        .str.replace('\t', '', regex=False)
    )

    for column in ('To', 'From'):
        df[column] = (
            df[column].astype('str')
            # Drop only a `b` that prefixes a quoted literal. Replacing every
            # "b" would also eat the letter inside the addresses themselves
            # (e.g. "bob@..." -> "o@...").
            .str.replace(r"\bb(?=['\"])", '', regex=True)
            .apply(str_to_list)
        )

    return df

### Text Normalization Utilities

In [3]:
# Loaded spaCy pipelines keyed by model name, so repeated calls reuse them.
_SPACY_MODELS = {}


def _get_spacy_model(name='en_core_web_sm'):
    """Load the spaCy pipeline ``name`` on first use and cache it."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def spacy_normalization_process(text, verbose=True):
    """
    Tokenize and lemmatize ``text``, then drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    verbose : bool, default True
        When True, print the full lemma list before filtering.

    Returns
    -------
    list of str
        Lemmas that are neither stop words (per the spaCy vocabulary) nor
        one of the punctuation marks ``?:!.,;``.
    """
    nlp = _get_spacy_model()
    doc = nlp(text)

    lemma_list = [token.lemma_ for token in doc]
    if verbose:
        print("Tokenize+Lemmatize:")
        print(lemma_list)

    punctuations = "?:!.,;"
    # Filter into a new list. Removing items from a list while looping over
    # it skips the element that follows each removal, so consecutive
    # punctuation marks would survive.
    return [
        word for word in lemma_list
        if not nlp.vocab[word].is_stop and word not in punctuations
    ]

In [4]:
nlp = spacy.load("en_core_web_sm")
stops = stopwords.words("english")

def _normalize(comment, lowercase, remove_stopwords):
    """
    Lemmatize ``comment`` and return the lemmas joined by single spaces.

    Parameters
    ----------
    comment : str
        Text to normalize.
    lowercase : bool
        Lower-case the text before it is passed to the spaCy pipeline.
    remove_stopwords : bool
        Drop lemmas that appear in the module-level ``stops`` list.

    Returns
    -------
    str
        Space-separated, whitespace-stripped, non-empty lemmas.
    """
    if lowercase:
        comment = comment.lower()

    # Build the lookup set once per call: membership tests against the
    # `stops` list would rescan it for every token.
    blocked = frozenset(stops) if remove_stopwords else frozenset()

    lemmas = (token.lemma_.strip() for token in nlp(comment))
    return " ".join(lemma for lemma in lemmas if lemma and lemma not in blocked)


# result = test['body'].apply(_normalize, lowercase=True, remove_stopwords=True).to_frame()

In [5]:
nlp = spacy.load("en_core_web_sm")

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Text preprocessing transformer. Steps applied to every text:
        1. Punctuation removal
        2. Stop words removal
        3. Lemmatization

    nlp  - spacy model
    n_jobs - parallel jobs to run: ``<= -1`` uses one partition per CPU
             core, ``0`` or ``1`` runs in the current process, any other
             value is capped at the number of cores.
    """

    def __init__(self,
                 nlp = nlp,
                 n_jobs=1):
        self.nlp = nlp
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X, *_):
        """
        Preprocess every text in ``X`` and return the results with the same
        index and order as ``X``.
        """
        X_copy = X.copy()

        partitions = self._n_partitions()
        if partitions == 1:
            # One partition gains nothing from worker processes.
            return X_copy.apply(self._preprocess_text)

        # Split by position so the pandas object itself is only sliced.
        chunks = np.array_split(np.arange(len(X_copy)), partitions)
        data_split = [X_copy.iloc[chunk] for chunk in chunks]

        # NOTE(review): pool.map pickles the bound method, and with it this
        # transformer and its spaCy pipeline, for the workers. Classes defined
        # in a notebook cell may not unpickle under start methods that spawn a
        # fresh interpreter. Confirm on the target platform.
        # One worker per partition; the context manager releases the pool
        # even when a worker raises.
        with mp.Pool(partitions) as pool:
            parts = pool.map(self._preprocess_part, data_split)

        return pd.concat(parts)

    def _n_partitions(self):
        """Translate ``n_jobs`` into the number of data partitions."""
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
            return cores
        if self.n_jobs <= 0:
            return 1
        return min(self.n_jobs, cores)

    def _remove_punct(self, doc):
        # A token is punctuation when all of its characters are punctuation
        # marks. A substring test against string.punctuation would miss
        # tokens such as "..." or "--".
        return (
            t for t in doc
            if not all(ch in string.punctuation for ch in t.text)
        )

    def _remove_stop_words(self, doc):
        return (t for t in doc if not t.is_stop)

    def _lemmatize(self, doc):
        return ' '.join(t.lemma_ for t in doc)

    def _preprocess_text(self, text):
        """Run the punctuation -> stop words -> lemma chain on one text."""
        doc = self.nlp(text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _preprocess_part(self, part):
        """Preprocess one partition (executed inside a worker process)."""
        return part.apply(self._preprocess_text)

### Text features

In [6]:
# Load the labeled non-spam and spam email frames from ../data/labeled_data
# (paths are relative to the notebook's working directory).
not_spam_df = pd.read_parquet('../data/labeled_data/nonspam.parquet.gzip')
spam_df = pd.read_parquet('../data/labeled_data/spam.parquet.gzip')

In [7]:
# Stack the spam rows on top of the non-spam rows under a fresh 0..n-1 index.
complete_df = pd.concat([spam_df, not_spam_df], ignore_index=True)

In [105]:
# Persist the combined labeled frame as CSV (the parquet export is disabled).
# NOTE(review): this cell sits before the cleaning step but carries execution
# count 105, so a top-to-bottom run exports the raw concatenated frame rather
# than whatever was in memory when it was last executed. Confirm which
# snapshot is intended.
# complete_df.to_parquet('../data/labeled_data/labeled_data_v0.parquet.gzip', compression='gzip', engine='fastparquet')
complete_df.to_csv('../data/labeled_data/labeled_data_v0.csv', index=False)

In [8]:
# Clean Date / body / To / From. parsed_email_processing rewrites the columns
# of the frame it receives and the result is bound to the same name, so
# re-running this cell feeds already-cleaned data back into the cleaner.
complete_df = parsed_email_processing(complete_df)

In [9]:
# Normalize every email body (punctuation and stop-word removal, then
# lemmatization). n_jobs=-1 asks TextPreprocessor for one partition per CPU core.
nlp = spacy.load("en_core_web_sm")
Normalizer = TextPreprocessor(nlp, -1)
complete_df['body_transformed'] = Normalizer.transform(complete_df['body'])

In [37]:
# Show the second email body among the rows labeled 0.
complete_df.loc[complete_df['spam'] == 0, 'body'].iloc[1]

'OUTLOOK EMAIL NOTIFICATIONYour Date of Migration is: May 10thYOU WILL BE UNABLE TO SEND E-MAIL unless you take the following action:Please go through your Notes email and clean out as many old/un-needed emai=l=20items as possible BEFORE your date of migration.? After you are migrated to==20Outlook you will only be allocated 100MB of total Mailbox space.?? If more==20than this amount of data is migrated to Outlook YOU WILL NOT BE ABLE TO SEN=D=20E-MAIL until it is below the 100MB limit.? Cleaning up your Notes email now==20will prevent this from happening to YOU.Enron=01,s messaging platform is migrating from Lotus Notes to Microsoft Ou=tlook=202000 worldwide.  You will be accessing Outlook for all of your email=20functions.=20WHY IS ENRON MIGRATING TO OUTLOOK 2000?Many factors contributed to the decision to migrate from Lotus Notes to=20Microsoft Exchange/Outlook.  The most prominent factors were:? Significant advantages to moving to a product that is more integrated wit=h=20current E

In [93]:
def format_hugging_face_input_string(
    not_spam_mail_list: List[str],
    spam_mail_list: List[str],
    end_sequence: str = '######$######',
    test_case: str = None
):
    """
    Build a few-shot prompt from labeled example emails.

    Every example is rendered as::

        Email: <mail>
        Spam: <NotSpam|Spam>
        <end_sequence>

    All non-spam examples come first, then all spam examples. When
    ``test_case`` is given, a final unlabeled ``Email: ...`` / ``Spam:``
    pair is appended.

    Parameters
    ----------
    not_spam_mail_list : list of str
        Emails rendered with the label ``NotSpam``.
    spam_mail_list : list of str
        Emails rendered with the label ``Spam``.
    end_sequence : str
        Delimiter written after each example.
    test_case : str, optional
        Email appended at the end without a label.

    Returns
    -------
    str
        The assembled prompt.
    """
    def _render(mails, label):
        # A bare string would otherwise be iterated character by character.
        if isinstance(mails, str):
            mails = [mails]
        # The trailing newline + indentation reproduces the per-example
        # layout this function has always emitted.
        return "".join(
            f"\nEmail: {mail}\nSpam: {label}\n{end_sequence}\n\n        "
            for mail in mails
        )

    base_not_spam_str = _render(not_spam_mail_list, "NotSpam")
    # Spam examples must come from spam_mail_list, not the non-spam list.
    base_spam_str = _render(spam_mail_list, "Spam")
    formated_string = f"{base_not_spam_str}\n{base_spam_str}"

    if test_case:
        formated_string += f"\nEmail: {test_case}\nSpam:"

    return formated_string


# Collect the email bodies of each class as plain Python lists.
_not_spam_test = complete_df.loc[complete_df['spam'] == 0, 'body'].tolist()
_spam_test = complete_df.loc[complete_df['spam'] == 1, 'body'].tolist()

In [98]:
# Inspect the last non-spam body; the prompt-building cell below passes it as test_case.
_not_spam_test[-1]

"Here is the language I mentioned in my voice mail.  Cynthia Sandherr's office should be getting in touch this afternoon with the addresses, forms of address, etc.  The letters need to be hand delivered in DC on Monday so they need to be fed exed to our DC office tonight.  Please call if you have any questions.Thanks!Mark"

In [100]:
# Few-shot prompt: ten examples per class plus one unlabeled query email.
# spam_mail_list expects a list of emails, so pass a slice rather than the
# single string `_spam_test[10]`.
print(format_hugging_face_input_string(_not_spam_test[:10], _spam_test[:10], test_case=_not_spam_test[-1]))



Email: The file was too big to email.  It contains all financial deals (P,B,I,GD) in the active NG-PRICE book.  It does not contain any option deals.  I will set up the new books so that we can move the net positive deals by counterparty.  Let me know if you have any questions.  It is saved in O:/_Dropbox.DG
Spam: NotSpam
######$######

        
Email: OUTLOOK EMAIL NOTIFICATIONYour Date of Migration is: May 10thYOU WILL BE UNABLE TO SEND E-MAIL unless you take the following action:Please go through your Notes email and clean out as many old/un-needed emai=l=20items as possible BEFORE your date of migration.? After you are migrated to==20Outlook you will only be allocated 100MB of total Mailbox space.?? If more==20than this amount of data is migrated to Outlook YOU WILL NOT BE ABLE TO SEN=D=20E-MAIL until it is below the 100MB limit.? Cleaning up your Notes email now==20will prevent this from happening to YOU.Enron=01,s messaging platform is migrating from Lotus Notes to Microsoft Ou=

In [83]:
# Join the two pre-formatted example blocks with a newline and show the result.
# NOTE(review): `_formated_not_spam` and `_formated_spam` are not defined in
# any cell visible in this notebook, so this cell raises NameError on a fresh
# kernel. Confirm where they come from or retire the cell.
_merged_str = f"""{_formated_not_spam}\n{_formated_spam}"""
print(_merged_str)


Email: The file was too big to email.  It contains all financial deals (P,B,I,GD) in the active NG-PRICE book.  It does not contain any option deals.  I will set up the new books so that we can move the net positive deals by counterparty.  Let me know if you have any questions.  It is saved in O:/_Dropbox.DG
Spam: Not Spam
######$######

        
Email: OUTLOOK EMAIL NOTIFICATIONYour Date of Migration is: May 10thYOU WILL BE UNABLE TO SEND E-MAIL unless you take the following action:Please go through your Notes email and clean out as many old/un-needed emai=l=20items as possible BEFORE your date of migration.? After you are migrated to==20Outlook you will only be allocated 100MB of total Mailbox space.?? If more==20than this amount of data is migrated to Outlook YOU WILL NOT BE ABLE TO SEN=D=20E-MAIL until it is below the 100MB limit.? Cleaning up your Notes email now==20will prevent this from happening to YOU.Enron=01,s messaging platform is migrating from Lotus Notes to Microsoft Ou

In [81]:
def merge_formated_strings(not_spam_str, spam_str):
    """
    Join the formatted non-spam block and the formatted spam block with a
    single newline, non-spam first.

    Returns
    -------
    str
        ``not_spam_str``, a newline, then ``spam_str``.
    """
    return f"{not_spam_str}\n{spam_str}"


Email: $ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $Have you been turned down from receiving a loan by a bank because of poor credit, no credit or bankruptcy?Private Foundations are not interested in credit ratings, only demonstrated need.IF YOU CAN DEMONSTRATE A GENUINE NEED, YOU CAN APPLY FOR A GRANT!It is much easier to receive a grant by mail from a Private Foundation than a local bank, because they are much more lenient. Imagine getting a check, payable to you, in the mail! Money you never have to pay back!WE WILL SHOW YOU HOW!http://r1.adversend.com/?u=303&l=5&id=4199108<a href="http://r1.adversend.com/?u=303&l=5&id=4199108"> AOL Users Click Here </a>Please pass this letter along to someone you feel could use the help!Thanks, $ * HOTWEBCASH * $ http://www.hotwebcash.com<<<>>> <<<>>> <<<>>> <<<>>> <<<>>> <<<>>> <<<>>> <<<>>> <<<>>>  * To be removed from this mailing list, click here:http://www.adversend.com/u/?l=hotwebcash&e=kevin.presto@ubswenergy.com<a href="http://www.adversend.c

For more details on the methodology used below, see:
- https://towardsdatascience.com/sentence-transformer-fine-tuning-setfit-outperforms-gpt-3-on-few-shot-text-classification-while-d9a3788f0b4e
- https://github.com/pmbaumgartner/setfit

### SetFit Classification

- `1` stands for *spam* emails
- `0` stands for *non-spam* emails

In [11]:
! pip install git+https://github.com/pmbaumgartner/setfit -q

In [12]:
from setfit import SetFitClassifier

There was a problem when trying to write in your cache folder (/Users/luis.morales/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


In [13]:
# Class balance check: number of rows per label in the `spam` column.
complete_df['spam'].value_counts()

1    1092
0     394
Name: spam, dtype: int64

In [14]:
# Hold out 20% of the rows for testing. The seed makes the split reproducible
# across kernel restarts, and stratifying on the label keeps the class ratio
# the same in both parts (the classes are imbalanced).
train_df, test_df = train_test_split(
    complete_df,
    test_size=0.2,
    random_state=42,
    stratify=complete_df['spam'],
)

In [15]:
# Instantiate the SetFit classifier with the 'paraphrase-MiniLM-L3-v2' model name.
# NOTE(review): the saved output shows this call failing with
# NotADirectoryError on the local torch cache path, which looks like an
# environment issue to resolve before re-running.
classifier = SetFitClassifier('paraphrase-MiniLM-L3-v2')

NotADirectoryError: [Errno 20] Not a directory: '/Users/luis.morales/.cache/torch'