# Data Pre-Processing
## Notebook for data pre-processing and splitting in preparation for training

### Import libraries and define utility functions

In [157]:
%pip install spacy
!python -m spacy download en_core_web_lg 

import pandas as pd
import numpy as np
import re
import spacy
nlp = spacy.load("en_core_web_lg")

from sklearn.model_selection import train_test_split
from pathlib import Path
from sagemaker import Session
from datetime import datetime


def preprocess_email_text(email_text):
    r"""Clean one raw email string and reduce it to space-separated lemmas.

    The cleanup targets text that looks like the ``str()`` of a bytes object:
    a leading ``b`` in front of the opening quote and literal ``\r\n`` escape
    sequences (backslash characters, not real line breaks).

    Args:
        email_text (str): Raw email text.

    Returns:
        str: Lemmas of every token that is not a stop word, punctuation or
        pure whitespace, joined by single spaces.
    """
    # Strip the bytes-literal prefix only when it really is one (a "b" directly
    # followed by a quote). A bare '^b' would also eat the first letter of
    # ordinary text such as "buy now".
    email_text = re.sub(r'^b(?=[\'"])', '', email_text)
    # Replace literal backslash-r-backslash-n sequences with a space
    email_text = re.sub(r'\\r\\n', ' ', email_text)

    # Remove stop words and punctuation, then lemmatize. Whitespace-only tokens
    # are dropped as well: they carry no signal and would otherwise inject runs
    # of blanks (or real newlines) into the one-line-per-email output.
    return " ".join(
        token.lemma_
        for token in nlp(email_text)
        if not (token.is_stop or token.is_punct or token.is_space)
    )

def createFolders(folder_path):
    """Create ``folder_path`` on the local filesystem, including missing parents.

    When something already exists at that path nothing is created; a notice
    is printed instead of letting the ``FileExistsError`` propagate.

    Args:
        folder_path: Path of the directory to create.
    """
    try:
        Path(folder_path).mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        print(f"Directory already exists: {folder_path}")
        
# SageMaker session; its upload_data() is called further down to push the split files to S3
sagemaker_session= Session()

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


### Read raw labelled data and split data for training

In [158]:
# Read the raw labelled emails from S3 into a dataframe
proj_bucket = "nikhil-spam-ham"
data_location = 's3://{}/{}'.format(proj_bucket, 'raw/emails_raw.csv')
emails = pd.read_csv(data_location, index_col=None)

# For a quick smoke run, uncomment to work on a small sample:
#emails=emails.head(100)

# Fixed seed so the shuffled split below is reproducible across runs
SPLIT_SEED = 42

# Pre-process every message (slow: runs the spaCy pipeline once per email)
print("Starting pre-processing - " + datetime.now().strftime("%Y%m%d-%H%M%S"))
emails["message_preprocessed"] = emails["message"].apply(lambda text: preprocess_email_text(str(text)))
print("Completed pre-processing - " + datetime.now().strftime("%Y%m%d-%H%M%S"))

# Convert to BlazingText format: "__label__<spam value> <preprocessed text>"
emails['message_preprocessed_bt'] = '__label__' + emails['spam'].astype(str) + ' ' + emails['message_preprocessed']

# Shuffle before splitting so the 80/10/10 partitions do not depend on the
# row order of the raw file, then slice explicitly by position.
emails_shuffled = emails.sample(frac=1, random_state=SPLIT_SEED)
train_end = int(0.8 * len(emails_shuffled))
validation_end = int(0.9 * len(emails_shuffled))

train = emails_shuffled.iloc[:train_end]['message_preprocessed_bt']
validation = emails_shuffled.iloc[train_end:validation_end]['message_preprocessed_bt']
test = emails_shuffled.iloc[validation_end:]['message_preprocessed_bt']

# Sanity check: the three partitions must cover every email exactly once
assert len(train) + len(validation) + len(test) == len(emails)


Starting pre-processing - 20221027-213414
Completed pre-processing - 20221027-215952


### Write split dataset to S3

In [159]:
# Create local directories for the split files
project_path = "/opt/projects/spamclass"
createFolders(project_path)

data_path = project_path + "/data"
createFolders(data_path)


def _write_blazingtext_file(records, local_file):
    """Write ``records`` to ``local_file`` as plain UTF-8 text, one per line.

    The file is written directly instead of through ``to_csv`` because CSV
    quoting wraps any record containing a comma or quote in double quotes,
    which corrupts the leading ``__label__`` token. Whitespace inside a record
    (including line breaks) is collapsed to single spaces so that each record
    stays on exactly one line.

    Args:
        records: Iterable of labelled text records.
        local_file: Destination path of the text file.
    """
    with open(local_file, "w", encoding="utf-8") as handle:
        for record in records:
            handle.write(" ".join(str(record).split()) + "\n")


# (name used for folder, file and S3 key prefix; wording used in the log lines; data)
splits = [
    ("train", "training", train),
    ("validation", "validation", validation),
    ("test", "test", test),
]

# Write each split locally, then upload it to S3 under its own key prefix
for split_name, split_label, split_records in splits:
    Path(f"{data_path}/{split_name}").mkdir(parents=True, exist_ok=True)
    local_file = f"{data_path}/{split_name}/{split_name}.csv"

    print(f"Writing {split_label} data set with: {split_records.shape[0]} emails ")
    _write_blazingtext_file(split_records, local_file)
    sagemaker_session.upload_data(local_file, bucket=proj_bucket, key_prefix=split_name)
    print(f"data loading done. {split_label.capitalize()} data uploaded to {proj_bucket} bucket" )

Directory already exists: /opt/projects/spamclass
Directory already exists: /opt/projects/spamclass/data
Writing training data set with: 26972 emails 
data loading done. Training data uploaded to nikhil-spam-ham bucket
Writing validation data set with: 3372 emails 
data loading done. Validation data uploaded to nikhil-spam-ham bucket
Writing test data set with: 3372 emails 
data loading done. Test data uploaded to nikhil-spam-ham bucket
