# Data Loading
## Notebook for data ingestion and formatting in preparation for pre-processing

In [2]:
#%%writefile /opt/projects/spamclass/src/data_ingest.py

import boto3
import os
from datetime import datetime

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
from sagemaker import Session

def createFolders(folder_path):
    """Create ``folder_path`` together with any missing parent directories.

    When the path is already present nothing is created and a notice is
    printed instead of raising.

    :param folder_path: Directory to create
    """
    target = Path(folder_path)
    try:
        # exist_ok is left at its default (False) so an existing path is reported.
        os.makedirs(target)
    except FileExistsError:
        print(f"Directory already exists: {folder_path}")


def create_bucket(bucket_name, region=None):
    """Create an S3 bucket in a specified region.

    If a region is not specified, the bucket is created in the S3 default
    region (us-east-1).

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2'
    :return: True if bucket created, else False
    """
    # Imported locally: the module header imports neither name, so without
    # these the error path below would raise NameError instead of returning
    # False. botocore is the library boto3 itself is built on.
    import logging

    from botocore.exceptions import ClientError

    try:
        # region_name=None is boto3's own default, so one call covers both cases.
        s3_client = boto3.client('s3', region_name=region)
        if region is None or region == 'us-east-1':
            # us-east-1 is the S3 default and is not accepted as a
            # LocationConstraint, so no bucket configuration is sent for it.
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        logging.error(e)
        return False
    return True
    
    
# Project-wide locations and the S3 bucket that receives the raw data.
project_path = "/opt/projects/spamclass"
data_path = f"{project_path}/data"
proj_bucket = "nikhil-spam-ham"

# Make sure the local working directories exist (project root first).
for required_dir in (project_path, data_path):
    createFolders(required_dir)

# Create the bucket; the True/False result is not checked here.
create_bucket(proj_bucket)

sagemaker_session = Session()

# 1. Download the Enron-Spam archives and unpack them.
# Archive URL prefix: http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron
# The loop covers enron1.tar.gz through enron6.tar.gz. Lines starting with "!"
# are IPython shell escapes; {i} and {data_path} are filled in from Python.
for i in range(1,7):
    # Fetch archive number i into <data_path>/raw (-P sets the target directory).
    !wget http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron{i}.tar.gz -P {data_path}/raw
    # Unpack in place; --no-same-owner keeps tar from restoring archived ownership.
    !tar -xzf {data_path}/raw/enron{i}.tar.gz -C {data_path}/raw/ --no-same-owner
    # Remove the downloaded archive once its contents are extracted.
    !rm -rf {data_path}/raw/enron{i}.tar.gz
    
# 2. Load every extracted corpus directory into memory
raw_data_path = data_path + "/raw"

# Messages are gathered in a plain Python list. Calling np.append on the raw
# bytes would build a fixed-width byte-string array (every entry padded to the
# longest message) and re-copy the whole corpus on each pass.
# Labels stay on the np.append path: starting from an empty list yields a
# float array, which keeps the 'spam' column values as 0.0 / 1.0.
X, y = [], []
for subDirectory in os.listdir(raw_data_path):
    subDirectoryPath = Path(os.path.join(raw_data_path, subDirectory))
    if not subDirectoryPath.is_dir():
        # Skip plain files that sit next to the corpus folders.
        continue
    print(f"Started loading: {subDirectory} - {datetime.now().strftime('%Y%m%d-%H%M%S')}")
    all_data = load_files(os.path.join(raw_data_path, subDirectory))
    X.extend(all_data.data)
    y = np.append(y, all_data.target)
    print(f"Completed loading: {subDirectory} - {datetime.now().strftime('%Y%m%d-%H%M%S')}")
print("Done loading data to dataset")

# Map the loaded labels and messages onto a DataFrame and derive a text label.
# Columns are added in the order spam, category, message.
emails = pd.DataFrame({'spam': list(y)})
# Vectorised labelling instead of a row-by-row apply: a spam value of 0 is
# 'ham', anything else is 'spam'.
emails['category'] = np.where(emails['spam'] == 0, 'ham', 'spam')
emails['message'] = list(X)
print("Done mapping dataset to dataframe")

# 3. Write the frame to a local CSV file, then push that file to S3
csv_location = f'{raw_data_path}/emails_raw.csv'
emails_in_csv = Path(csv_location)
# Ensure the destination directory is present before writing.
os.makedirs(emails_in_csv.parent, exist_ok=True)
emails.to_csv(emails_in_csv, index=False)
print("Done saving dataframe to local filesystem")

# Upload the CSV to the project bucket with key prefix 'raw'
sagemaker_session.upload_data(csv_location, bucket=proj_bucket, key_prefix='raw')
print(f"data loading done. Raw data uploaded to {proj_bucket} bucket" )

Directory already exists: /opt/projects/spamclass
Directory already exists: /opt/projects/spamclass/data
--2022-10-26 20:50:02--  http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz
Resolving nlp.cs.aueb.gr (nlp.cs.aueb.gr)... 195.251.248.252
Connecting to nlp.cs.aueb.gr (nlp.cs.aueb.gr)|195.251.248.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1802573 (1.7M) [application/x-gzip]
Saving to: ‘/opt/projects/spamclass/data/raw/enron1.tar.gz’


2022-10-26 20:50:05 (781 KB/s) - ‘/opt/projects/spamclass/data/raw/enron1.tar.gz’ saved [1802573/1802573]

--2022-10-26 20:50:06--  http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron2.tar.gz
Resolving nlp.cs.aueb.gr (nlp.cs.aueb.gr)... 195.251.248.252
Connecting to nlp.cs.aueb.gr (nlp.cs.aueb.gr)|195.251.248.252|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2905627 (2.8M) [application/x-gzip]
Saving to: ‘/opt/projects/spamclass/data/raw/enr

In [6]:
emails.head()

Unnamed: 0,spam,category,message
0,0.0,ham,b'Subject: nesa / hea \' s 24 th annual meetin...
1,0.0,ham,b'Subject: meter 1431 - nov 1999\r\ndaren -\r\...
2,1.0,spam,"b""Subject: investor here .\r\nfrom : mr . rich..."
3,1.0,spam,"b""Subject: hi paliourg all available meds . av..."
4,0.0,ham,b'Subject: january nominations at shell deer p...
