# **Importing Data**



In [31]:
import os
import numpy as np
import glob
import email
import email.policy

DATASETS_DIR = 'datasets'
MODELS_DIR = 'models'
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

SPAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
SPAM2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
EASY_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
EASY_HAM2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
HARD_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'

In [65]:
from urllib.request import urlretrieve
import tarfile
import shutil

def download_dataset(url):
    """download and unzip data from a url into the specified path"""

    # create directory if it doesn't exist
    if not os.path.isdir(TAR_DIR):
        os.makedirs(TAR_DIR)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    # download the tar file if it doesn't exist
    try:
        print("Downloading", tarpath)
        tarfile.open(tarpath)
    except:
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        tar.extractall(path=DATASETS_DIR)

        cmds_path = os.path.join(dirname, 'cmds')
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

    return dirname


def load_dataset(dirpath):
    """load emails from the specified directory"""

    files = []
    filepaths = glob.glob(dirpath + '/*')
    for path in filepaths:
        with open(path, 'rb') as f:
            byte_content = f.read()
            str_content = byte_content.decode('utf-8', errors='ignore')
            files.append(str_content)
            # files.append(email.parser.BytesParser(policy=email.policy.default).parse(f))

    return files


In [66]:
# download the data
spam_dir = download_dataset(SPAM_URL)
spam2_dir = download_dataset(SPAM2_URL)
easy_ham_dir = download_dataset(EASY_HAM_URL)
easy_ham2_dir = download_dataset(EASY_HAM2_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)

# load the datasets from datasets/tar/*
spam = load_dataset(spam_dir)
spam2 = load_dataset(spam2_dir)
easy_ham = load_dataset(easy_ham_dir)
easy_ham2 = load_dataset(easy_ham2_dir)
hard_ham = load_dataset(hard_ham_dir)

print("Emails Ham :", len(easy_ham + easy_ham2 + hard_ham))
print("Emails Spam :" ,len(spam + spam2))

Downloading datasets/tar/20021010_spam.tar.bz2
Downloading datasets/tar/20050311_spam_2.tar.bz2
Downloading datasets/tar/20030228_easy_ham.tar.bz2
Downloading datasets/tar/20030228_easy_ham_2.tar.bz2
Downloading datasets/tar/20030228_hard_ham.tar.bz2
Emails Ham : 4150
Emails Spam : 1897


In [67]:
from  sklearn.utils import shuffle

# create the full dataset
X = spam + spam2 + easy_ham + easy_ham2 + hard_ham
y = np.concatenate((np.ones(len(spam) + len(spam2)), np.zeros(len(easy_ham) + len(easy_ham2) + len(hard_ham))))

# shuffle the dataset
X, y = shuffle(X, y, random_state=42)


In [28]:
y[0]

0.0

In [69]:
print(X[0])

From rpm-list-admin@freshrpms.net  Wed Aug 14 10:59:36 2002
Return-Path: <rpm-zzzlist-admin@freshrpms.net>
Delivered-To: yyyy@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 119F24413D
	for <jm@localhost>; Wed, 14 Aug 2002 05:51:38 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Wed, 14 Aug 2002 10:51:38 +0100 (IST)
Received: from egwn.net (ns2.egwn.net [193.172.5.4]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7E6NQ403793 for
    <jm-rpm@jmason.org>; Wed, 14 Aug 2002 07:23:27 +0100
Received: from auth02.nl.egwn.net (localhost [127.0.0.1]) by egwn.net
    (8.11.6/8.11.6/EGWN) with ESMTP id g7E6L2J22652; Wed, 14 Aug 2002 08:21:02
    +0200
Received: from smtp.comcast.net (smtp.comcast.net [24.153.64.2]) by
    egwn.net (8.11.6/8.11.6/EGWN) with ESMTP id g7E6KGJ17437 for
    <rpm-list@freshrpms.net>; Wed, 14 Aug 2002 08:20

# **Data Preparation**

In [68]:
def remove_header(email):
    """remove the header from an email"""
    return email[email.index('\n\n'):]


print(remove_header(X[0]))



Can someone tell me what I am doing wrong? it's like make doesn't take the
PREFIX override... It's crazy. I don't get it...
OS: TurboLinux6 rmp3.0.6 make 3.79.1

I just don't get this. It should work...

---

%define real_name Mail-SpamAssassin
%define real_version 2.31

########################################################################
# Package Information
########################################################################
Name:         SpamAssassin
Version:      2.31
Summary:      SpamAssassin - A perl-based spam filter
URL:          http://www.spamassassin.org
Group:        Networking/Mail
License:      Artistic
Release:      1

########################################################################
# Sources/Patches
########################################################################
Source0: http://spamassassin.org/devel/%{real_name}-%{real_version}.tar.gz

########################################################################
# Build Configuration
###########

# **Load Data**

In [None]:
!unzip /content/emails_datasets.zip

Archive:  /content/emails_datasets.zip
  inflating: spam_assassin.csv       


In [1]:
import pandas as pd

data_email = pd.read_csv("spam_assassin.csv")
data_email.head(20)

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0
5,From tobaccodemon@terra.es Sat Sep 7 22:05:58 ...,1
6,From larlar78@MailOps.Com Sat Jun 30 00:19:08 ...,1
7,From rpm-list-admin@freshrpms.net Thu Jul 25 1...,0
8,From exmh-users-admin@redhat.com Wed Aug 7 06:...,0
9,From contractor@goldenbay.com.cy Tue Jul 23 23...,1


In [20]:
data_email.text.iloc[0]



In [3]:
data_email.describe()

Unnamed: 0,target
count,5796.0
mean,0.327122
std,0.469203
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [4]:
data_email.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5796 entries, 0 to 5795
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5796 non-null   object
 1   target  5796 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.7+ KB
