In [1]:
from os import walk
from os.path import join

import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from bs4 import BeautifulSoup
import numpy as np

from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
EXAMPLE_FILE = 'SpamData/01_Processing/practice_email.txt'

SPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/spam_2'

EASY_NONSPAM_1_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_1'
EASY_NONSPAM_2_PATH = 'SpamData/01_Processing/spam_assassin_corpus/easy_ham_2'

SPAM_CAT = 1
HAM_CAT = 0
VOCAB_SIZE = 2500

DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'
WORD_ID_FILE = 'SpamData/01_Processing/word-by-id.csv'

TRAINING_DATA_FILE = 'SpamData/02_Training/train-data.txt'
TEST_DATA_FILE = 'SpamData/02_Training/test-data.txt'

WHALE_FILE = 'SpamData/01_Processing/wordcloud_resources/whale-icon.png'
SKULL_FILE = 'SpamData/01_Processing/wordcloud_resources/skull-icon.png'
THUMBS_UP_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-up.png'
THUMBS_DOWN_FILE = 'SpamData/01_Processing/wordcloud_resources/thumbs-down.png'
CUSTOM_FONT_FILE = 'SpamData/01_Processing/wordcloud_resources/OpenSansCondensed-Bold.ttf'

## Email body extraction

In [3]:
stream = open(EXAMPLE_FILE, encoding='latin-1')

is_body = False
lines = []

for line in stream:
    if is_body:
        lines.append(line)
    elif line == '\n':
        is_body = True

stream.close()

email_body = '\n'.join(lines)
print(email_body)



Dear Mr Still



Good tidings to you and all your staff for the festive season ahead (Christmas).

Now to the crux of the matter-in-hand: I am a fully qualified Santa Claus and am wondering whether you might consider me to run my own "Santa's Grotto" in your store.

But WAIT! You're probably thinking: "What makes him so special?"

Well, first of all, I have made several changes to the characterisation of Father Christmas. Rather than greeting the children with shouts of "Ho, ho, ho!" I prefer to whisper the phrase "Dependence is not unfathomable in this cruel world we live in". In addition, my gifts are ALL hand-made, ranging from felt hoops to vanilla-pod holders.

You will note also, from the enclosed sketch, that I have radically redesigned Santa's outfit and have renamed my character "Lord Buckles". Would you be interested in employing me? I promise NEVER to let you down.

I look forward to hearing from you.



Best wishes

Robin Cooper

[Excerpt from the book: The Timewaster Let

In [4]:
def email_body_generator(path):
    
    for root, dirnames, filenames in walk(path):
        for file_name in filenames:
            
            filepath = join(root, file_name)
            
            stream = open(filepath, encoding='latin-1')

            is_body = False
            lines = []

            for line in stream:
                if is_body:
                    lines.append(line)
                elif line == '\n':
                    is_body = True

            stream.close()

            email_body = '\n'.join(lines)
            
            yield file_name, email_body


In [5]:
def df_from_directory(path, classification):
    rows = []
    row_names = []
    
    for file_name, email_body in email_body_generator(path):
        rows.append({'MESSAGE': email_body, 'CATEGORY': classification})
        row_names.append(file_name)
        
    return pd.DataFrame(rows, index=row_names)

In [6]:
spam_emails = df_from_directory(SPAM_1_PATH, 1)
spam_emails = spam_emails.append(df_from_directory(SPAM_2_PATH, 1))
spam_emails.head()

Unnamed: 0,CATEGORY,MESSAGE
00170.33a973aa9bb7d122bdfbd96d44332996,1,INTEREST RATES HAVE JUST BEEN CUT!!!\n\n \...
00180.13a95a2542a0fd01ff24303561cca949,1,------=_NextPart_000_00A5_78C83A6B.A1543A16\n\...
00201.00020fc9911604f6cae7ae0f598ad29d,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...
00452.ed43fc952c31c82aa29646edfbecb03f,1,IMPORTANT DOMAIN INFORMATION:\n\n\n\n\n\nThe n...
00286.efd0b8f0c9c779b7a0ad93505c9b0bae,1,"<HTML>\n\n<head>\n\n<META HTTP-EQUIV=""Content-..."


In [7]:
spam_emails.shape

(1898, 2)

In [8]:
ham_emails = df_from_directory(EASY_NONSPAM_1_PATH, HAM_CAT)
ham_emails = ham_emails.append(df_from_directory(EASY_NONSPAM_2_PATH, HAM_CAT))
ham_emails.shape

(3901, 2)

In [9]:
data = pd.concat([spam_emails, ham_emails])
data.head()

Unnamed: 0,CATEGORY,MESSAGE
00170.33a973aa9bb7d122bdfbd96d44332996,1,INTEREST RATES HAVE JUST BEEN CUT!!!\n\n \...
00180.13a95a2542a0fd01ff24303561cca949,1,------=_NextPart_000_00A5_78C83A6B.A1543A16\n\...
00201.00020fc9911604f6cae7ae0f598ad29d,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...
00452.ed43fc952c31c82aa29646edfbecb03f,1,IMPORTANT DOMAIN INFORMATION:\n\n\n\n\n\nThe n...
00286.efd0b8f0c9c779b7a0ad93505c9b0bae,1,"<HTML>\n\n<head>\n\n<META HTTP-EQUIV=""Content-..."


# Check for missing values

In [10]:
# check if there are empty emails (string length zero)
(data.MESSAGE.str.len() == 0).any()

True

In [11]:
(data.MESSAGE.str.len() == 0).sum()

3

In [12]:
data.MESSAGE.isnull().sum()

0

### Locate empty emails

In [13]:
type(data.MESSAGE.str.len() == 0)

pandas.core.series.Series

In [14]:
data[data.MESSAGE.str.len() == 0].index

Index(['cmds', 'cmds', 'cmds'], dtype='object')

In [15]:
data.index.get_loc('cmds')

array([False, False, False, ..., False, False, False])

In [16]:
data.drop(['cmds'], inplace=True)

In [17]:
data.shape

(5796, 2)

# Add Document IDs to Track Emails in Dataset

In [18]:
document_ids = range(0, len(data.index))
data['DOC_ID'] = document_ids

In [19]:
data['FILE_NAME'] = data.index
data.set_index('DOC_ID', inplace=True)
data.head()

Unnamed: 0_level_0,CATEGORY,MESSAGE,FILE_NAME
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,INTEREST RATES HAVE JUST BEEN CUT!!!\n\n \...,00170.33a973aa9bb7d122bdfbd96d44332996
1,1,------=_NextPart_000_00A5_78C83A6B.A1543A16\n\...,00180.13a95a2542a0fd01ff24303561cca949
2,1,<html>\n\n\n\n\n\n\n\n<body>\n\n\n\n<div align...,00201.00020fc9911604f6cae7ae0f598ad29d
3,1,IMPORTANT DOMAIN INFORMATION:\n\n\n\n\n\nThe n...,00452.ed43fc952c31c82aa29646edfbecb03f
4,1,"<HTML>\n\n<head>\n\n<META HTTP-EQUIV=""Content-...",00286.efd0b8f0c9c779b7a0ad93505c9b0bae


In [20]:
data.tail()

Unnamed: 0_level_0,CATEGORY,MESSAGE,FILE_NAME
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5791,0,Hobos in the depression used to use this techn...,00785.ba3a0a9d41b06b7fd1ea6025a417dfd1
5792,0,_ _ _____ _ __ <*the* weekly high-tech sar...,01389.e4cfb234aace4e12b2d9453686c911c9
5793,0,Don't know if everybody knows this trick...\n\...,00961.404a92dc1c29461b711f0df8e96bbe90
5794,0,http://docs.freebsd.org/cgi/getmsg.cgi?fetch=3...,00205.1b7b16facf48373401d78996a92f6666
5795,0,This is a multi-part message in MIME format.\n...,01324.23a1f5017a5531fca08d9ebe2f5b0537


In [21]:
data.CATEGORY.value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [22]:
amount_of_spam = data.CATEGORY.value_counts()[1]
amount_of_ham = data.CATEGORY.value_counts()[0]

# Natural Language Processing

In [23]:
stop_words = set(stopwords.words('english'))

## Functions for Email Processing

In [24]:
def clean_message(message, stemmer=PorterStemmer(),stop_words=set(stopwords.words('english'))):
    
    # Converts to Lower Case and splits up the words
    words = word_tokenize(message.lower())
    
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
    
    return filtered_words

In [25]:
# Challenge: Modify function to remove HTML tags. Then test on Email with DOC_ID 2. 
def clean_msg_no_html(message, stemmer=PorterStemmer(), 
                 stop_words=set(stopwords.words('english'))):
    
    # Remove HTML tags
    soup = BeautifulSoup(message, 'html.parser')
    cleaned_text = soup.get_text()
    
    # Converts to Lower Case and splits up the words
    words = word_tokenize(cleaned_text.lower())
    
    filtered_words = []
    
    for word in words:
        # Removes the stop words and punctuation
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
#             filtered_words.append(word) 
    
    return filtered_words

# Generate Vocabulary & Dictionary

In [26]:
stemmed_nested_list = data.MESSAGE.apply(clean_msg_no_html)

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


In [27]:
type(stemmed_nested_list)

pandas.core.series.Series

In [28]:
stemmed_nested_list.head()

DOC_ID
0    [interest, rate, cut, perfect, time, think, re...
1                                                   []
2    [copi, dvd, burner, dvd, wizard, technolog, ad...
3    [import, domain, inform, new, domain, name, fi...
4    [free, info, free, insid, stock, market, repor...
Name: MESSAGE, dtype: object

In [29]:
flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]
flat_stemmed_list

['interest',
 'rate',
 'cut',
 'perfect',
 'time',
 'think',
 'refinanc',
 'home',
 'mortgag',
 'rate',
 'take',
 'minut',
 'fill',
 'quick',
 'onlin',
 'form',
 'http',
 'easi',
 'qualifi',
 'prompt',
 'courteou',
 'servic',
 'low',
 'rate',
 'wait',
 'interest',
 'rate',
 'go',
 'lock',
 'low',
 'rate',
 'unsubscrib',
 'go',
 'http',
 'pleas',
 'allow',
 'hour',
 'remov',
 'copi',
 'dvd',
 'burner',
 'dvd',
 'wizard',
 'technolog',
 'advanc',
 'method',
 'dvd',
 'reproduct',
 'ever',
 'ble',
 'fool',
 'fli',
 'night',
 'websit',
 'offer',
 'outdat',
 'inform',
 'packag',
 'show',
 'b',
 'backup',
 'dvd',
 'vh',
 'cassett',
 'use',
 'burner',
 'go',
 'show',
 'backup',
 'dvd',
 'use',
 'burner',
 'well',
 'make',
 'qualiti',
 'backup',
 'person',
 'dvd',
 'vh',
 'cassett',
 'creat',
 'dvd',
 'librari',
 'never',
 'worri',
 'scratch',
 'lose',
 'dvd',
 'dvd',
 'wizard',
 'pro',
 'complet',
 'unlik',
 'anyth',
 'titor',
 'offer',
 'fulli',
 'guarante',
 'order',
 'today',
 'wo',
 'disap

In [30]:
type(flat_stemmed_list)

list

# Generate Features & a Sparse Matrix

### Creating a DataFrame with one Word per Column

In [31]:
unique_words = pd.Series(flat_stemmed_list).value_counts()
print('Nr of unique words', unique_words.shape[0])
print(unique_words.head())
print(type(unique_words))


Nr of unique words 27305
http     10662
use       5017
list      4852
email     4370
get       4187
dtype: int64
<class 'pandas.core.series.Series'>


In [32]:
frequent_words = unique_words[0:2500]
print('Most common words: \n', frequent_words[:10])

Most common words: 
 http     10662
use       5017
list      4852
email     4370
get       4187
mail      3985
one       3905
free      3171
time      3090
work      2880
dtype: int64


In [33]:
type(stemmed_nested_list.tolist())

list

In [34]:
stemmed_nested_list.tolist()

[['interest',
  'rate',
  'cut',
  'perfect',
  'time',
  'think',
  'refinanc',
  'home',
  'mortgag',
  'rate',
  'take',
  'minut',
  'fill',
  'quick',
  'onlin',
  'form',
  'http',
  'easi',
  'qualifi',
  'prompt',
  'courteou',
  'servic',
  'low',
  'rate',
  'wait',
  'interest',
  'rate',
  'go',
  'lock',
  'low',
  'rate',
  'unsubscrib',
  'go',
  'http',
  'pleas',
  'allow',
  'hour',
  'remov'],
 [],
 ['copi',
  'dvd',
  'burner',
  'dvd',
  'wizard',
  'technolog',
  'advanc',
  'method',
  'dvd',
  'reproduct',
  'ever',
  'ble',
  'fool',
  'fli',
  'night',
  'websit',
  'offer',
  'outdat',
  'inform',
  'packag',
  'show',
  'b',
  'backup',
  'dvd',
  'vh',
  'cassett',
  'use',
  'burner',
  'go',
  'show',
  'backup',
  'dvd',
  'use',
  'burner',
  'well',
  'make',
  'qualiti',
  'backup',
  'person',
  'dvd',
  'vh',
  'cassett',
  'creat',
  'dvd',
  'librari',
  'never',
  'worri',
  'scratch',
  'lose',
  'dvd',
  'dvd',
  'wizard',
  'pro',
  'complet',

In [35]:
word_columns_df = pd.DataFrame.from_records(stemmed_nested_list.tolist())
word_columns_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
0,interest,rate,cut,perfect,time,think,refinanc,home,mortgag,rate,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,copi,dvd,burner,dvd,wizard,technolog,advanc,method,dvd,reproduct,...,,,,,,,,,,
3,import,domain,inform,new,domain,name,final,avail,gener,public,...,,,,,,,,,,
4,free,info,free,insid,stock,market,report,valu,get,latest,...,,,,,,,,,,


In [36]:
word_columns_df.shape

(5796, 7671)

### Splitting the Data into a Training and Testing Dataset

In [37]:
# Challenge: Can you split the data into a training and testing set? Set the test size at 30%. 
# The training data should include 4057 emails. Use a seed value of 42 to shuffle the data. 
# What should the target values be? 

In [38]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, data.CATEGORY,
                                                   test_size=0.3, random_state=42)

In [39]:
print('Nr of training samples', X_train.shape[0])
print('Fraction of training set', X_train.shape[0] / word_columns_df.shape[0])

Nr of training samples 4057
Fraction of training set 0.6999654934437544


In [40]:
X_train.shape

(4057, 7671)

In [41]:
y_test.shape

(1739,)

In [42]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
4844,hi,debian,machin,extern,isdn,termin,adapt,tri,get,dial,...,,,,,,,,,,
4727,theo,van,dinter,wrote,wed,aug,frank,pineau,wrote,omaha,...,,,,,,,,,,
5022,joseph,barrera,iii,use,version,opera,mean,half,page,look,...,,,,,,,,,,
3504,jh,legal,requir,got,similar,check,jh,insur,compani,pay,...,,,,,,,,,,
3921,url,http,date,suppli,damag,form,newli,identifi,gene,implic,...,,,,,,,,,,


In [40]:
X_train.index.name = X_test.index.name = 'DOC_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4844,hi,debian,machin,extern,isdn,termin,adapt,tri,get,dial,...,,,,,,,,,,
4727,theo,van,dinter,wrote,wed,aug,frank,pineau,wrote,omaha,...,,,,,,,,,,
5022,joseph,barrera,iii,use,version,opera,mean,half,page,look,...,,,,,,,,,,
3504,jh,legal,requir,got,similar,check,jh,insur,compani,pay,...,,,,,,,,,,
3921,url,http,date,suppli,damag,form,newli,identifi,gene,implic,...,,,,,,,,,,


In [41]:
y_train.head()


DOC_ID
4844    0
4727    0
5022    0
3504    0
3921    0
Name: CATEGORY, dtype: int64

### Create a Sparse Matrix for the Training Data

In [None]:
# stemmed_nested_list = data.MESSAGE.apply(clean_msg_no_html)
# flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]
# unique_words = pd.Series(flat_stemmed_list).value_counts()
# frequent_words = unique_words[0:2500]

In [42]:
word_ids = list(range(0, 2500))
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index=word_ids)
vocab.index.name = 'WORD_ID'
vocab.head()

Unnamed: 0_level_0,VOCAB_WORD
WORD_ID,Unnamed: 1_level_1
0,http
1,use
2,list
3,email
4,get


In [43]:
word_index = pd.Index(vocab.VOCAB_WORD)
type(word_index[3])

str

In [44]:
word_index.get_loc('thu')

395

In [45]:
def make_sparse_matrix(df, indexed_words, labels):
    """
    Returns sparse matrix as dataframe.
    
    df: A dataframe with words in the columns with a document id as an index (X_train or X_test)
    indexed_words: index of words ordered by word id
    labels: category as a series (y_train or y_test)
    """
    
    nr_rows = df.shape[0]
    nr_cols = df.shape[1]
    word_set = set(indexed_words)
    dict_list = []
    
    for i in range(nr_rows):
        for j in range(nr_cols):
            
            word = df.iat[i, j]
            if word in word_set:
                doc_id = df.index[i]
                word_id = indexed_words.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL': category, 'DOC_ID': doc_id,
                       'OCCURENCE': 1, 'WORD_ID': word_id}
                
                dict_list.append(item)
    
    return pd.DataFrame(dict_list)

In [46]:
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

In [47]:
sparse_train_df[:5]

Unnamed: 0,DOC_ID,LABEL,OCCURENCE,WORD_ID
0,4844,0,1,531
1,4844,0,1,1945
2,4844,0,1,468
3,4844,0,1,1641
4,4844,0,1,1487


In [48]:
sparse_train_df.shape

(449250, 4)

In [49]:
sparse_train_df[-5:]

Unnamed: 0,DOC_ID,LABEL,OCCURENCE,WORD_ID
449245,860,1,1,250
449246,860,1,1,0
449247,860,1,1,5
449248,860,1,1,2
449249,860,1,1,0


### Combine Occurrences with the Pandas groupby() Method

In [50]:
train_grouped = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()
train_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURENCE
DOC_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,0,1,2
0,8,1,1
0,18,1,1
0,26,1,1
0,28,1,2


In [51]:
vocab.at[0, 'VOCAB_WORD']

'http'

In [52]:
data.MESSAGE[0]

"INTEREST RATES HAVE JUST BEEN CUT!!!\n\n     \n\nNOW is the perfect time to think about refinancing your home mortgage! Rates are down! Take a minute and fill out our quick online form. \n\nhttp://www.newnamedns.com/refi/\n\n           \n\nEasy qualifying, prompt, courteous service, low rates! Don't wait for interest rates to go up again, lock in YOUR low rate now!\n\n          \n\n          \n\n          \n\n         \n\n          \n\n        \n\n---------------------------------------\n\nTo unsubscribe, go to: \n\nhttp://www.newnamedns.com/stopthemailplease/\n\nPlease allow 48-72 hours for removal.\n\n\n"

In [53]:
train_grouped = train_grouped.reset_index()
train_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,0,0,1,2
1,0,8,1,1
2,0,18,1,1
3,0,26,1,1
4,0,28,1,2


In [54]:
train_grouped.tail()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
266539,5795,509,0,2
266540,5795,792,0,2
266541,5795,806,0,2
266542,5795,1647,0,3
266543,5795,1649,0,2


In [55]:
vocab.at[1923, 'VOCAB_WORD']

'exclud'

In [56]:
data.MESSAGE[5795]

'This is a multi-part message in MIME format.\n\n\n\n------=_NextPart_000_0005_01C2291D.98A2ED40\n\nContent-Type: text/plain;\n\n\tcharset="iso-8859-1"\n\nContent-Transfer-Encoding: quoted-printable\n\n\n\nActresses with just the one annoying accent! Don\'t we all just love =\n\nthem? Them and football of course, more of which in the Breaking News =\n\nsection of your friendly neighbourhood Evil Gerald.\n\n\n\nYours,\n\n\n\nThe Evil Gerald=20\n\n\n\n------=_NextPart_000_0005_01C2291D.98A2ED40\n\nContent-Type: text/html;\n\n\tcharset="iso-8859-1"\n\nContent-Transfer-Encoding: quoted-printable\n\n\n\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n\n<HTML><HEAD>\n\n<META content=3D"text/html; charset=3Diso-8859-1" =\n\nhttp-equiv=3DContent-Type>\n\n<META content=3D"MSHTML 5.00.2919.6307" name=3DGENERATOR>\n\n<STYLE></STYLE>\n\n</HEAD>\n\n<BODY bgColor=3D#ffffff>\n\n<DIV><FONT face=3DArial size=3D2>Actresses with just the one annoying =\n\naccent! Don\'t=20\n\nwe all just 

In [57]:
train_grouped.shape

(266544, 4)

### Save Training Data as .txt File

In [58]:
np.savetxt(TRAINING_DATA_FILE, train_grouped, fmt='%d')

In [59]:
train_grouped.columns

Index(['DOC_ID', 'WORD_ID', 'LABEL', 'OCCURENCE'], dtype='object')

# Challenge

Can you create a sparse matrix for the test data. Group the occurrences of the same word in the same email. Then save the data as a .txt file. 

In [60]:
X_test.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4675,messag,mime,format,comput,game,http,wednesday,juli,gmt,space,...,,,,,,,,,,
4220,read,rest,look,quit,good,seem,odd,would,make,seemingli,...,,,,,,,,,,
2484,upon,time,dtd,wrote,thank,great,work,mathia,would,like,...,,,,,,,,,,
2418,hey,problem,rpm,instal,want,uninstal,like,rpm,rpm,packag,...,,,,,,,,,,
5110,tue,aug,wintermut,mention,unimagin,difficult,far,know,linux,kernel,...,,,,,,,,,,


In [61]:
y_test.head()

DOC_ID
4675    0
4220    0
2484    0
2418    0
5110    0
Name: CATEGORY, dtype: int64

In [62]:
X_test.shape

(1739, 7671)

In [63]:
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

In [64]:
sparse_test_df.shape

(184883, 4)

In [65]:
test_grouped = sparse_test_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum().reset_index()
test_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURENCE
0,12,0,1,1
1,12,2,1,2
2,12,3,1,4
3,12,5,1,2
4,12,10,1,1


In [67]:
test_grouped.shape

(109420, 4)

In [68]:
np.savetxt(TEST_DATA_FILE, test_grouped, fmt='%d')

# Pre-Processing Subtleties and Checking your Understanding

Challenge: We started with 5796 emails. We split it into 4057 emails for training and 1739 emails for testing. 

How many individual emails were included in the testing .txt file? Count the number in the test_grouped DataFrame. After splitting and shuffling our data, how many emails were included in the X_test DataFrame? Is the number the same? If not, which emails were excluded and why? Compare the DOC_ID values to find out.

In [69]:
train_doc_ids = set(train_grouped.DOC_ID)
test_doc_ids = set(test_grouped.DOC_ID)

In [70]:
len(test_doc_ids)

1715

In [71]:
len(X_test)

1739

In [72]:
set(X_test.index.values) - test_doc_ids # Excluded emails after pre-processing

{8,
 88,
 100,
 122,
 139,
 156,
 195,
 214,
 295,
 300,
 450,
 463,
 485,
 497,
 642,
 654,
 693,
 828,
 927,
 1044,
 1084,
 1338,
 1662,
 1831}

In [73]:
data.MESSAGE[14]

'<html>\n\n<head>\n\n<title></title>\n\n</head>\n\n<body bgcolor=3D"#FFFFFF">\n\n\n\n<table width=3D500 cellpadding=3D10 border=3D0><tr>\n\n<td><font  face=3D"bankgothic, western" color=3D"#6600FF" \n\nsize=3D5><p><b>\n\n<a\n\nhref=3D"http://www.herbs4you.net/?id=3D609">\n\nMother Natures all Natural Marital Aid<br> for Men and\n\nWomen - Your\'s Risk Free!</b></p></font>\n\n<font face=3D"arial black" color=3D"#000000" size=3D3><p>The\n\nall natural safe formula for men and women your\'s risk\n\nfree for 30 days. Mother Nature\'s wonder pill of the\n\n21st century.<p>\n\n<li>Increased Sensation</li><br><li>Increased\n\nFrequency</li><br>\n\n<li>Increased Pleasure</li><br><li>Increased\n\nDesire</li><br>\n\n<li>Increased Stamina</li><br><li>Increased\n\nLibido</li></font><br><br>\n\nBoth male and female formulas!<br><p><body\n\nbgcolor=3D"#FFFFFF">\n\nOrder Your  Trial Today !</a></p></font>\n\n</td></tr></table>\n\n<table width=3D500 cellpadding=3D10 border=3D0><tr><td>\n\n<font face=3

In [74]:
data.loc[14]

CATEGORY                                                     1
MESSAGE      <html>\n\n<head>\n\n<title></title>\n\n</head>...
FILE_NAME               00176.79f82496c612ea28f45f13ca5c47f8c2
Name: 14, dtype: object

In [75]:
clean_msg_no_html(data.at[14, 'MESSAGE'])

['mother',
 'natur',
 'natur',
 'marit',
 'aid',
 'men',
 'women',
 'risk',
 'free',
 'natur',
 'safe',
 'formula',
 'men',
 'women',
 'risk',
 'free',
 'day',
 'mother',
 'natur',
 'wonder',
 'pill',
 'centuri',
 'increas',
 'sensationincreas',
 'frequenc',
 'increas',
 'pleasureincreas',
 'desir',
 'increas',
 'staminaincreas',
 'libido',
 'male',
 'femal',
 'formula',
 'order',
 'trial',
 'today',
 'depart',
 'contact',
 'visit',
 'tbeyer']

In [76]:
data.MESSAGE[1096]

'Finally: After 15 Years of Research and Development..A True WEIGHT LOSS Breakthrough Years Ahead of it\'s Time! \n\n\n\nSERIOUSLY: DEEP DOWN YOU KNOW THAT MILLIONAIRES ARE MADE BY GETTING INVOLVED IN OPPORTUNITIES BEFORE THE GENERAL PUBLIC FINDS OUT...HERE IS YOUR CHANCE TO GET INVOLVED WITH A PRODUCT THAT CAN LEAD TO SERIOUS, MONTHLY RESIDUAL INCOME (Do the work once and keep getting paid!)...FIND OUT WHY...FIND OUT HOW...NOW!\n\n\n\nSend a blank email with the words "SUBSCRIBE to T.E.D" in the subject area to:\n\n\n\njmpro@hotpop.com\n\n\n\nThis is your first step towards working with a team of 7 figure income earners that can teach you how to do the same because of a proven system that will be customized for you!\n\n\n\n\n\nmmuocfvfbxotxknbnbdytkwdbonvovyv\n'

In [77]:
clean_msg_no_html(data.at[1096, 'MESSAGE'])

['final',
 'year',
 'research',
 'true',
 'weight',
 'loss',
 'breakthrough',
 'year',
 'ahead',
 'time',
 'serious',
 'deep',
 'know',
 'millionair',
 'made',
 'get',
 'involv',
 'opportun',
 'gener',
 'public',
 'find',
 'chanc',
 'get',
 'involv',
 'product',
 'lead',
 'seriou',
 'monthli',
 'residu',
 'incom',
 'work',
 'keep',
 'get',
 'paid',
 'find',
 'find',
 'send',
 'blank',
 'email',
 'word',
 'subscrib',
 'subject',
 'area',
 'jmpro',
 'first',
 'step',
 'toward',
 'work',
 'team',
 'figur',
 'incom',
 'earner',
 'teach',
 'proven',
 'system',
 'custom',
 'mmuocfvfbxotxknbnbdytkwdbonvovyv']

In [78]:
clean_message(data.at[1096, 'MESSAGE'])

['final',
 'year',
 'research',
 'true',
 'weight',
 'loss',
 'breakthrough',
 'year',
 'ahead',
 'time',
 'serious',
 'deep',
 'know',
 'millionair',
 'made',
 'get',
 'involv',
 'opportun',
 'gener',
 'public',
 'find',
 'chanc',
 'get',
 'involv',
 'product',
 'lead',
 'seriou',
 'monthli',
 'residu',
 'incom',
 'work',
 'keep',
 'get',
 'paid',
 'find',
 'find',
 'send',
 'blank',
 'email',
 'word',
 'subscrib',
 'subject',
 'area',
 'jmpro',
 'first',
 'step',
 'toward',
 'work',
 'team',
 'figur',
 'incom',
 'earner',
 'teach',
 'proven',
 'system',
 'custom',
 'mmuocfvfbxotxknbnbdytkwdbonvovyv']