In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cdist

In [2]:
def load_enron_data(path):
    """Yield one record per entry found in an Enron email directory.

    Parameters
    ----------
    path : str
        Directory whose entries are each read as one email file.

    Yields
    ------
    dict
        ``{'filename': <entry name>, 'content': <file text>}``. The text is
        decoded as Latin-1.

    Notes
    -----
    This is a generator, so nothing is read until it is iterated (for
    example by ``pd.DataFrame(...)``). Records come out in whatever order
    ``os.listdir`` returns the entries.
    """
    for filename in os.listdir(path):
        # The context manager closes each handle as soon as it has been read,
        # so iterating over thousands of emails never accumulates open files.
        with open(os.path.join(path, filename), 'r', encoding='latin1') as handle:
            content = handle.read()
        yield {'filename': filename, 'content': content}

In [3]:
# Spam emails: one row per file, labelled is_spam=True.
spam_df = (
    pd.DataFrame(load_enron_data('data/enron1/spam/'))
    .assign(is_spam=True)
)

In [4]:
# Ham (legitimate) emails: one row per file, labelled is_spam=False.
ham_df = (
    pd.DataFrame(load_enron_data('data/enron1/ham'))
    .assign(is_spam=False)
)

In [5]:
# Stack spam on top of ham. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; otherwise each source frame's own 0-based labels are
# kept and label lookups on email_df would match one spam row and one ham row.
email_df = pd.concat([spam_df, ham_df], axis=0, ignore_index=True)

In [6]:
# Sanity check: (rows, columns) of the combined spam + ham frame.
email_df.shape

(5172, 3)

In [7]:
# TF-IDF featurizer: English stop words removed, vocabulary capped at 2000 terms.
vec = TfidfVectorizer(
    max_features=2000,
    stop_words='english',
)

In [8]:
# Fit the vectorizer on the ham emails only; spam_df takes no part in the fit.
vec = vec.fit(ham_df['content'])

In [23]:
# Uncomment to inspect the fitted term -> column-index mapping:
# vec.vocabulary_

In [9]:
# Dense TF-IDF matrix with one row per ham email (cdist needs dense input).
ham_vectors = (
    vec
    .transform(ham_df['content'])
    .toarray()
)

In [10]:
# Query email: the ham row labelled 11, fetched with a single .loc lookup.
user_doc = ham_df.loc[11, 'content']

In [11]:
# Vectorize the query as a one-document batch so the result is a 2-D array.
user_doc_vector = (
    vec
    .transform([user_doc])
    .toarray()
)

In [12]:
# Peek at the TF-IDF vector built for the query email.
user_doc_vector

array([[0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Cosine distance from the query to every ham email. cdist returns a
# (1, n_ham) matrix, so take row 0 to get a flat vector.
distances = cdist(user_doc_vector, ham_vectors, 'cosine')[0]

In [14]:
# Expect one distance per row of ham_vectors.
distances.shape

(3672,)

In [15]:
# Row positions of the ham emails ordered from nearest to farthest.
ranking = distances.argsort()

In [16]:
# Same length as `distances`: one rank slot per ham email.
ranking.shape

(3672,)

In [17]:
# Positions of the ten smallest distances.
# NOTE(review): the query email is itself a row of ham_vectors, so it is
# expected to come first with distance 0, leaving nine genuinely
# "other" neighbours. Confirm whether it should be excluded.
top_ten = ranking[:10]
top_ten

array([  11, 1371,  562,  752, 1816, 2776, 2671, 3037, 1698, 2133])

In [18]:
# Cosine distances of those ten emails, in ascending order.
distances[top_ten]

array([0.        , 0.52531978, 0.6464135 , 0.65238122, 0.66343585,
       0.69978574, 0.72376664, 0.7244444 , 0.75382551, 0.75867949])

In [19]:
# `ranking` holds row positions (from argsort over ham_vectors), not index
# labels, so select positionally with .iloc. Label-based .loc only gives the
# same result while ham_df still has its default 0..n-1 index.
ham_df['content'].iloc[ranking]

11      Subject: password reset\nthis is a generated e...
1371    Subject: your username and password\ndear dare...
562     Subject: deletion of your enrononline user id\...
752     Subject: internal guest id and passowrd\ndarre...
1816    Subject: eol application id and password\ndarr...
2776    Subject: eol application id and password\ndarr...
2671    Subject: performance management process - new ...
3037    Subject: daren ,\nthe firm trading waha book h...
1698    Subject: enron mid - year 2000 performance man...
2133    Subject: investinme . enron . com login inform...
351     Subject: pc install p 600\ndaren\ni ' m workin...
412     Subject: xms - new system\neffective : we have...
2177    Subject: registration welcome email\nthank you...
1409    Subject: registration welcome email\nthank you...
3383    Subject: registration confirmation from spinne...
2634    Subject: enron year end 2000 performance manag...
1846    Subject: attention : changes in remote access\...
2559    Subjec