In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os.path as op

import matplotlib.pyplot as plt
from qgrid import show_grid
import pandas as pd
import numpy as np

from tools.data_cleansing import remove_numbers_and_ponctuation, remove_stopwords, remove_non_english_words
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book
from tools.features import bag_of_words, bag_of_emails
from tools.evaluation import top_emails, evaluate

# Training & Testing data

Load CSV

In [2]:
# Obtain the enriched e-mail dataframe from the project helper. Per the cell
# output, the helper reports reading it from data/enrich_emails.csv.
# NOTE(review): whether enrich_emails() can also rebuild that file when it is
# missing is not visible here — confirm in tools.data_handling.
df_emails = enrich_emails()

Reading dataframe from data/enrich_emails.csv


Train / Test split

In [3]:
# Training frame: draw a seeded sample holding `ratio` of all e-mails, then
# keep only the first 1000 sampled rows.
ratio = 0.9
n_train = int(len(df_emails) * ratio)
df_train = df_emails.sample(n=n_train, random_state=0).iloc[:1000]

# Test frame: remove the training rows (matched by index label) from the full
# frame and keep the first 100 remaining rows, in their original order.
df_test = df_emails.drop(df_train.index, axis=0).iloc[:100]

Clean data

In [4]:
# Recipient list and address book are derived from the training frame only;
# the address book is handed to the non-english-word filter below.
recipients = unique_recipients(df_train)
names = address_book(recipients)

# Ordered cleaning pipeline: (progress message, transformation of the body column).
cleaning_steps = (
    ("Removing numbers and punctuation", remove_numbers_and_ponctuation),
    ("Removing stopwords", remove_stopwords),
    ("Removing non english words",
     lambda body: remove_non_english_words(body, address_book=names)),
)

# Run every step, in order, on the "body" column of both frames; each step
# overwrites the column with its result before the next step reads it.
for df in (df_train, df_test):
    for message, clean in cleaning_steps:
        print(message)
        df["body"] = clean(df["body"])

Removing numbers and punctuation
Removing stopwords
Removing non english words
Removing numbers and punctuation
Removing stopwords
Removing non english words


Vectorize

In [5]:
# Build the feature matrix for the training bodies; bag_of_words also returns
# the vectorizer it used.
# NOTE(review): max_features=1000 presumably caps the vocabulary size — confirm
# in tools.features.
X_train, bow_vectorizer = bag_of_words(df_train["body"], max_features=1000)
# Transform the test bodies with the vectorizer returned for the training data
# (no second call to bag_of_words on the test frame).
X_test = bow_vectorizer.transform(df_test["body"])

In [9]:
# Encode the training recipients as the prediction target. The returned
# vectorizer is kept because its feature names are needed to turn predicted
# columns back into addresses.
# NOTE(review): max_features=20 presumably limits the target to 20 recipient
# columns — confirm in tools.features.
Y_train, boe_vectorizer = bag_of_emails(df_train["recipients"], recipients, max_features=20)

# Prediction

In [7]:
from sklearn.linear_model import LinearRegression

# Y_train exposes .toarray(), so it is converted to a dense array before being
# used as the regression target.
Y_train_dense = Y_train.toarray()

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, Y_train_dense)

# One row of predicted scores per test e-mail.
Y_pred = model.predict(X_test)

In [11]:
# Feature names of the recipient vectorizer, used to translate predicted
# columns back into addresses. Recent scikit-learn releases expose
# get_feature_names_out() and no longer provide get_feature_names(), so the new
# accessor is preferred and the old one is kept as a fallback.
# NOTE(review): assumes boe_vectorizer is a scikit-learn style vectorizer —
# confirm in tools.features.
if hasattr(boe_vectorizer, "get_feature_names_out"):
    recipients_map = np.array(boe_vectorizer.get_feature_names_out())
else:
    recipients_map = np.array(boe_vectorizer.get_feature_names())
predictions = top_emails(Y_pred, recipients_map)

# Split each whitespace-separated recipients string into one column per
# recipient. DataFrame.as_matrix() was removed in pandas 1.0; .values yields
# the same array and is available on every pandas version.
ground_truth = df_test["recipients"].str.split(expand=True).values

# Last expression of the cell: the evaluation score is displayed as its output.
evaluate(predictions, ground_truth)

0.035637301587301586