In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

from tools.data_cleansing import remove_numbers_and_ponctuation, remove_stopwords, remove_non_english_words
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book
# Alias so the builder function stays callable even after a cell rebinds the name `address_book` to its result.
from tools.data_handling import address_book as build_address_book
from tools.features import bag_of_words, bag_of_emails

In [None]:
# Build the emails DataFrame with the project helper enrich_emails() and preview its first rows.
# NOTE(review): the data source and the columns added by the helper are not visible here — see tools.data_handling.
df_emails = enrich_emails()
df_emails.head()

In [None]:
# Display a single example email; the id is named so it is easy to change.
EMAIL_ID_TO_PREVIEW = 41311
print_email_by_id(df_emails, EMAIL_ID_TO_PREVIEW)

In [None]:
# Load the sender data through the project helper load_email_senders().
# NOTE(review): presumably one row per sender — confirm in tools.data_handling.
df_email_senders = load_email_senders()

In [None]:
# Show the emails-sent distribution computed from df_email_senders.
# NOTE(review): max_value=300 presumably caps the plotted range — confirm in tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Show the emails-received distribution computed from the full emails frame.
# NOTE(review): max_value=200 presumably caps the plotted range — confirm in tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Show the body-length distribution computed from the full emails frame.
# NOTE(review): max_value=10000 presumably caps the plotted range; units (characters vs. words) are not visible here.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Show the number-of-recipients distribution computed from the full emails frame.
# NOTE(review): bins=20 and max_value=100 are presumably histogram settings — confirm in tools.data_exploration.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect the unique recipients across all emails and report how many there are.
unique_rec = unique_recipients(df_emails)
n_unique_rec = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_rec))

# Dataset separation

In [None]:
# Split the emails into two disjoint random samples of 1000 rows each.
# The seed is fixed so the split is reproducible across kernel restarts.
df_training = df_emails.sample(n=1000, random_state=42)
training_ids = list(df_training.index.values)

# True for every row that was drawn into the training sample.
training_mask = df_emails.index.isin(training_ids)

# Draw the test sample from the rows NOT used for training. The mask must be negated:
# sampling from df_emails[training_mask] would return training rows and leak them into the test set.
df_test = df_emails[~training_mask].sample(n=1000, random_state=42)

# Guard against train/test leakage.
assert df_training.index.intersection(df_test.index).empty, "training and test sets overlap"

# Feature engineering

## Input variables

### Data cleansing

In [None]:
# Cleanse the training email bodies with the three project helpers, applied in order:
# remove_numbers_and_ponctuation -> remove_stopwords -> remove_non_english_words.
s_text_training = df_training["body"]
unique_rec_training = unique_recipients(df_training)

# Call the builder through its alias. The original `address_book = address_book(...)` rebound the
# imported function to its own result, so running this cell a second time raised a TypeError.
# The result is still stored under `address_book` because the test-set cleansing cell reads that name.
address_book = build_address_book(unique_rec_training)

s_text_n_ponct_training = remove_numbers_and_ponctuation(s_text_training)
s_text_n_stop_training = remove_stopwords(s_text_n_ponct_training)
s_text_english_training = remove_non_english_words(s_text_n_stop_training, address_book=address_book)

In [None]:
# Run the test email bodies through the same three cleansing helpers, in the same order, as the training bodies.
s_text_test = df_test["body"]
s_text_n_ponct_test = remove_numbers_and_ponctuation(s_text_test)
s_text_n_stop_test = remove_stopwords(s_text_n_ponct_test)
# The address book passed here is the one built from the training recipients, not from the test set.
s_text_english_test = remove_non_english_words(s_text_n_stop_test, address_book=address_book)

### BoW computation

In [None]:
# bag_of_words returns a pair: the training input variables and a second object (input_bow)
# whose .transform() is later applied to the test text.
# NOTE(review): input_bow is presumably a fitted vectorizer — confirm in tools.features.
input_variables_training, input_bow = bag_of_words(s_text_english_training)

In [None]:
# Reuse the input_bow object produced from the training text; only .transform() is called on the cleaned test text.
input_variables_test = input_bow.transform(s_text_english_test)

## Output variables

In [None]:
# Build the output variables from the training "recipients" column and the unique training recipients.
# NOTE(review): presumably one indicator column per unique recipient — confirm in tools.features.bag_of_emails.
s_recipient = df_training["recipients"]
output_variables = bag_of_emails(s_recipient, unique_rec_training)