In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from tools.data_cleansing import remove_numbers_and_ponctuation, remove_stopwords, remove_non_english_words,\
remove_after_indicator
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails,\
unique_domain_names, name_to_address, mail_body_orig_message
from tools.evaluation import precision
from tools.features import bag_of_words, bag_of_emails
from tools.submission import save_submission

# Data loading and first look

In [None]:
# Build the working email frame through the project helper and preview its first rows.
# NOTE(review): later cells read the "body" and "recipients" columns from samples of this
# frame; the full schema lives in tools.data_handling — confirm there.
df_emails = enrich_emails()
df_emails.head()

In [None]:
# Spot-check a single email; 41311 is a hand-picked, hard-coded email id.
print_email_by_id(df_emails, 41311)

# Data exploration

In [None]:
# Load the sender data consumed by emails_sent_distribution in the next cell.
df_email_senders = load_email_senders()

In [None]:
# Distribution of emails sent, from the sender data.
# NOTE(review): max_value=300 presumably truncates the displayed range — confirm in tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Distribution of emails received, computed from the email frame.
# NOTE(review): max_value=200 presumably truncates the displayed range — confirm in tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Distribution of email body lengths.
# NOTE(review): unit of max_value=10000 (characters vs. words) is defined by the helper — confirm.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Distribution of the number of recipients per email, requested with 20 bins.
# NOTE(review): max_value=100 presumably truncates the displayed range — confirm in tools.data_exploration.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect the distinct recipients found in the whole email frame and report how many there are.
unique_rec = unique_recipients(df_emails)
n_unique_rec = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_rec))

In [None]:
# Echo the recipient collection as the cell output.
# NOTE(review): this displays every entry; consider showing only a small sample once the
# container type returned by unique_recipients is confirmed.
unique_rec

# Dataset separation

In [None]:
# Draw n_train emails for fitting, then 1000 evaluation emails from the rows whose index
# label was not drawn for training. Both draws use the same fixed seed for reproducibility.
n_train = 4000
df_train = df_emails.sample(random_state=42, n=n_train)
train_ids = list(df_train.index.values)
train_mask = df_emails.index.isin(train_ids)
df_remaining = df_emails[~train_mask]
df_test = df_remaining.sample(random_state=42, n=1000)

# Feature engineering

## Input variables

### Data cleansing

In [None]:
# Recipients retained by unique_recipients(min_rec=5) on the training split, and the
# address book derived from them (reused by the stemmer and by the other cleansing cells).
unique_rec_train = unique_recipients(df_train, min_rec=5)
add_book = address_book(unique_rec_train)

# Cleansing chain: every step reads the column written by the previous step and stores
# its own result under a new column name, so all intermediate versions stay inspectable.
train_cleansing_steps = [
    ("body", "body without original messages",
     lambda col: remove_after_indicator(col, "Original Message")),
    ("body without original messages", "body without forwarded messages",
     lambda col: remove_after_indicator(col, "Forwarded by")),
    ("body without forwarded messages", "body without ponctuation",
     remove_numbers_and_ponctuation),
    ("body without ponctuation", "body without stopwords",
     remove_stopwords),
    ("body without stopwords", "body without non-english words",
     lambda col: remove_non_english_words(col, address_book=add_book)),
]
for source_col, target_col, cleanse in train_cleansing_steps:
    df_train[target_col] = cleanse(df_train[source_col])

In [None]:
# Spot-check one raw email; 268373 is a hand-picked, hard-coded email id.
print_email_by_id(df_emails, 268373)

In [None]:
# Same cleansing chain as for the training split, applied to the evaluation split.
# Every step reads the column written by the previous step; the address book is the one
# built from the training recipients (add_book).
test_cleansing_steps = [
    ("body", "body without original messages",
     lambda col: remove_after_indicator(col, "Original Message")),
    ("body without original messages", "body without forwarded messages",
     lambda col: remove_after_indicator(col, "Forwarded by")),
    ("body without forwarded messages", "body without ponctuation",
     remove_numbers_and_ponctuation),
    ("body without ponctuation", "body without stopwords",
     remove_stopwords),
    ("body without stopwords", "body without non-english words",
     lambda col: remove_non_english_words(col, address_book=add_book)),
]
for source_col, target_col, cleanse in test_cleansing_steps:
    df_test[target_col] = cleanse(df_test[source_col])

### BoW computation

In [None]:
from nltk.stem.lancaster import LancasterStemmer
# Single Lancaster stemmer instance shared by stem() below.
st = LancasterStemmer()
def stem(word):
    """Return `word` untouched when it is in the address book, otherwise its stem.

    Reads the notebook-level globals `add_book` (membership test) and `st`
    (object whose `.stem()` method does the stemming).
    """
    return word if word in add_book else st.stem(word)
def stem_tokenizer(s):
    """Tokenize a cleaned email body and stem each token with `stem`.

    The text is split on runs of any whitespace. Splitting on a single space
    character would emit empty-string tokens wherever two spaces are adjacent
    (or the text starts/ends with a space) and would keep newlines and tabs
    inside tokens; both end up as spurious vocabulary entries.

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list
        One stemmed token per whitespace-separated word; empty for blank text.
    """
    return [stem(word) for word in s.split()]

In [None]:
# Bag-of-words over the fully cleaned training bodies. Tokens come from stem_tokenizer;
# min_df=5 drops terms that occur in fewer than 5 training documents.
train_corpus = df_train["body without non-english words"]
input_bow = CountVectorizer(tokenizer=stem_tokenizer, min_df=5)
X_train = input_bow.fit_transform(train_corpus)

In [None]:
# Encode the evaluation bodies with the vocabulary already fitted on the training split
# (transform only — the vectorizer is not refitted here).
X_test = input_bow.transform(df_test["body without non-english words"])

## Output variables

In [None]:
def split_tokenizer(s):
    """Split `s` on single space characters and return the list of pieces."""
    pieces = s.split(" ")
    return pieces
# Encode each email's recipient string as one row over the fixed list of training
# recipients (unique_rec_train); addresses outside that list are ignored.
# binary=True caps every entry at 1: the targets are meant as presence indicators, and an
# address repeated inside one recipient string would otherwise show up as a separate
# class (2, 3, ...) for the multi-output classifier.
output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train,
                             binary=True)
Y_train = output_bow.fit_transform(df_train["recipients"])

# Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 10 trees with depth capped at 200; n_jobs=-1 uses all available cores
# and random_state=0 makes the fit repeatable.
rf = RandomForestClassifier(n_estimators=10, max_depth=200, n_jobs=-1, random_state=0)

In [None]:
# Fit on the bag-of-words inputs; the recipient matrix is converted to a dense array first,
# and each of its columns is one output the forest learns to predict.
rf.fit(X_train, Y_train.toarray())

In [None]:
# Score every recipient instead of taking hard labels: rf.predict() only returns class
# labels, so entries tie and the top-k ranking done in the decoding cell cannot order them
# meaningfully. For a multi-output forest, predict_proba() returns one array per output
# with one column per class listed in the matching rf.classes_ entry; summing the columns
# of the classes > 0 gives the estimated probability that the recipient is present.
# Result: one row per test email, one column per recipient, same layout as rf.predict().
Y_test = np.column_stack([
    proba[:, classes > 0].sum(axis=1)
    for proba, classes in zip(rf.predict_proba(X_test), rf.classes_)
])

## Decoding

In [None]:
# For every row of Y_test, find the column indices of its `top` largest values, best first.
top = 10
# argpartition on the negated values places the indices of the `top` largest entries
# (in no particular order) in the first `top` positions of each row.
best_pred_idx = np.argpartition(-Y_test, top, axis=1)[:,:top]
# Look up the values of those candidates and order them: ascending argsort, then reversed,
# i.e. highest value first. These are positions inside the candidate set, not columns.
sorted_ids = np.argsort(Y_test[np.arange(Y_test.shape[0])[:, None], best_pred_idx])[:,::-1]
# Map the positions back to column indices of Y_test.
sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]

In [None]:
# Column index -> recipient address, in the column order used by output_bow.
# Recent scikit-learn releases dropped get_feature_names() in favour of
# get_feature_names_out(); use the new accessor when present and fall back otherwise so
# the cell runs on both old and new versions.
if hasattr(output_bow, "get_feature_names_out"):
    recipients_map = output_bow.get_feature_names_out()
else:
    recipients_map = output_bow.get_feature_names()

In [None]:
# Create the "predictions" column (empty strings); the next cell fills it in.
df_test["predictions"] = ""

In [None]:
# One space-separated string of predicted addresses per test email, best first.
# Rows of sorted_idx are in the same order as the rows of df_test (X_test was built from
# df_test), so the list is assigned positionally. Writing the whole column at once avoids
# DataFrame.set_value(), which newer pandas versions no longer provide, and does not
# depend on the index labels being unique.
df_test["predictions"] = [
    " ".join(recipients_map[rec_id] for rec_id in rec_ids)
    for rec_ids in sorted_idx
]

# Evaluation

In [None]:
def prec(row):
    """Score one row by passing its "predictions" and "recipients" values to `precision`."""
    predicted, actual = row["predictions"], row["recipients"]
    return precision(predicted, actual)
# Apply prec to every row (axis=1) and store the per-email score.
df_test["precision"] = df_test.apply(prec, axis=1)

In [None]:
# Average of the per-email scores over the evaluation sample.
df_test["precision"].mean(axis=0)

# Submission

## Data loading

In [None]:
# Load the emails of the "test" set.
# NOTE(review): presumably the unlabeled set to predict for — its "recipients" column is
# created and filled further down in this notebook; confirm in tools.data_handling.
df_submission = load_emails(set_type="test")

## Data cleansing

In [None]:
# Apply the same cleansing chain as for the training and evaluation splits, including the
# two remove_after_indicator steps, so that input_bow receives text prepared the same way
# as the text it was fitted on. Each step reads the column written by the previous one.
df_submission["body without original messages"] = remove_after_indicator(
    df_submission["body"], "Original Message")
df_submission["body without forwarded messages"] = remove_after_indicator(
    df_submission["body without original messages"], "Forwarded by")
df_submission["body without ponctuation"] = remove_numbers_and_ponctuation(
    df_submission["body without forwarded messages"])
df_submission["body without stopwords"] = remove_stopwords(df_submission["body without ponctuation"])
df_submission["body without non-english words"] = remove_non_english_words(
    df_submission["body without stopwords"], address_book=add_book)

## BoW computation

In [None]:
# Encode the submission bodies with the vocabulary fitted on the training split
# (transform only — the vectorizer is not refitted here).
X_submission = input_bow.transform(df_submission["body without non-english words"])

## Classification

In [None]:
# Score every recipient instead of taking hard labels: rf.predict() only returns class
# labels, so entries tie and the top-k ranking done in the decoding cell cannot order them
# meaningfully. For a multi-output forest, predict_proba() returns one array per output
# with one column per class listed in the matching rf.classes_ entry; summing the columns
# of the classes > 0 gives the estimated probability that the recipient is present.
# Result: one row per submission email, one column per recipient, same layout as rf.predict().
Y_submission = np.column_stack([
    proba[:, classes > 0].sum(axis=1)
    for proba, classes in zip(rf.predict_proba(X_submission), rf.classes_)
])

## Decoding

In [None]:
# For every row of Y_submission, find the column indices of its `top` largest values, best first.
# Note: this overwrites best_pred_idx / sorted_ids / sorted_idx from the evaluation decoding.
top = 10
# argpartition on the negated values places the indices of the `top` largest entries
# (in no particular order) in the first `top` positions of each row.
best_pred_idx = np.argpartition(-Y_submission, top, axis=1)[:,:top]
# Look up the values of those candidates and order them: ascending argsort, then reversed,
# i.e. highest value first. These are positions inside the candidate set, not columns.
sorted_ids = np.argsort(Y_submission[np.arange(Y_submission.shape[0])[:, None], best_pred_idx])[:,::-1]
# Map the positions back to column indices of Y_submission.
sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]

In [None]:
# Create the "recipients" column (empty strings); the next cell fills it with predictions.
df_submission["recipients"] = ""

In [None]:
# One space-separated string of predicted addresses per submission email, best first.
# Rows of sorted_idx are in the same order as the rows of df_submission (X_submission was
# built from df_submission), so the list is assigned positionally. Writing the whole
# column at once avoids DataFrame.set_value(), which newer pandas versions no longer
# provide, and does not depend on the index labels being unique.
df_submission["recipients"] = [
    " ".join(recipients_map[rec_id] for rec_id in rec_ids)
    for rec_ids in sorted_idx
]

## Formatting

In [None]:
# Write the submission through save_submission, the helper this notebook imports from
# tools.submission. No name `save_submission_b` is imported or defined in the notebook,
# so calling it fails with a NameError on a fresh kernel.
save_submission(df_submission,
                algo="RF",
                member="Zac")