In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails,\
unique_domain_names, name_to_address, mail_body_orig_message
from tools.evaluation import precision
from tools.features import bag_of_words, bag_of_emails
from tools.submission import save_submission

# Data loading and first look

In [None]:
# Build the working emails frame with the project helper enrich_emails()
# (imported from tools.data_handling) and preview its first rows.
df_emails = enrich_emails()
df_emails.head()

In [None]:
print_email_by_id(df_emails, 41311)

# Data exploration

In [None]:
df_email_senders = load_email_senders()

In [None]:
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
emails_received_distribution(df_emails, max_value=200)

In [None]:
body_length_distribution(df_emails, max_value=10000)

In [None]:
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect what unique_recipients() returns for the full emails frame and
# report how many entries it holds.
unique_rec = unique_recipients(df_emails)
n_unique_rec = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_rec))

In [None]:
unique_rec

# Dataset separation

In [None]:
# Reproducible split over the whole corpus: draw the training rows first, then
# sample the test rows among the rows that were not drawn.
# NOTE: df_train / train_ids / train_mask / df_test are all reassigned by the
# single-sender split a few cells below.
n_train = 4000
df_train = df_emails.sample(random_state=42, n=n_train)
train_ids = list(df_train.index.values)
train_mask = df_emails.index.isin(train_ids)
df_test = df_emails.loc[~train_mask].sample(random_state=42, n=1000)

In [None]:
# Senders whose whitespace-separated "mids" field holds more than 200 entries
# (first rows only).
df_email_senders[df_email_senders["mids"].str.split().str.len() > 200].head()

In [None]:
# Restrict the study to one sender: row label 17 of df_email_senders.
sender_id = 17
# "mids" is a whitespace-separated string of ids. Build a real list: on
# Python 3 `map` returns a one-shot iterator, which is not a list-like indexer.
mids = [int(mid) for mid in df_email_senders["mids"].loc[sender_id].split()]
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.loc` is its
# label-based replacement (assumes df_emails is indexed by message id — confirm).
df_interest = df_emails.loc[mids]

In [None]:
# 70/30 split of the selected sender's emails.
train_prop = 0.7
# Later cells add columns ("clean body", "predictions", "precision") to both
# frames. Taking explicit copies makes them independent of df_interest, so
# those assignments are unambiguous and do not raise SettingWithCopyWarning.
df_train = df_interest.sample(frac=train_prop, random_state=0).copy()
train_ids = list(df_train.index.values)
train_mask = df_interest.index.isin(train_ids)
# Test set = every row of df_interest whose index label was not sampled.
df_test = df_interest[~train_mask].copy()

# Feature engineering

## Input variables

### Data cleansing

In [None]:
# Recipients taken from the training emails only, with min_rec=5
# (presumably a minimum-occurrence threshold — confirm in tools.data_handling).
unique_rec_train = unique_recipients(df_train, min_rec=5)
# Address book built from those recipients; below it is passed to clean() and
# used by stem() as a membership test.
add_book = address_book(unique_rec_train)

In [None]:
# Add a "clean body" column to both splits, cleaned against the same address book.
for frame in (df_train, df_test):
    frame["clean body"] = clean(frame["body"], add_book)

### BoW computation

In [None]:
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
def stem(word):
    """Return `word` untouched when it is in `add_book`, else its Lancaster stem.

    `add_book` is the notebook-level name and is looked up at call time, so the
    result follows whatever address book is currently bound to it.
    """
    return word if word in add_book else st.stem(word)
def stem_tokenizer(s):
    """Cut `s` on every single space and pass each piece through stem()."""
    return list(map(stem, s.split(" ")))

In [None]:
# Input features: token counts over the cleaned bodies, tokenised with
# stem_tokenizer; min_df=3 drops terms present in fewer than 3 documents.
input_bow = CountVectorizer(tokenizer=stem_tokenizer, min_df=3)
X_train = input_bow.fit_transform(df_train["clean body"])

In [None]:
X_test = input_bow.transform(df_test["clean body"])

## Output variables

In [None]:
def split_tokenizer(s):
    """Tokenise `s` by cutting on every single space character.

    Consecutive spaces produce empty-string tokens, exactly as str.split(" ").
    """
    separator = " "
    return s.split(separator)

In [None]:
# Output targets: one count column per entry of unique_rec_train, filled from
# the space-separated "recipients" strings of the training emails.
# NOTE(review): CountVectorizer lowercases its input by default before calling
# the tokenizer, so a vocabulary entry containing uppercase characters could
# never be counted — confirm the recipient strings are already lowercase.
output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train)
Y_train = output_bow.fit_transform(df_train["recipients"])

# Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(n_estimators=10, max_depth=300, n_jobs=-1, random_state=0)

In [None]:
rf.fit(X_train, Y_train.toarray())

In [None]:
# NOTE(review): despite its name, Y_test holds the model's predictions for
# X_test, not ground-truth labels. predict() returns hard class labels rather
# than probabilities, so the top-k ranking in the decoding cell below orders
# columns by label value only and will contain many ties.
Y_test = rf.predict(X_test)

## Decoding

In [None]:
# Keep, for every test email, the indices of the (at most) `top` highest-scored
# recipient columns, best first.
top = 10
# A prediction for a single output column comes back 1-D; promote it to
# (n_samples, 1) so the row-wise ranking below applies uniformly.
scores = Y_test[:, None] if Y_test.ndim == 1 else Y_test
# Full row-wise sort followed by a slice, rather than argpartition(kth=top):
# argpartition raises ValueError as soon as there are <= `top` columns, whereas
# the slice just returns every column. The stable ("mergesort") kind breaks
# ties by column order, so the ranking is deterministic.
sorted_idx = np.argsort(-scores, axis=1, kind="mergesort")[:, :top]

In [None]:
# Column index -> recipient string lookup, in the output vectorizer's order.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 (replaced by get_feature_names_out()); this line needs an
# older scikit-learn or a port to the new method.
recipients_map = output_bow.get_feature_names()

In [None]:
df_test["predictions"] = ""

In [None]:
# Write the decoded recipients of every test email into "predictions".
# Row i of sorted_idx belongs to the i-th row of df_test, so enumerate() gives
# the position directly (no per-row get_loc lookup, which also stops returning
# an integer when index labels repeat).
for i, index in enumerate(df_test.index):
    rec_ids = sorted_idx[i, :]
    rec_pred = " ".join(recipients_map[rec_id] for rec_id in rec_ids)
    # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
    # `.at` is the scalar label-based setter that replaces it.
    df_test.at[index, "predictions"] = rec_pred

# Evaluation

In [None]:
def prec(row):
    """Score one row: precision() of its "predictions" against its "recipients"."""
    predicted, actual = row["predictions"], row["recipients"]
    return precision(predicted, actual)
df_test["precision"] = df_test.apply(prec, axis=1)

In [None]:
df_test["precision"].mean(axis=0)

In [None]:
df_test

# Submission

## Actual model training

In [None]:
# Train one model per sender and keep everything prediction needs later
# (model, both vectorizers, address book, recipient vocabulary), keyed by sender.
sender_models = dict()
sender_input_bow = dict()
sender_output_bow = dict()
sender_add_book = dict()
sender_rec = dict()
for _, sender_row in df_email_senders.iterrows():
    sender = sender_row["sender"]
    # Build a real list: on Python 3 `map` returns a one-shot iterator, which
    # is not a list-like indexer.
    mids = [int(mid) for mid in sender_row["mids"].split()]
    # data loading
    # `.loc` replaces `.ix` (removed in pandas 1.0; assumes df_emails is indexed
    # by message id — confirm). `.copy()` keeps the "clean body" column added
    # below off a slice of df_emails.
    df_train = df_emails.loc[mids].copy()
    # data cleansing
    unique_rec_train = unique_recipients(df_train)
    # Must be bound to the notebook-level name `add_book`: stem() looks it up
    # at call time while input_bow tokenises the bodies.
    add_book = address_book(unique_rec_train)
    df_train["clean body"] = clean(df_train["body"], add_book)
    # feature engineering
    input_bow = CountVectorizer(min_df=3,
                                tokenizer=stem_tokenizer)
    X_train = input_bow.fit_transform(df_train["clean body"])
    output_bow = CountVectorizer(tokenizer=split_tokenizer,
                                 vocabulary=unique_rec_train)
    Y_train = output_bow.fit_transform(df_train["recipients"])
    # model fitting
    rf = RandomForestClassifier(n_estimators=10, max_depth=300, n_jobs=-1, random_state=0)
    rf.fit(X_train, Y_train.toarray())
    # attributions
    sender_models[sender] = rf
    sender_rec[sender] = unique_rec_train
    sender_add_book[sender] = add_book
    sender_input_bow[sender] = input_bow
    sender_output_bow[sender] = output_bow

## Data loading

In [None]:
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
df_submission["recipients"] = ""

In [None]:
# Predict, sender by sender, the (at most) `top` best recipients of every
# submission email and write them into df_submission["recipients"].
top = 10
for _, sender_row in df_submission_senders.iterrows():
    sender = sender_row["sender"]
    # Build a real list: on Python 3 `map` returns a one-shot iterator, which
    # is not a list-like indexer.
    mids = [int(mid) for mid in sender_row["mids"].split()]
    # data loading
    # `.loc` replaces `.ix` (removed in pandas 1.0; assumes df_submission is
    # indexed by message id — confirm). `.copy()` keeps the "clean body" column
    # added below off a slice of df_submission.
    df_eval = df_submission.loc[mids].copy()
    # model loading
    rf = sender_models[sender]
    unique_rec_train = sender_rec[sender]
    # Must be bound to the notebook-level name `add_book`: stem() looks it up
    # at call time while input_bow tokenises the bodies.
    add_book = sender_add_book[sender]
    input_bow = sender_input_bow[sender]
    output_bow = sender_output_bow[sender]
    # data cleansing
    df_eval["clean body"] = clean(df_eval["body"], add_book)
    # feature engineering
    X_eval = input_bow.transform(df_eval["clean body"])
    # prediction
    Y_eval = rf.predict(X_eval)
    # decoding
    recipients_map = output_bow.get_feature_names()
    # A single-recipient model predicts a 1-D array; promote it to
    # (n_samples, 1) so one code path serves every sender.
    scores = Y_eval[:, None] if Y_eval.ndim == 1 else Y_eval
    # The previous guard (`top > Y_eval.shape[1]`) was inverted: it called
    # argpartition(kth=top) exactly when there were too few columns for it
    # (ValueError), and otherwise returned every column untruncated. A stable
    # full sort plus a slice yields the top `top` columns (or all of them when
    # fewer exist), with ties broken deterministically by column order.
    sorted_idx = np.argsort(-scores, axis=1, kind="mergesort")[:, :top]
    # Row i of sorted_idx belongs to the i-th row of df_eval.
    for i, email_id in enumerate(df_eval.index):
        rec_pred = " ".join(recipients_map[rec_id] for rec_id in sorted_idx[i])
        # DataFrame.set_value was removed in pandas 1.0; `.at` replaces it.
        df_submission.at[email_id, "recipients"] = rec_pred

In [None]:
# NOTE(review): list(1) always raises TypeError ('int' object is not iterable),
# so a top-to-bottom run stops at this cell. Presumably a deliberate stop that
# keeps the cells below — which reset df_submission["recipients"] to "" — from
# overwriting the per-sender predictions; confirm, and prefer removing the
# stale cells over relying on an exception.
list(1)

## Data cleansing

## BoW computation

In [None]:
# NOTE(review): no visible cell creates a "body without non-english words"
# column on df_submission (it may come from load_emails — confirm). `input_bow`
# here is whichever vectorizer the last executed cell bound to that name.
X_submission = input_bow.transform(df_submission["body without non-english words"])

## Classification

In [None]:
Y_submission = rf.predict(X_submission)

## Decoding

In [None]:
# Keep, for every submission email, the indices of the (at most) `top`
# highest-scored recipient columns, best first.
top = 10
# A prediction for a single output column comes back 1-D; promote it to
# (n_samples, 1) so the row-wise ranking below applies uniformly.
scores = Y_submission[:, None] if Y_submission.ndim == 1 else Y_submission
# Full row-wise sort followed by a slice, rather than argpartition(kth=top):
# argpartition raises ValueError as soon as there are <= `top` columns, whereas
# the slice just returns every column. The stable ("mergesort") kind breaks
# ties by column order, so the ranking is deterministic.
sorted_idx = np.argsort(-scores, axis=1, kind="mergesort")[:, :top]

In [None]:
df_submission["recipients"] = ""

In [None]:
# Write the decoded recipients of every submission email into "recipients".
# Row i of sorted_idx belongs to the i-th row of df_submission, so enumerate()
# gives the position directly (no per-row get_loc lookup, which also stops
# returning an integer when index labels repeat).
for i, index in enumerate(df_submission.index):
    rec_ids = sorted_idx[i, :]
    rec_pred = " ".join(recipients_map[rec_id] for rec_id in rec_ids)
    # DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
    # `.at` is the scalar label-based setter that replaces it.
    df_submission.at[index, "recipients"] = rec_pred

## Formatting

In [None]:
save_submission(df_submission,
               algo="RF single sender",
               member="Zac")