In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails,\
unique_domain_names, name_to_address, mail_body_orig_message
from tools.evaluation import precision
from tools.features import bag_of_words, bag_of_emails
from tools.submission import save_submission

ImportError: cannot import name 'bag_of_words'

# Data loading and first look

In [None]:
# Load the e-mail table through the project helper and preview its first rows.
# NOTE(review): the columns read further down ("date", "body", "recipients")
# are presumably provided by enrich_emails() — confirm in tools.data_handling.
df_emails = enrich_emails()
df_emails.head()

In [None]:
# Print the e-mail with id 41311 as a quick look at a single record.
print_email_by_id(df_emails, 41311)

# Data exploration

In [None]:
# Sender table. Later cells read its "sender" column and its "mids" column,
# the latter being a whitespace-separated string of integer message ids.
df_email_senders = load_email_senders()

In [None]:
# Distribution of e-mails sent per sender.
# NOTE(review): max_value=300 is presumably a display cutoff — confirm in tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Distribution of e-mails received per recipient.
# NOTE(review): max_value=200 is presumably a display cutoff — confirm in tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Distribution of message body lengths.
# NOTE(review): max_value=10000 is presumably a display cutoff — confirm in tools.data_exploration.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Distribution of the number of recipients per message, drawn with 20 bins.
# NOTE(review): max_value=100 is presumably a display cutoff — confirm in tools.data_exploration.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect the recipients seen across the whole e-mail table and report how many there are.
unique_rec = unique_recipients(df_emails)
n_unique_rec = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_rec))

# Tokenizing functions

In [None]:
# Global address book consulted by stem(): words found in it are returned
# unchanged instead of being stemmed.
# NOTE: the per-sender loops further down rebind `add_book`, so this
# corpus-wide version only applies until those cells run.
unique_rec_train = unique_recipients(df_emails)
add_book = address_book(unique_rec_train)
# "fyi" is added so that it is also left unstemmed.
add_book.add("fyi")

In [None]:
st = LancasterStemmer()


def stem(word):
    """Return ``word`` untouched when it is in the address book, else its Lancaster stem.

    The lookup uses the module-level ``add_book`` as bound at call time, so the
    result depends on whichever address book the notebook has currently set.
    """
    return word if word in add_book else st.stem(word)


def stem_tokenizer(s):
    """Split ``s`` on single spaces and pass every piece through ``stem``."""
    return list(map(stem, s.split(" ")))

In [None]:
def split_tokenizer(s):
    """Tokenize ``s`` by cutting on single space characters.

    Because the separator is given explicitly, consecutive spaces produce
    empty-string tokens and an empty input yields ``[""]``.
    """
    tokens = s.split(" ")
    return tokens

# Training

In [None]:
# Validate on a random 30% subset of the senders.
# The seed is fixed so that the subset — and therefore every precision figure
# computed from it — is the same after a kernel restart.
df_small_senders = df_email_senders.sample(frac=0.3, random_state=0)

In [None]:
# Some messages carry a year written as "0001" / "0002"; force the first
# character to "2" so that they read "2001" / "2002"
# (e.g. "0001-05-30 ..." -> "2001-05-30 ...").
# The repair is vectorised and anchored at the start of the string, so only
# the year field can trigger it.
for bad_year in ("0001", "0002"):
    is_bad = df_emails["date"].str.startswith(bad_year)
    df_emails.loc[is_bad, "date"] = "2" + df_emails.loc[is_bad, "date"].str[1:]

In [None]:
# Parse the date strings into datetimes. errors="raise" makes any string that
# does not match the format fail loudly instead of silently becoming NaT.
df_emails["timestamp"] = pd.to_datetime(df_emails["date"], format="%Y-%m-%d %H:%M:%S", errors="raise")

In [None]:
# Dry run of the training pipeline on one sender (row label 59 of the sender
# table), kept self-contained so that it works on a fresh kernel: the split
# proportion and the output vectorizer are defined here rather than borrowed
# from later cells.
train_prop = 0.7
row = df_email_senders.loc[59]
sender = row["sender"]
mids = list(map(int, row["mids"].split()))
n_mails = len(mids)
# data loading and separation (label-based lookup; seeded for reproducibility)
df_interest = df_emails.loc[mids]
df_train = df_interest.sample(frac=train_prop, random_state=0)
train_ids = list(df_train.index.values)
# data cleansing
# NOTE: `add_book` is also the global read by stem(), so rebinding it here
# changes which words the stemming tokenizer leaves untouched.
unique_rec_train = unique_recipients(df_train)
add_book = address_book(unique_rec_train)
df_train["clean body"] = clean(df_train["body"], add_book)
# feature engineering: TF-IDF of the cleaned body; hour and weekday as
# single-column sparse matrices (stacked onto X_train in a later cell)
input_bow = TfidfVectorizer(norm="l2", tokenizer=stem_tokenizer)
X_train = input_bow.fit_transform(df_train["clean body"])
hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.values).transpose()
day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.values).transpose()
# target: one count column per recipient known for this sender
output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train)
Y_train = output_bow.fit_transform(df_train["recipients"])
# model fitting
rf = RandomForestRegressor(n_estimators=15,
                           max_depth=30,
                           n_jobs=-1,
                           min_samples_leaf=max(1, int(0.0002 * n_mails)),
                           random_state=0)
rf.fit(X_train, Y_train.toarray())

In [None]:
# Inspect the training frame built for the single sender, now including the "clean body" column.
df_train

In [None]:
# Check that the text features and the hour column have compatible shapes,
# then try stacking text, hour and weekday features column-wise
# (the stacked matrix is only displayed here, not stored).
print(X_train.shape)
print(hour_train.shape)
sparse.hstack((X_train, hour_train, day_train))

In [None]:
# Train one random forest per sender on a 70% split of that sender's e-mails.
# Everything needed at evaluation time (model, vectorizers, address book,
# recipient vocabulary and the ids used for training) is stored per sender.
train_prop = 0.7
sender_models = dict()
sender_input_bow = dict()
sender_output_bow = dict()
sender_add_book = dict()
sender_rec = dict()
sender_train_ids = dict()
for index, row in df_small_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    n_mails = len(mids)
    # data loading and separation (label-based lookup; seeded so the split is
    # identical after a kernel restart)
    df_interest = df_emails.loc[mids]
    df_train = df_interest.sample(frac=train_prop, random_state=0)
    train_ids = list(df_train.index.values)
    # data cleansing
    # NOTE: `add_book` is also the global read by stem(); rebinding it here is
    # what makes the stemming tokenizer use this sender's address book.
    unique_rec_train = unique_recipients(df_train)
    add_book = address_book(unique_rec_train)
    df_train["clean body"] = clean(df_train["body"], add_book)
    # feature engineering: TF-IDF of the body + hour + weekday columns
    input_bow = TfidfVectorizer(norm="l2", tokenizer=stem_tokenizer)
    X_train = input_bow.fit_transform(df_train["clean body"])
    hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.values).transpose()
    day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.values).transpose()
    X_train = sparse.hstack((X_train, hour_train, day_train))
    # target: one count column per recipient known for this sender
    output_bow = CountVectorizer(tokenizer=split_tokenizer,
                                 vocabulary=unique_rec_train)
    Y_train = output_bow.fit_transform(df_train["recipients"])
    # model fitting
    rf = RandomForestRegressor(n_estimators=15,
                               max_depth=30,
                               n_jobs=-1,
                               min_samples_leaf=max(1, int(0.0002 * n_mails)),
                               random_state=0)
    rf.fit(X_train, Y_train.toarray())
    # attributions
    sender_models[sender] = rf
    sender_rec[sender] = unique_rec_train
    sender_add_book[sender] = add_book
    sender_input_bow[sender] = input_bow
    sender_output_bow[sender] = output_bow
    sender_train_ids[sender] = train_ids

# Evaluation

In [None]:
# Empty results table; the evaluation loop adds one row per sender.
import pandas as pd
df_precision = pd.DataFrame(columns=["sender", "n_mails", "precision"])

In [None]:
# Evaluate each per-sender model on the e-mails that were NOT used to train it
# and record the mean precision of its top-`top` recipient predictions.
top = 10
for index, row in df_small_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # model loading
    rf = sender_models[sender]
    unique_rec_train = sender_rec[sender]
    # NOTE: `add_book` is also the global read by stem(), so this restores the
    # address book the sender's tokenizer was fitted with.
    add_book = sender_add_book[sender]
    input_bow = sender_input_bow[sender]
    output_bow = sender_output_bow[sender]
    train_ids = sender_train_ids[sender]
    # data loading: held-out rows only; copy so that adding a column below
    # does not write into a view of df_emails
    df_interest = df_emails.loc[mids]
    train_mask = df_interest.index.isin(train_ids)
    df_test = df_interest[~train_mask].copy()
    n_mails = df_test.shape[0]
    if n_mails == 0:
        # Senders with very few e-mails can end up with an empty test split;
        # there is nothing to predict and the mean below would divide by zero.
        continue
    # data cleansing
    df_test["clean body"] = clean(df_test["body"], add_book)
    # feature engineering (same layout as at training time)
    X_test = input_bow.transform(df_test["clean body"])
    hour_test = sparse.csr_matrix(df_test["timestamp"].dt.hour.values).transpose()
    day_test = sparse.csr_matrix(df_test["timestamp"].dt.dayofweek.values).transpose()
    X_test = sparse.hstack((X_test, hour_test, day_test))
    # Prediction
    Y_test = rf.predict(X_test)
    # decoding: column index -> recipient (accessor name depends on the
    # scikit-learn version)
    if hasattr(output_bow, "get_feature_names_out"):
        recipients_map = list(output_bow.get_feature_names_out())
    else:
        recipients_map = output_bow.get_feature_names()
    if len(Y_test.shape) > 1 and top < Y_test.shape[1]:
        # Select the `top` best-scored columns per row without a full sort,
        # then order just those by decreasing score.
        best_pred_idx = np.argpartition(-Y_test, top, axis=1)[:, :top]
        sorted_ids = np.argsort(Y_test[np.arange(Y_test.shape[0])[:, None], best_pred_idx])[:, ::-1]
        sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]
    else:
        # Fewer candidates than `top` (or a single output): rank them all.
        sorted_idx = np.argsort(-Y_test)
    preci = 0
    for i, (index_test, row_test) in enumerate(df_test.iterrows()):
        if len(recipients_map) > 1:
            rec_ids = sorted_idx[i, :]
            rec_pred = " ".join([recipients_map[rec_id] for rec_id in rec_ids])
        else:
            rec_pred = recipients_map[0]
        preci += precision(rec_pred, row_test["recipients"])
    preci /= n_mails
    df_precision.loc[index] = [sender, n_mails, preci]

In [None]:
# Per-sender results: sender, number of held-out e-mails, mean precision.
df_precision

In [None]:
# Unweighted mean over senders: every sender counts the same, whatever its number of test e-mails.
df_precision["precision"].mean()

# Submission

## Actual model training

In [None]:
# Final models for the submission: one random forest per sender, trained on
# ALL of that sender's e-mails (no hold-out). This replaces the dictionaries
# filled by the validation training loop.
sender_models = dict()
sender_input_bow = dict()
sender_output_bow = dict()
sender_add_book = dict()
sender_rec = dict()
for index, row in df_email_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    n_mails = len(mids)
    # data loading (label-based lookup; copy so that adding a column below
    # never writes into df_emails)
    df_train = df_emails.loc[mids].copy()
    # data cleansing
    # NOTE: `add_book` is also the global read by stem(); rebinding it here is
    # what makes the stemming tokenizer use this sender's address book.
    unique_rec_train = unique_recipients(df_train)
    add_book = address_book(unique_rec_train)
    df_train["clean body"] = clean(df_train["body"], add_book)
    # feature engineering: TF-IDF of the body + hour + weekday columns
    input_bow = TfidfVectorizer(norm="l2", tokenizer=stem_tokenizer)
    X_train = input_bow.fit_transform(df_train["clean body"])
    hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.values).transpose()
    day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.values).transpose()
    X_train = sparse.hstack((X_train, hour_train, day_train))
    # target: one count column per recipient known for this sender
    output_bow = CountVectorizer(tokenizer=split_tokenizer,
                                 vocabulary=unique_rec_train)
    Y_train = output_bow.fit_transform(df_train["recipients"])
    # model fitting
    # NOTE(review): the leaf-size proportion is 0.04 here versus 0.0002 in the
    # validation loop — presumably the value retained from fine tuning; confirm.
    rf = RandomForestRegressor(n_estimators=15,
                               max_depth=30,
                               n_jobs=-1,
                               min_samples_leaf=max(1, int(0.04 * n_mails)),
                               random_state=0)
    rf.fit(X_train, Y_train.toarray())
    # attributions
    sender_models[sender] = rf
    sender_rec[sender] = unique_rec_train
    sender_add_book[sender] = add_book
    sender_input_bow[sender] = input_bow
    sender_output_bow[sender] = output_bow

## Data loading

In [None]:
# Load the test-set senders and e-mails. The "recipients" column starts empty
# and is filled in by the prediction loop.
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
df_submission["recipients"] = ""

In [None]:
# Same year repair as for the training set: dates whose year is written
# "0001" / "0002" get their first character forced to "2"
# (e.g. "0002-01-15 ..." -> "2002-01-15 ...").
# Vectorised and anchored at the start of the string, so only the year field
# can trigger it.
for bad_year in ("0001", "0002"):
    is_bad = df_submission["date"].str.startswith(bad_year)
    df_submission.loc[is_bad, "date"] = "2" + df_submission.loc[is_bad, "date"].str[1:]

In [None]:
# Parse the date strings into datetimes. errors="raise" makes any string that
# does not match the format fail loudly instead of silently becoming NaT.
df_submission["timestamp"] = pd.to_datetime(df_submission["date"], format="%Y-%m-%d %H:%M:%S", errors="raise")

In [None]:
# Predict the top-`top` recipients of every test e-mail with its sender's
# model and write the space-separated prediction into df_submission.
top = 10
for index, row in df_submission_senders.iterrows():
    sender = row["sender"]
    # Materialise the ids: a bare map() is a one-shot iterator.
    mids = list(map(int, row["mids"].split()))
    # data loading (label-based lookup; copy so that adding a column below
    # never writes into df_submission)
    df_eval = df_submission.loc[mids].copy()
    # model loading
    rf = sender_models[sender]
    unique_rec_train = sender_rec[sender]
    # NOTE: `add_book` is also the global read by stem(), so this restores the
    # address book the sender's tokenizer was fitted with.
    add_book = sender_add_book[sender]
    input_bow = sender_input_bow[sender]
    output_bow = sender_output_bow[sender]
    # data cleansing
    df_eval["clean body"] = clean(df_eval["body"], add_book)
    # feature engineering (same layout as at training time)
    X_eval = input_bow.transform(df_eval["clean body"])
    hour_eval = sparse.csr_matrix(df_eval["timestamp"].dt.hour.values).transpose()
    day_eval = sparse.csr_matrix(df_eval["timestamp"].dt.dayofweek.values).transpose()
    X_eval = sparse.hstack((X_eval, hour_eval, day_eval))
    # Prediction
    Y_eval = rf.predict(X_eval)
    # decoding: column index -> recipient (accessor name depends on the
    # scikit-learn version)
    if hasattr(output_bow, "get_feature_names_out"):
        recipients_map = list(output_bow.get_feature_names_out())
    else:
        recipients_map = output_bow.get_feature_names()
    if len(Y_eval.shape) > 1 and top < Y_eval.shape[1]:
        # Select the `top` best-scored columns per row without a full sort,
        # then order just those by decreasing score.
        best_pred_idx = np.argpartition(-Y_eval, top, axis=1)[:, :top]
        sorted_ids = np.argsort(Y_eval[np.arange(Y_eval.shape[0])[:, None], best_pred_idx])[:, ::-1]
        sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]
    else:
        # Fewer candidates than `top` (or a single output): rank them all.
        sorted_idx = np.argsort(-Y_eval)
    # Distinct loop names so the outer `index` / `row` are not shadowed.
    for i, mail_id in enumerate(df_eval.index):
        if len(recipients_map) > 1:
            rec_ids = sorted_idx[i, :]
            rec_pred = " ".join([recipients_map[rec_id] for rec_id in rec_ids])
        else:
            rec_pred = recipients_map[0]
        df_submission.at[mail_id, "recipients"] = rec_pred

# Fine tuning

In [None]:
from tools.fine_tuning import expected_precision

In [None]:
# Grid of max_depth values to try, with one precision slot per value.
depths = [2, 5, 10, 15, 20, 30, 50, 100, 300]
p = np.zeros(len(depths))
n = p.size

In [None]:
# Sweep the forest's max_depth over `depths`, storing the expected precision
# of each setting in the matching slot of `p`. The loop variable is passed to
# the helper; a constant here would make every point of the sweep identical.
for i, depth in enumerate(depths):
    p[i] = expected_precision(min_sample_prop=0.0002,
                              n_estimators=15,
                              max_depth=depth)

In [None]:
# Repeat the same configuration several times to see how much the expected
# precision varies from one run to the next.
n_trials = 10
pr = np.zeros(n_trials)
for trial in range(n_trials):
    pr[trial] = expected_precision(min_sample_prop=0.04,
                                   n_estimators=15,
                                   max_depth=30)
print(pr)

In [None]:
# Average expected precision over the repeated trials.
pr.mean()

In [None]:
# Precision measured for each value of `depths`. The x axis holds the depth
# grid, so it is labelled as such; the trailing ";" hides the text repr.
plt.plot(depths, p)
plt.xlabel("Maximum tree depth")
plt.ylabel("Precision");

## Formatting

In [None]:
# Write the predictions out through the project helper, tagged with the
# algorithm description and the team member's name.
# NOTE(review): the output location and file format are decided inside
# tools.submission.save_submission — not visible from this notebook.
save_submission(df_submission,
               algo="RF single sender regressor tf idf+ date",
               member="Zac")