In [1]:
%matplotlib inline
from collections import Counter
import math

import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm_notebook

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails
from tools.features import split_tokenizer, TwidfVectorizer, GoWVectorizer
from tools.sender_pipeline import SenderModel
from tools.submission import save_submission

# Data loading and first look

In [2]:
# Load the e-mail DataFrame through the project helper. In the recorded run it
# was read from data/enrich_emails.csv (see the cell output below).
df_emails = enrich_emails()

Reading dataframe from data/enrich_emails.csv


In [None]:
# Spot-check one e-mail; 41311 is the id handed to the project helper.
print_email_by_id(df_emails, 41311)

In [3]:
# Sender table. Later cells read its "sender" column and its "mids" column,
# a whitespace-separated string of integer message ids.
df_email_senders = load_email_senders()

# Data exploration

In [None]:
# Presumably the distribution of e-mails sent per sender, capped at 300 —
# TODO confirm against tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Presumably the distribution of e-mails received per recipient, capped at
# 200 — TODO confirm against tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Presumably the distribution of e-mail body lengths, capped at 10000 —
# TODO confirm against tools.data_exploration.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Presumably the distribution of the number of recipients per e-mail
# (20 bins, capped at 100) — TODO confirm against tools.data_exploration.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect the distinct recipients over the whole training frame and report
# how many there are.
unique_rec = unique_recipients(df_emails)
n_unique_recipients = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_recipients))

# Data cleansing

## Cleansing function

In [4]:
# Passed to `clean` as its `important_stopwords` argument.
important_stopwords = {
    # masculine forms
    "he", "his", "him", "himself",
    # feminine forms
    "she", "her", "hers", "herself",
}
# Passed to `clean` as its `additional_stopwords` argument.
additional_stopwords = {
    "fyi", "x", "filename",
    "forwarded", "forward",
    "original", "message", "sent",
}
def clean_df(df_emails, df_senders):
    """Fill ``df_emails["clean_body"]`` in place, one sender at a time.

    For every row of ``df_senders``, the whitespace-separated integer ids in
    its ``"mids"`` field select that sender's e-mails by index label. An
    address book is built from the recipients of those e-mails and passed to
    ``clean`` together with the module-level ``important_stopwords`` and
    ``additional_stopwords`` sets.

    Parameters
    ----------
    df_emails : pandas.DataFrame
        E-mails whose index labels are the message ids and which has a
        ``"body"`` column. Mutated in place: the ``"clean_body"`` column is
        reset to empty strings and then filled.
    df_senders : pandas.DataFrame
        One row per sender with a ``"mids"`` column.

    Returns
    -------
    None
    """
    df_emails["clean_body"] = ""
    sender_rows = tqdm_notebook(
        df_senders.iterrows(), desc="Senders cleaning", total=df_senders.shape[0])
    for _, row in sender_rows:
        mids = [int(mid) for mid in row["mids"].split()]
        # Label-based selection. `DataFrame.ix` was removed in pandas 1.0, and
        # the assignment below already addresses the same rows through `.loc`.
        df_interest = df_emails.loc[mids]
        # The address book is rebuilt per sender from that sender's recipients.
        add_book = address_book(unique_recipients(df_interest))
        df_emails.loc[mids, "clean_body"] = df_interest["body"].apply(
            lambda body: clean(
                body,
                add_book,
                important_stopwords=important_stopwords,
                additional_stopwords=additional_stopwords))
    # Replace any NaN left in the column with an empty string.
    df_emails["clean_body"] = df_emails["clean_body"].fillna("")

In [5]:
# Adds/overwrites the "clean_body" column of df_emails in place.
clean_df(df_emails, df_email_senders)




# Training

## Received numbers

In [6]:
# Join every "recipients" string with spaces, split the result on whitespace
# and count how often each token occurs over the whole frame.
recipients_blob = df_emails["recipients"].str.cat(sep=" ")
all_recipients = recipients_blob.split()
recipients_count = Counter()
recipients_count.update(all_recipients)

## Current Model

In [35]:
def model(df_interest, n_mails):
    """Assemble a ``SenderModel`` for one sender; no training happens here.

    Parameters
    ----------
    df_interest : pandas.DataFrame
        The e-mails the model is built on (callers pass the rows of a single
        sender).
    n_mails : int
        Number of e-mails of that sender. It caps the input vocabulary size
        and sets the number of trees (``n_mails // 10``, clamped to 1..100).

    Returns
    -------
    SenderModel
        Wired with the vectorizers and estimators built below, and with the
        module-level ``recipients_count`` passed as ``n_received``.
    """
    # Input bag-of-words vectorizer (raw term counts, not TF-IDF)
    input_bow = CountVectorizer(max_features=n_mails)
    # Random Forest Regressor. `n_mails // 10` is 0 for senders with fewer
    # than 10 mails and scikit-learn rejects n_estimators=0, so keep >= 1 tree.
    rf = RandomForestRegressor(
        n_estimators=max(1, min(n_mails // 10, 100)),
        max_depth=100,
        n_jobs=-1,
        max_features="sqrt",
        random_state=0)
    # Logistic regression used for reranking
    lr_rerank = LogisticRegression(
        C=1,
        class_weight="balanced",
        n_jobs=-1,
        random_state=0
    )
    # Output vectorizer: vocabulary fixed to the recipients seen in df_interest
    unique_rec = unique_recipients(df_interest)
    output_bow = CountVectorizer(
        tokenizer=split_tokenizer,
        vocabulary=unique_rec)
    # Sender Model
    sender_model = SenderModel(
        df_emails=df_interest,
        classifier=rf,
        reranking_classifier=lr_rerank,
        input_vectorizer=input_bow,
        output_vectorizer=output_bow,
        n_received=recipients_count
    )
    return sender_model

## On one particular sender

In [36]:
# Sanity check on the first sender listed in df_email_senders.
row = df_email_senders.iloc[0]
mids = [int(mid) for mid in row["mids"].split()]
# Label-based selection; `DataFrame.ix` was removed in pandas 1.0.
df_interest = df_emails.loc[mids]
n_mails = len(mids)
sender_model = model(df_interest, n_mails)
prec = sender_model.evaluate(reranking=True)
print(prec)

0.599868287741


## Evaluation

In [37]:
# Evaluate on a random 30% of the senders; the fixed seed keeps the sample
# identical between runs.
df_small_senders = df_email_senders.sample(frac=0.3, random_state=0)

In [38]:
# Schema of the per-sender evaluation table: sender, number of mails, precision.
df_precision = pd.DataFrame(columns=["sender", "n_mails", "precision"])

In [None]:
# Evaluate one model per sampled sender. Results are collected in plain lists
# and turned into `df_precision` once at the end, instead of growing the frame
# row by row; rows stay keyed by the sender's index in df_small_senders.
precision_index = []
precision_rows = []
for index, row in tqdm_notebook(df_small_senders.iterrows(), desc="Senders evaluation", total=df_small_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = [int(mid) for mid in row["mids"].split()]
    # Label-based selection; `DataFrame.ix` was removed in pandas 1.0.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Precision computation
    prec = sender_model.evaluate(reranking=False)
    precision_index.append(index)
    precision_rows.append((sender, n_mails, prec))
df_precision = pd.DataFrame(
    precision_rows, index=precision_index, columns=["sender", "n_mails", "precision"])

In [None]:
# Mean precision over the evaluated senders, weighted by each sender's number
# of mails.
precision_weights = df_precision["n_mails"]
weighted_precision = (df_precision["precision"] * precision_weights).sum() / (precision_weights.sum())
weighted_precision

In [None]:
# Per-sender precision table (one row per evaluated sender).
df_precision

# Submission

## Actual model training

In [None]:
# Fit one model per sender on all of that sender's training e-mails and keep
# the fitted models keyed by sender.
sender_models = dict()
for index, row in tqdm_notebook(df_email_senders.iterrows(), desc="Senders Training (submision)", total=df_email_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = [int(mid) for mid in row["mids"].split()]
    # Label-based selection; `DataFrame.ix` was removed in pandas 1.0.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Training
    sender_model.train()
    # Saving
    sender_models[sender] = sender_model

## Data loading

In [None]:
# Test-set counterparts of the sender table and the e-mail frame.
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
# Set the "recipients" column to empty strings; `clean_df` reads it through
# `unique_recipients`.
df_submission["recipients"] = ""

## Data cleansing

In [None]:
# Adds/overwrites the "clean_body" column of df_submission in place.
clean_df(df_submission, df_submission_senders)

## Prediction

In [None]:
# For each sender of the test set: rebuild and fit the model on that sender's
# training e-mails, then predict on the sender's test e-mails.
# NOTE(review): the "Actual model training" cell already stores one fitted
# model per sender in `sender_models`; this loop refits every model from
# scratch. Consider reusing the stored models.
for index, row in tqdm_notebook(df_submission_senders.iterrows(), desc="Senders predictions", total=df_submission_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    # Training message ids of this sender (first matching row of the training sender table)
    train_mids_field = df_email_senders.loc[df_email_senders["sender"] == sender, "mids"].iloc[0]
    mids_train = [int(mid) for mid in train_mids_field.split()]
    # Label-based selection; `DataFrame.ix` was removed in pandas 1.0.
    df_interest = df_emails.loc[mids_train]
    n_mails = len(mids_train)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Training
    sender_model.train()
    # Prediction
    mids_sub = [int(mid) for mid in row["mids"].split()]
    sender_model.predict(mids_sub, df_submission)

## Formatting

In [None]:
# Hand the prediction frame to the project's submission helper; `algo` and
# `member` are passed through as given (presumably labels for the output —
# TODO confirm against tools.submission).
save_submission(df_submission,
               algo="RF single sender regressor bow reranked",
               member="Zac")

In [None]:
# Show only the first rows: the full test frame is large and its cells hold
# raw e-mail content that should not be dumped into the saved notebook.
df_submission.head()