In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm_notebook
import xgboost as xgb

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails
from tools.features import split_tokenizer, TwidfVectorizer, TeamVectorizer
from tools.sender_pipeline import SenderModel
from tools.submission import save_submission
from tools.staff_graph import compute_summary_graph, construct_graph, compute_teams, assign_team

# Data loading and first look

In [3]:
# Load the enriched e-mail DataFrame used by the later cells.
# The recorded output shows it being read from data/enrich_emails.csv.
df_emails = enrich_emails()

Reading dataframe from data/enrich_emails.csv


In [16]:
# Print a single e-mail (id 41308): sender, recipients, date and body.
print_email_by_id(df_emails, 41308)

From: lynn.blair@enron.com
To: steven.january@enron.com terry.kowalke@enron.com
On: 2001-07-27 17:09:14
Body:
    	FYI.  Thanks. Lynn -----Original Message-----From: 	Betancourt, Ramona   Sent:	Friday, July 27, 2001 4:42 PMTo:	Ratliff, Dale; Watson, Kimberly; Frazier, Perry; Schoolcraft, Darrell; Licciardo, Jeanne; Holmes, Bradley; Sawyer, Lisa; Kuehl, Toby; Blair, Lynn; Donoho, Lindy; Draemer, Mary Subject:	RE: Jeanne, Dale Ratliff is out of the office on vacation all next week & I wanted to help follow up on this item. I think for TW the posting it occurs after the cycle is complete. Rich  or Toby can  verify that for us. One thing that is different on TW, is our processis to confirm before we allocate. The allocation is the last process we do which produces our final or completed schedule volumes for each cycle.This is different than NNG which allocates then confirms. Also in you reference below that you need to know the schedule for posting Unsubscribed.  I want to make sure that t

In [5]:
# Per-sender table: later cells read a "sender" column and a "mids" column
# holding that sender's message ids as a whitespace-separated string.
df_email_senders = load_email_senders()

# Data exploration

In [None]:
# Explore how many e-mails were sent, using the sender table.
# NOTE(review): max_value presumably caps the plotted range — confirm in tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Explore how many e-mails were received, using the e-mail table.
# NOTE(review): max_value presumably caps the plotted range — confirm in tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Explore the length of e-mail bodies.
# NOTE(review): max_value presumably caps the plotted range — confirm in tools.data_exploration.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Explore how many recipients each e-mail has, using 20 bins.
# NOTE(review): max_value presumably caps the plotted range — confirm in tools.data_exploration.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect every distinct recipient in the e-mail table and report how many there are.
unique_rec = unique_recipients(df_emails)
n_unique_rec = len(unique_rec)
print("Number of unique recipients: {}".format(n_unique_rec))

# Testing TeamVectorizer

In [6]:
# Instantiate the project's TeamVectorizer with its default arguments.
team_vectorizer = TeamVectorizer()

In [7]:
# Fit on the whole e-mail table; the recorded output shows fit() returning a TeamVectorizer object.
team_vectorizer.fit(df_emails)

<tools.features.TeamVectorizer at 0x10e094898>

In [8]:
# Number of features exposed by the fitted vectorizer (10 in the recorded run).
team_vectorizer.n_features

10

In [21]:
# Transform the two recipients and the sender of e-mail 41308 printed earlier in the notebook.
a = team_vectorizer.transform(["steven.january@enron.com", "terry.kowalke@enron.com", "lynn.blair@enron.com"])

In [26]:
# Same transform on three other addresses — presumably to compare against `a`.
b = team_vectorizer.transform(["rick.dietz@enron.com", "raetta.zadow@enron.com", "larry.berger@enron.com"])

# Data cleansing

## Cleansing function

In [None]:
def clean_df(df_emails, df_senders):
    """Add a ``clean_body`` column to ``df_emails``, cleaning sender by sender.

    For every row of ``df_senders`` the sender's e-mails are selected by
    message id, an address book is built from the recipients of those e-mails,
    and each body is passed through ``clean`` together with that address book.

    Parameters
    ----------
    df_emails : pandas.DataFrame
        E-mail table with a ``body`` column; modified in place. Rows are
        selected by label, so the index is presumably the message id
        -- TODO confirm against the loader.
    df_senders : pandas.DataFrame
        One row per sender, with a ``mids`` column holding that sender's
        message ids as a whitespace-separated string.

    Returns
    -------
    None
        The result is written to ``df_emails["clean_body"]``. Rows that belong
        to no listed sender keep an empty string.
    """
    # Start from an empty string so rows never touched below still hold text.
    df_emails["clean_body"] = ""
    for index, row in tqdm_notebook(df_senders.iterrows(), desc="Senders cleaning", total=df_senders.shape[0]):
        mids = list(map(int, row["mids"].split()))
        # Label-based selection of this sender's e-mails. `.ix` was removed in
        # pandas 1.0; `.loc` is the accessor the write-back below already uses
        # with the very same labels.
        df_interest = df_emails.loc[mids]
        # The address book is built per sender, from that sender's recipients only.
        unique_rec_train = unique_recipients(df_interest)
        add_book = address_book(unique_rec_train)
        df_emails.loc[mids, "clean_body"] = df_interest["body"].apply(lambda x: clean(x, add_book))
    # Guard against missing values left in the column by the assignments above.
    df_emails["clean_body"] = df_emails["clean_body"].fillna("")

In [None]:
# Adds the "clean_body" column to df_emails in place (clean_df has no return value).
clean_df(df_emails, df_email_senders)

In [None]:
# Show the first 20 e-mails whose raw body contains the text "Forwarded by".
forwarded_mask = df_emails["body"].str.contains("Forwarded by")
df_emails[forwarded_mask].head(20)

# Training

In [None]:
# Reproducible 30% sample of the senders (fixed random_state).
# NOTE(review): no later cell visible in this notebook uses df_small_senders.
df_small_senders = df_email_senders.sample(frac=0.3, random_state=0)

## Current Model

In [None]:
def model(df_interest, n_mails, min_sample_prop=0.0002, n_estimators=100, max_depth=30):
    """Build (but do not train) the recipient-prediction model for one sender.

    The model combines a TF-IDF vectorizer for the input text, a count
    vectorizer over the sender's known recipients for the output, and a random
    forest regressor, all wrapped in the project's ``SenderModel``.

    Parameters
    ----------
    df_interest : pandas.DataFrame
        The e-mails of a single sender.
    n_mails : int
        Number of e-mails of that sender; scales ``min_samples_leaf``.
    min_sample_prop : float, optional
        Fraction of ``n_mails`` used as the forest's ``min_samples_leaf``
        (never below 1). Defaults to the previously hard-coded 0.0002.
    n_estimators : int, optional
        Number of trees in the forest. Defaults to the previous value, 100.
    max_depth : int, optional
        Maximum tree depth. Defaults to the previous value, 30.

    Returns
    -------
    SenderModel
        An untrained model; callers invoke ``evaluate`` or ``train`` on it.
    """
    # Input side: TF-IDF with L2-normalised rows.
    input_bow = TfidfVectorizer(norm="l2")
    # Regressor: fixed seed so repeated runs are comparable.
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_jobs=-1,
        # int() truncates, so small senders would get 0 without the floor of 1.
        min_samples_leaf=max(1, int(min_sample_prop * n_mails)),
        max_features="sqrt",
        random_state=0)
    # Output side: vocabulary restricted to recipients this sender has written to.
    unique_rec = unique_recipients(df_interest)
    output_bow = CountVectorizer(
        tokenizer=split_tokenizer,
        vocabulary=unique_rec)
    sender_model = SenderModel(
        df_emails=df_interest,
        classifier=rf,
        input_vectorizer=input_bow,
        output_vectorizer=output_bow
    )
    return sender_model

## On one particular sender

In [None]:
# Evaluate the model on the first sender only, as a quick check before the full loop.
row = df_email_senders.iloc[0]
mids = list(map(int, row["mids"].split()))
# Label-based selection; `.ix` was removed in pandas 1.0, `.loc` is its replacement here.
df_interest = df_emails.loc[mids]
n_mails = len(mids)
sender_model = model(df_interest, n_mails)
prec = sender_model.evaluate(random_state=0)
print(prec)

## Evaluation

In [None]:
# Empty results table; the evaluation loop adds one row per sender.
df_precision = pd.DataFrame(columns=["sender", "n_mails", "precision"])

In [None]:
# Evaluate one freshly built model per sender and record its precision.
for index, row in tqdm_notebook(df_email_senders.iterrows(), desc="Senders evaluation", total=df_email_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # Label-based selection; `.ix` was removed in pandas 1.0, `.loc` is its replacement here.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Precision computation (fixed seed so runs are comparable)
    prec = sender_model.evaluate(random_state=0)
    df_precision.loc[index] = [sender, n_mails, prec]

In [None]:
# Display the per-sender precision table filled by the evaluation loop.
df_precision

In [None]:
# Overall precision: per-sender precision averaged with the number of e-mails as weight.
weighted_precision_sum = (df_precision["precision"] * df_precision["n_mails"]).sum()
total_mails = df_precision["n_mails"].sum()
weighted_precision_sum / total_mails

# Submission

## Actual model training

In [None]:
# Train one model per sender on all of that sender's e-mails and keep it,
# keyed by sender address, for the prediction step.
sender_models = dict()
for index, row in tqdm_notebook(df_email_senders.iterrows(), desc="Senders Training (submision)", total=df_email_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # Label-based selection; `.ix` was removed in pandas 1.0, `.loc` is its replacement here.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Training
    sender_model.train()
    # Saving
    sender_models[sender] = sender_model

## Data loading

In [None]:
# Load the test-set sender table and e-mails.
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
# Start every test e-mail with an empty "recipients" value.
# NOTE(review): presumably filled by the prediction step — confirm in tools.sender_pipeline.
df_submission["recipients"] = ""

## Data cleansing

In [None]:
# Clean the test-set bodies in place, with the same function used for the training e-mails.
clean_df(df_submission, df_submission_senders)

## Prediction

In [None]:
# Predict recipients for every test sender with the model trained for that sender.
for index, row in tqdm_notebook(df_submission_senders.iterrows(), desc="Senders predictions", total=df_submission_senders.shape[0]):
    sender = row["sender"]
    # Materialise the ids: a bare map() is a one-shot iterator in Python 3, so
    # it would be empty after a single pass; every other cell passes a list.
    mids = list(map(int, row["mids"].split()))
    # Model loading
    sender_model = sender_models[sender]
    # Prediction. The return value is unused, so results are presumably
    # written into df_submission — confirm in tools.sender_pipeline.
    sender_model.predict(mids, df_submission)

# Fine tuning

In [None]:
from tools.fine_tuning import expected_precision

In [None]:
# Values for the depth sweep, plus an array holding one precision per value.
depths = [2, 5, 10, 15, 20, 30, 50, 100, 300]
n = len(depths)
p = np.zeros(n)

In [None]:
# Sweep max_depth over `depths`, keeping the other hyperparameters fixed.
for i, depth in enumerate(depths):
    # Pass the loop value: with a constant max_depth every point of the sweep
    # would measure the same configuration.
    p[i] = expected_precision(min_sample_prop=0.0002,
                              n_estimators=15,
                              max_depth=depth)

In [None]:
# Run the same configuration several times to see the run-to-run spread.
n_trials = 10
pr = np.array(
    [
        expected_precision(min_sample_prop=0.04,
                           n_estimators=15,
                           max_depth=30)
        for _ in range(n_trials)
    ],
    dtype=float,
)
print(pr)

In [None]:
# Average precision over the repeated trials.
pr.mean()

In [None]:
# Precision as a function of the swept depth values.
plt.plot(depths, p)
# The x data is `depths`, not a tree count, so label the axis accordingly.
plt.xlabel("Max depth")
plt.ylabel("Precision")

## Formatting

In [None]:
# Write the submission through the project helper.
# NOTE(review): algo and member presumably label the output — confirm in tools.submission.
save_submission(df_submission,
               algo="RF single sender regressor tw idf",
               member="Zac")

In [None]:
# Preview the first rows only; displaying the whole test frame bloats the notebook output.
df_submission.head()