In [42]:
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm_notebook

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails
from tools.features import split_tokenizer, TwidfVectorizer, GoWVectorizer
from tools.sender_pipeline import SenderModel
from tools.submission import save_submission

# Data loading and first look

In [2]:
# Load the e-mail table used by every later cell (the recorded run read it
# from data/enrich_emails.csv).
df_emails = enrich_emails()

Reading dataframe from data/enrich_emails.csv


In [3]:
# Show the message with id 41311 (sender, recipients, date, body) as a quick
# sanity check of the loaded table.
print_email_by_id(df_emails, 41311)

From: lynn.blair@enron.com
To: john.buchanan@enron.com lynn.blair@enron.com
On: 2001-07-26 08:02:34
Body:
    		John, would you please work with Brad Holmes and Jerry Medeles to capture all of	the system enhancements we have made on NNG since last years meeting.	Thanks. Lynn


In [4]:
# Sender table: later cells read its "sender" column and the space-separated
# message ids stored in "mids".
df_email_senders = load_email_senders()

# Data exploration

In [None]:
# Distribution of e-mails sent (going by the helper's name); max_value=300
# presumably bounds the plotted range — TODO confirm in tools.data_exploration.
emails_sent_distribution(df_email_senders, max_value=300)

In [None]:
# Distribution of e-mails received (going by the helper's name); max_value=200
# presumably bounds the plotted range — TODO confirm in tools.data_exploration.
emails_received_distribution(df_emails, max_value=200)

In [None]:
# Distribution of message body lengths (going by the helper's name);
# max_value=10000 presumably bounds the plotted range — TODO confirm.
body_length_distribution(df_emails, max_value=10000)

In [None]:
# Distribution of the number of recipients per message (going by the helper's
# name), drawn with 20 bins; max_value=100 presumably bounds the range — TODO confirm.
number_of_recipients_distribution(df_emails, bins=20, max_value=100)

In [None]:
# Collect the recipients found in df_emails and report how many there are.
unique_rec = unique_recipients(df_emails)
print("Number of unique recipients: {}".format(len(unique_rec)))

# Data cleansing

## Cleansing function

In [5]:
def clean_df(df_emails, df_senders):
    """Fill ``df_emails["clean_body"]`` in place, one sender at a time.

    For each row of ``df_senders`` the sender's messages are selected by
    label, an address book is built from the recipients of those messages,
    and every body is passed through ``clean`` together with that book.

    Args:
        df_emails: e-mail frame indexed by message id, with a ``"body"``
            column (and whatever ``unique_recipients`` reads). It is
            modified in place: a ``"clean_body"`` column is created or reset.
        df_senders: frame with a ``"mids"`` column holding space-separated
            message ids for each sender.

    Returns:
        None. The result is written into ``df_emails``.
    """
    # Start from an empty string everywhere so rows of senders that are not
    # listed in df_senders still get a valid (empty) clean body.
    df_emails["clean_body"] = ""
    for _, row in tqdm_notebook(df_senders.iterrows(), desc="Senders cleaning", total=df_senders.shape[0]):
        mids = list(map(int, row["mids"].split()))
        # Label-based selection. `.ix` was removed in pandas 1.0; `.loc` is
        # what the assignment below already relies on.
        df_interest = df_emails.loc[mids]
        # The address book is built per sender, from that sender's recipients only.
        unique_rec_train = unique_recipients(df_interest)
        add_book = address_book(unique_rec_train)
        df_emails.loc[mids, "clean_body"] = df_interest["body"].apply(lambda x: clean(x, add_book))
    # Guard against NaN left behind by the index-aligned assignment above.
    df_emails["clean_body"] = df_emails["clean_body"].fillna("")

In [6]:
# Populate df_emails["clean_body"] in place (clean_df returns nothing).
clean_df(df_emails, df_email_senders)




# Training

In [8]:
# Reproducible 30% sample of the senders (random_state=0).
# NOTE(review): no later cell visible here uses df_small_senders — confirm it is still needed.
df_small_senders = df_email_senders.sample(frac=0.3, random_state=0)

## Current Model

In [93]:
def model(df_interest, n_mails):
    """Assemble an untrained ``SenderModel`` for a single sender.

    Args:
        df_interest: the sender's e-mails; handed to ``unique_recipients``
            and to ``SenderModel`` as ``df_emails``.
        n_mails: number of e-mails of that sender; scales the forest's
            ``min_samples_leaf``.

    Returns:
        The configured ``SenderModel`` instance.
    """
    # Output side: count vectorizer whose vocabulary is fixed to the
    # recipients seen in this sender's mails.
    recipient_vocabulary = unique_recipients(df_interest)
    output_bow = CountVectorizer(tokenizer=split_tokenizer, vocabulary=recipient_vocabulary)

    # Input side: raw term counts (a CountVectorizer, not a TF-IDF weighting).
    input_bow = CountVectorizer()

    # Leaf size grows with the amount of data (0.02% of the mails) but never
    # drops below a single sample.
    leaf_size = max(1, int(0.0002*n_mails))
    rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=75,
        n_jobs=-1,
        min_samples_leaf=leaf_size,
        max_features="sqrt",
        random_state=0)

    return SenderModel(
        df_emails=df_interest,
        classifier=rf,
        input_vectorizer=input_bow,
        output_vectorizer=output_bow)

## On one particular sender

In [94]:
# Try the pipeline on the first sender before looping over all of them.
row = df_email_senders.iloc[0]
mids = list(map(int, row["mids"].split()))
# Label-based lookup of the sender's messages; `.ix` was removed in pandas 1.0.
df_interest = df_emails.loc[mids]
n_mails = len(mids)
sender_model = model(df_interest, n_mails)
# Fixed random_state keeps the reported precision reproducible.
prec = sender_model.evaluate(random_state=0)
print(prec)

0.635842620736


## Evaluation

In [95]:
# Empty result table; the evaluation loop adds one row per sender.
df_precision = pd.DataFrame(columns=["sender", "n_mails", "precision"])

In [96]:
# Evaluate one freshly built model per sender and record its precision.
for index, row in tqdm_notebook(df_email_senders.iterrows(), desc="Senders evaluation", total=df_email_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # Label-based lookup; `.ix` was removed in pandas 1.0.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Precision computation (fixed seed for reproducibility)
    prec = sender_model.evaluate(random_state=0)
    # n_mails is stored so the precisions can later be averaged with mail-count weights.
    df_precision.loc[index] = [sender, n_mails, prec]

  self.classifier.fit(X_train, Y_train.toarray())
  self.classifier.fit(X_train, Y_train.toarray())
  self.classifier.fit(X_train, Y_train.toarray())





In [97]:
# Overall precision: per-sender precisions averaged with each sender's
# number of mails as weight.
mail_counts = df_precision["n_mails"]
(df_precision["precision"] * mail_counts).sum() / mail_counts.sum()

0.51448870141092262

# Submission

## Actual model training

In [15]:
# Fit one model per sender on all of that sender's mails and keep the fitted
# models keyed by sender, for the prediction step.
sender_models = {}
for index, row in tqdm_notebook(df_email_senders.iterrows(), desc="Senders Training (submision)", total=df_email_senders.shape[0]):
    # Row unpacking
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # Label-based lookup; `.ix` was removed in pandas 1.0.
    df_interest = df_emails.loc[mids]
    n_mails = len(mids)
    # Model building
    sender_model = model(df_interest, n_mails)
    # Training
    sender_model.train()
    # Saving
    sender_models[sender] = sender_model

  self.classifier.fit(X_train, Y_train.toarray())
  self.classifier.fit(X_train, Y_train.toarray())
  self.classifier.fit(X_train, Y_train.toarray())





## Data loading

In [16]:
# Test-set counterparts of the sender and e-mail tables.
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
# Start with an empty recipients column; presumably filled by the prediction
# step further down — TODO confirm in SenderModel.predict.
df_submission["recipients"] = ""

## Data cleansing

In [17]:
# Same cleansing pass as for the training data, applied in place to the test e-mails.
clean_df(df_submission, df_submission_senders)




## Prediction

In [18]:
# Predict recipients for each test sender with the model trained for that sender.
for index, row in tqdm_notebook(df_submission_senders.iterrows(), desc="Senders predictions", total=df_submission_senders.shape[0]):
    sender = row["sender"]
    # Materialise the ids as a list, as every other cell does: a bare map()
    # is a one-shot iterator in Python 3 and would be empty on a second pass
    # or fail when used as an indexer.
    mids = list(map(int, row["mids"].split()))
    # Model loading
    sender_model = sender_models[sender]
    # Prediction. The return value is not captured; df_submission is passed in
    # and presumably updated in place — TODO confirm in SenderModel.predict.
    sender_model.predict(mids, df_submission)




# Fine tuning

In [None]:
from tools.fine_tuning import expected_precision

In [None]:
# Tree depths to try, and one (initially zero) precision slot per depth.
depths = [2, 5, 10, 15, 20, 30, 50, 100, 300]
n = len(depths)
p = np.zeros(shape=n, dtype=float)

In [None]:
# Sweep over the candidate depths. The swept value has to be passed through:
# with max_depth hard-coded, every entry of p measures the same configuration.
for i, depth in enumerate(depths):
    p[i] = expected_precision(min_sample_prop=0.0002,
                             n_estimators=15,
                             max_depth=depth)

In [None]:
# Run the same configuration several times to see how much the expected
# precision varies from one run to the next.
n_trials = 10
pr = np.zeros(n_trials)
for i in range(n_trials):
    # Each slot starts at zero, so adding the score simply stores it.
    pr[i] = pr[i] + expected_precision(max_depth=30, n_estimators=15, min_sample_prop=0.04)
print(pr)

In [None]:
# Average expected precision over the repeated trials.
np.mean(pr)

In [None]:
# Precision as a function of the swept tree depth.
plt.plot(depths, p)
# The x values are tree depths, not tree counts, so label the axis accordingly.
plt.xlabel("Max depth")
# Trailing semicolon keeps the Text repr out of the cell output.
plt.ylabel("Precision");

## Formatting

In [19]:
# Write the submission; the recorded run returned a file name made of algo,
# member and a timestamp.
# NOTE(review): the label says "tw idf" but `model` above builds plain
# CountVectorizer inputs — confirm the label matches the model actually used.
save_submission(df_submission,
               algo="RF single sender regressor tw idf",
               member="Zac")

'RF single sender regressor tw idf_Zac_1489065686.1829622.csv'

In [20]:
# Preview only the first rows instead of dumping the whole frame into the
# notebook output.
df_submission.head(10)

Unnamed: 0_level_0,date,body,recipients,clean_body
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1577,2001-11-19 06:59:51,Note: Stocks of heating oil are very high for...,kevin.hyatt@enron.com lindy.donoho@enron.com m...,not stock heat oil high wint affect northeast ...
1750,2002-03-05 08:46:57,"Kevin Hyatt and I are going for ""sghetti"" at S...",no.address@enron.com michelle.lokay@enron.com ...,kevin hyat going sghetti spaghett wareh today ...
1916,2002-02-13 14:17:39,This was forwarded to me and it is funny. - Wi...,no.address@enron.com michelle.lokay@enron.com ...,forward funny witc enron voic mail system htm
2094,2002-01-22 11:33:56,I will be in to and happy to assist too. I ma...,no.address@enron.com louise.kitchen@enron.com ...,happy assist may gon febru th trip confirm yet
2205,2002-01-11 07:12:19,Thanks. I needed a morning chuckle.,no.address@enron.com danny.mccarty@enron.com t...,thank nee morn chuckl
2297,2002-01-11 14:37:19,Note: Westpath Expansion plans filed at NEBTr...,kevin.hyatt@enron.com lorraine.lindberg@enron....,not westpa expand plan fil nebtranscanad plan ...
5300,2001-11-26 14:13:01,Here are Peggy s slides. -----Original Message...,peggy.fowler@enron.com aaron.brown@enron.com m...,peggy slid origin mess peggy fowl peggy fowl p...
5333,2001-11-19 13:44:18,Here s the information. -----Original Message-...,rod.hayslett@enron.com tracy.geaccone@enron.co...,inform origin mess tayl liz sent monday novemb...
6583,2002-01-18 05:00:48,I would like to know where and how this is goi...,rich.jolly@enron.com controllers.dl-ets@enron....,would lik know going affect us paramet kim kno...
7460,2001-11-12 16:43:31,"Richard: Per Elliot s e-mail below, do you hav...",jeff.dasovich@enron.com richard.ring@enron.com...,richard per elliot e mail access list ex ee cu...
