In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tools.data_cleansing import clean
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book, load_emails,\
unique_domain_names, name_to_address, mail_body_orig_message
from tools.evaluation import precision
from tools.submission import save_submission

# Loading data

In [3]:
df_emails = enrich_emails()
df_emails.head()

Reading dataframe from data/enrich_emails.csv


Unnamed: 0_level_0,date,body,recipients,sender
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60,2000-07-25 08:14:00,Legal has been assessing the risks of doing bl...,robert.badeer@enron.com murray.o neil@enron.co...,christian.yoder@enron.com
66,2000-08-03 02:56:00,Attached is a spreadsheet to estimate export f...,kim.ward@enron.com robert.badeer@enron.com mur...,heather.dunton@enron.com
74,2000-08-15 05:37:00,Kevin/Bob: Here is a quick rundown on the cons...,robert.badeer@enron.com john.massey@enron.com ...,janel.guerrero@enron.com
80,2000-08-20 14:12:00,check this out and let everyone know what s up...,robert.badeer@enron.com jeff.richter@enron.com,tim.belden@enron.com
83,2000-08-22 08:17:00,Further to your letter to us (addressed to Mr....,pgillman@schiffhardin.com kamarlantes@calpx.co...,christian.yoder@enron.com


In [4]:
print_email_by_id(df_emails, 83)

From: christian.yoder@enron.com
To: pgillman@schiffhardin.com kamarlantes@calpx.com robert.badeer@enron.com sewilson@calpx.com
On: 2000-08-22 08:17:00
Body:
    Further to your letter to us (addressed to Mr. Tim Belden)  dated August 14, 2000:  Enron thinks that the elimination of physical risk during the month of August will be of commercial benefit because Enron expects that during the month of August there will be transmission line derations affecting the hour ahead market which will lead to TO debit charges by the CAISO.   The elimination of  TO debit charges is a commercial benefit to Enron.


In [4]:
df_email_senders = load_email_senders(set_type="training")
df_email_senders.head()

Unnamed: 0,sender,mids
0,karen.buckley@enron.com,158713 158697 200301 158679 278595 298162 2002...
1,amr.ibrahim@enron.com,215241 3437 215640 3506 191790 3517 3520 3562 ...
2,andrea.ring@enron.com,270705 270706 270707 270708 270709 270710 2707...
3,sylvia.hu@enron.com,111444 111422 183084 111412 111347 110883 1105...
4,phillip.platter@enron.com,327074 327384 327385 264443 274124 274125 2741...


# Tokenizing functions

In [5]:
unique_rec_train = unique_recipients(df_emails)
add_book = address_book(unique_rec_train)
add_book.add("fyi")

In [6]:
st = LancasterStemmer()
def stem(word):
    if word in add_book:
        return word
    else:
        return(st.stem(word))
def stem_tokenizer(s):
    return [stem(word) for word in s.split(" ")]

In [7]:
def split_tokenizer(s):
        return s.split(" ")

# Training

In [8]:
df_small_senders = df_email_senders.sample(frac=0.3)

In [9]:
for index, row in df_emails[df_emails["date"].str.contains("0001")].iterrows():
    old_date = df_emails.loc[index, "date"]
    new_date = "2"+old_date[1:]
    df_emails.loc[index, "date"] = new_date
for index, row in df_emails[df_emails["date"].str.contains("0002")].iterrows():
    old_date = df_emails.loc[index, "date"]
    new_date = "2"+old_date[1:]
    df_emails.loc[index, "date"] = new_date

In [10]:
df_emails["timestamp"] = pd.to_datetime(df_emails["date"], format="%Y-%m-%d %H:%M:%S", errors="raise")

In [11]:
from scipy import sparse
# import just one row in variable 'row'
row = df_email_senders.loc[59]
sender = row["sender"]
mids = list(map(int, row["mids"].split()))
n_mails = len(mids)
# data loading and separation

In [12]:
print(n_mails)
print(sender)

1099
phillip.m.love@enron.com


In [13]:
train_prop = 0.7
# data loading and separation
df_interest = df_emails.ix[mids]
#df_interest.head()
df_train = df_interest.sample(frac=train_prop)
train_ids = list(df_train.index.values)

In [38]:
print(train_ids)

[288750, 288676, 289264, 288949, 288804, 288751, 288864, 289061, 289300, 335948, 288661, 384728, 289277, 288935, 288755, 288660, 289275, 288735, 288846, 289240, 288519, 289102, 289243, 289040, 288625, 288856, 288760, 289002, 288783, 288607, 289204, 288534, 291338, 289254, 288776, 289182, 289321, 288961, 291116, 288785, 291355, 291323, 288689, 288975, 291310, 291246, 289095, 289074, 289033, 288487, 289246, 289131, 288556, 159539, 289076, 288546, 288734, 288584, 288827, 288733, 289235, 291307, 288983, 288744, 289209, 288940, 288490, 288740, 288483, 288637, 289247, 291244, 288987, 288972, 288719, 289256, 288913, 267309, 291311, 289253, 289014, 288798, 289007, 288927, 288784, 291250, 282082, 291316, 292041, 291297, 289067, 288605, 288616, 291367, 288621, 288643, 288721, 288958, 288623, 289198, 288782, 289093, 288844, 291344, 288777, 289217, 288813, 289223, 288882, 288575, 288976, 288552, 288791, 289268, 291277, 291098, 288893, 288480, 288836, 291231, 288528, 288952, 288841, 289126, 288815,

In [16]:
# data cleansing
unique_rec_train = unique_recipients(df_train)
add_book = address_book(unique_rec_train)
df_train["clean body"] = df_train["body"].apply(lambda x : clean(x, except_words=add_book))

In [17]:
# feature engineering
input_bow = TfidfVectorizer(norm="l2",
                       tokenizer=stem_tokenizer)
X_train = input_bow.fit_transform(df_train["clean body"])
hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.as_matrix()).transpose()
day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.as_matrix()).transpose()
output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train)
Y_train = output_bow.fit_transform(df_train["recipients"])
# model fitting
rf = RandomForestRegressor(n_estimators=15,
                       max_depth=30,
                       n_jobs=-1,
                       min_samples_leaf=max(1,int(0.0002*n_mails)))
rf.fit(X_train, Y_train.toarray())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=15, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

# Zac's idea

In [19]:
train_prop = 0.7
df_small_senders = df_email_senders.sample(frac=0.3)
sender_models = dict()
sender_input_bow = dict()
sender_output_bow = dict()
sender_add_book = dict()
sender_rec = dict()
sender_train_ids = dict()
for index, row in df_small_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    n_mails = len(mids)
    # data loading and separation
    df_interest = df_emails.ix[mids]
    df_train = df_interest.sample(frac=train_prop)
    train_ids = list(df_train.index.values)
    # data cleansing
    unique_rec_train = unique_recipients(df_train)
    add_book = address_book(unique_rec_train)
    df_train["clean body"] = df_train["body"].apply(lambda x : clean(x, except_words=add_book))
    # feature engineering
    input_bow = TfidfVectorizer(norm="l2",
                           tokenizer=stem_tokenizer)
    X_train = input_bow.fit_transform(df_train["clean body"])
    hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.as_matrix()).transpose()
    day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.as_matrix()).transpose()
    X_train = sparse.hstack((X_train, hour_train, day_train))
    output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train)
    Y_train = output_bow.fit_transform(df_train["recipients"])
    # model fitting
    rf = RandomForestRegressor(n_estimators=15,
                           max_depth=30,
                           n_jobs=-1,
                           min_samples_leaf=max(1,int(0.0002*n_mails)))
    rf.fit(X_train, Y_train.toarray())
    # attributions
    sender_models[sender] = rf
    sender_rec[sender] = unique_rec_train
    sender_add_book[sender] = add_book
    sender_input_bow[sender] = input_bow
    sender_output_bow[sender] = output_bow
    sender_train_ids[sender] = train_ids



# Evaluation

In [20]:
df_precision = pd.DataFrame(columns=["sender", "n_mails", "precision"])

In [21]:
top = 10
for index, row in df_small_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    # model loading
    rf = sender_models[sender]
    unique_rec_train = sender_rec[sender]
    add_book = sender_add_book[sender]
    input_bow = sender_input_bow[sender]
    output_bow = sender_output_bow[sender]
    train_ids = sender_train_ids[sender]
    # data loading
    df_interest = df_emails.ix[mids]
    train_mask = df_interest.index.isin(train_ids)
    df_test = df_interest[~train_mask]
    n_mails = df_test.shape[0]
    # data cleansing
    df_test["clean body"] = df_test["body"].apply(lambda x : clean(x, except_words=add_book))
    # feature engineering
    X_test = input_bow.transform(df_test["clean body"])
    hour_test = sparse.csr_matrix(df_test["timestamp"].dt.hour.as_matrix()).transpose()
    day_test = sparse.csr_matrix(df_test["timestamp"].dt.dayofweek.as_matrix()).transpose()
    X_test = sparse.hstack((X_test, hour_test, day_test))
    # Prediction
    Y_test = rf.predict(X_test)
    #decoding
    recipients_map = output_bow.get_feature_names()
    if len(Y_test.shape) > 1 and top < Y_test.shape[1]:
        best_pred_idx = np.argpartition(-Y_test, top, axis=1)[:,:top]
        sorted_ids = np.argsort(Y_test[np.arange(Y_test.shape[0])[:, None], best_pred_idx])[:,::-1]
        sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]
    else:
        sorted_idx = np.argsort(-Y_test)
    preci = 0
    for index_test, row_test in df_test.iterrows():
        i = df_test.index.get_loc(index_test)
        if len(recipients_map) > 1:
            rec_ids = sorted_idx[i, :]
            rec_pred = " ".join([recipients_map[rec_id] for rec_id in rec_ids])
        else:
            rec_pred = recipients_map[0]
        preci += precision(rec_pred, row_test["recipients"])
    preci /= n_mails
    df_precision.loc[index] = [sender, n_mails, preci]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
from tools.data_cleansing import remove_after_indicator
remove_after_indicator(df_test.loc[252208, "body"], "Original Message")

'X-FileName: john zufferli 6-26-02.PST -----'

In [31]:
df_test.loc[252208, "clean body"]

'x filenam john zufferl pst origin mess rit fred fred rit eal ab ca enron sent tuesday jun amto charlesm epc ca dal mitchel e mail david tesk e mail doug hea e mail frayn donaldson e mail frit de kiewit e mail gary hagerty e mail gary k halabut e mail geoff wagn e mail jim oosterba e mail john maniawsk e mail zufferl john ken wagn e mail doug castellino transcanad com peter karl e mail rick hendrickson e mail rick huery e mail ron raymond e mail tara j konowalec e mail wayn symington e mail kri yadav pancanad ca greg lingelbach transalt com sandy con e mail hly enmax com dan ruiu e mail cc trav shelleysubject gen maint coordin group meet jun wish conf meet pleasenot follow outsid calgary would cal ent cod pound key calgary would cal ent cod poundkey let know quest thank fred rit p eng techn serv esb albert ltd phon cel web sit www eal ab ca http www eal ab ca origin mess rit fred sent friday jun pm charlesm epc ca dal mitchel e mail david tesk e mail doug hea e mail frayn donaldson e m

# Submission

## Actual model training

In [46]:
sender_models = dict()
sender_input_bow = dict()
sender_output_bow = dict()
sender_add_book = dict()
sender_rec = dict()
for index, row in df_email_senders.iterrows():
    sender = row["sender"]
    mids = list(map(int, row["mids"].split()))
    n_mails = len(mids)
    # data loading
    df_train = df_emails.ix[mids]
    # data cleansing
    unique_rec_train = unique_recipients(df_train)
    add_book = address_book(unique_rec_train)
    df_train["clean body"] = clean(df_train["body"], add_book)
    # feature engineering
    input_bow = TfidfVectorizer(norm="l2",
                           tokenizer=stem_tokenizer)
    X_train = input_bow.fit_transform(df_train["clean body"])
    hour_train = sparse.csr_matrix(df_train["timestamp"].dt.hour.as_matrix()).transpose()
    day_train = sparse.csr_matrix(df_train["timestamp"].dt.dayofweek.as_matrix()).transpose()
    X_train = sparse.hstack((X_train, hour_train, day_train))
    output_bow = CountVectorizer(tokenizer=split_tokenizer,
                             vocabulary=unique_rec_train)
    Y_train = output_bow.fit_transform(df_train["recipients"])
    # model fitting
    rf = RandomForestRegressor(n_estimators=15,
                           max_depth=30,
                           n_jobs=-1,
                           min_samples_leaf=max(1,int(0.04*n_mails)))
    rf.fit(X_train, Y_train.toarray())
    # attributions
    sender_models[sender] = rf
    sender_rec[sender] = unique_rec_train
    sender_add_book[sender] = add_book
    sender_input_bow[sender] = input_bow
    sender_output_bow[sender] = output_bow



## Data loading

In [48]:
df_submission_senders = load_email_senders(set_type="test")
df_submission = load_emails(set_type="test")
df_submission["recipients"] = ""

In [49]:
for index, row in df_submission[df_submission["date"].str.contains("0001")].iterrows():
    old_date = df_submission.loc[index, "date"]
    new_date = "2"+old_date[1:]
    df_submission.loc[index, "date"] = new_date
for index, row in df_submission[df_submission["date"].str.contains("0002")].iterrows():
    old_date = df_submission.loc[index, "date"]
    new_date = "2"+old_date[1:]
    df_submission.loc[index, "date"] = new_date

In [50]:
df_submission["timestamp"] = pd.to_datetime(df_submission["date"], format="%Y-%m-%d %H:%M:%S", errors="raise")

In [51]:
top = 10
for index, row in df_submission_senders.iterrows():
    sender = row["sender"]
    mids = map(int, row["mids"].split())
    # data loading
    df_eval = df_submission.ix[mids]
    # model loading
    rf = sender_models[sender]
    unique_rec_train = sender_rec[sender]
    add_book = sender_add_book[sender]
    input_bow = sender_input_bow[sender]
    output_bow = sender_output_bow[sender]
    # data cleansing
    df_eval["clean body"] = clean(df_eval["body"], add_book)
    # feature engineering
    X_eval = input_bow.transform(df_eval["clean body"])
    hour_eval = sparse.csr_matrix(df_eval["timestamp"].dt.hour.as_matrix()).transpose()
    day_eval = sparse.csr_matrix(df_eval["timestamp"].dt.dayofweek.as_matrix()).transpose()
    X_eval = sparse.hstack((X_eval, hour_eval, day_eval))
    # Prediction
    Y_eval = rf.predict(X_eval)
    #decoding
    recipients_map = output_bow.get_feature_names()
    if len(Y_eval.shape) > 1 and top < Y_eval.shape[1]:
        best_pred_idx = np.argpartition(-Y_eval, top, axis=1)[:,:top]
        sorted_ids = np.argsort(Y_eval[np.arange(Y_eval.shape[0])[:, None], best_pred_idx])[:,::-1]
        sorted_idx = best_pred_idx[np.arange(best_pred_idx.shape[0])[:, None], sorted_ids]
    else:
        sorted_idx = np.argsort(-Y_eval)
    for index, row in df_eval.iterrows():
        i = df_eval.index.get_loc(index)
        if len(recipients_map) > 1:
            rec_ids = sorted_idx[i, :]
            rec_pred = " ".join([recipients_map[rec_id] for rec_id in rec_ids])
        else:
            rec_pred = recipients_map[0]
        df_submission.set_value(index, "recipients", rec_pred)

In [52]:
save_submission(df_submission,
               algo="RF single sender regressor tf idf + date",
               member="Pierre")

'RF single sender regressor tf idf + date_Pierre_1487205539.555113.csv'