In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os.path as op

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tools.utils import save_and_reload_df
from tools.data_handling import enrich_emails_train, enrich_emails_test, unique_recipients, address_book
from tools.data_cleansing import clean, remove_non_emails
from tools.features import split_tokenizer, stem_tokenizer, lemmatize_tokenizer, VectorizerManager, FastCountVectorizer, GoWVectorizer
from tools.evaluation import top_emails, evaluate, get_precision
from tools.training import data_generator, EvaluateAndSave
from tools.submission import save_submission

# Process data

In [None]:
overwrite = False
df_emails = enrich_emails_train(overwrite=overwrite)
# Keep the leading rows of the frame; with a factor of 1 every row is kept.
df_emails = df_emails.head(int(len(df_emails) * 1))
# Drop recipients that are not e-mail addresses (they do not contain "@").
df_emails["recipients"] = df_emails["recipients"].apply(remove_non_emails)

## Train / Test split

In [None]:
ratio = 0.9
# Number of training rows, truncated to an integer.
n_train = int(len(df_emails) * ratio)
# A fixed random_state keeps the sampled training rows the same on every run.
df_train = df_emails.sample(n=n_train, random_state=0)
# The test split is whatever remains once the sampled index labels are dropped.
df_test = df_emails.drop(df_train.index)

## Clean & Tokenize

In [None]:
recipients = unique_recipients(df_train)
# `names` and `excepted_words` are two names for the same object, so the
# extra word added below is visible through both.
excepted_words = names = address_book(recipients)
excepted_words.add("fyi")

Clean

In [None]:
@save_and_reload_df
def clean_df_train(df, excepted_words):
    """Add a ``clean_body`` column to the training frame.

    Every entry of ``df["body"]`` is passed to ``clean`` together with
    ``excepted_words`` and ``only_english=False``. The new column is written
    onto ``df`` itself, and that same frame is returned.
    """
    def _clean_one(body):
        return clean(body, excepted_words, only_english=False)

    df["clean_body"] = df["body"].apply(_clean_one)
    return df


@save_and_reload_df
def clean_df_test(df, excepted_words):
    """Add a ``clean_body`` column to the held-out frame.

    Every entry of ``df["body"]`` is passed to ``clean`` together with
    ``excepted_words`` and ``only_english=False``. The new column is written
    onto ``df`` itself, and that same frame is returned.
    """
    def _clean_one(body):
        return clean(body, excepted_words, only_english=False)

    df["clean_body"] = df["body"].apply(_clean_one)
    return df


# `overwrite` is forwarded to the @save_and_reload_df-wrapped helpers; it
# presumably forces recomputation instead of reusing a saved result --
# TODO confirm in tools.utils.
overwrite = False
df_train = clean_df_train(df_train, excepted_words, overwrite=overwrite)
df_test = clean_df_test(df_test, excepted_words, overwrite=overwrite)

Tokenize

In [None]:
# TODO: Look at which words the largest weights correspond to

In [None]:
# One vectorizer per field (sender, body, recipients), coordinated through a
# single VectorizerManager.
sender_vectorizer = FastCountVectorizer()
# NOTE(review): "GoW" presumably stands for graph-of-words -- confirm in tools.features.
body_vectorizer = GoWVectorizer(min_df=15, max_features=1000000)
# The recipient vocabulary is fixed up front to the recipients collected from df_train.
recipients_vectorizer = FastCountVectorizer(vocabulary=recipients)
vm = VectorizerManager(sender_vectorizer, body_vectorizer, recipients_vectorizer)
# All three vectorizers are fitted on the training split only.
vm.fit_sender(df_train["sender"])
vm.fit_body(df_train["clean_body"])
vm.fit_recipients(df_train["recipients"])
# Report the resulting problem dimensions and split sizes.
print("Features: {}, Outputs: {}".format(vm.n_features, vm.n_outputs))
print("Train samples: {}, Test samples: {}".format(df_train.shape[0], df_test.shape[0]))

# Prediction

## Neural Network

In [None]:
from keras.layers import Input, Dense, Dropout, merge
from keras.models import Model
from keras.optimizers import RMSprop
from keras.backend.tensorflow_backend import set_session, clear_session
import tensorflow as tf
# Presumably meant to give re-runs of the cells below a clean Keras/TensorFlow state.
# NOTE(review): clear_session() may already reset the default graph, which would
# make the explicit reset redundant -- confirm for the installed Keras version.
clear_session()
tf.reset_default_graph()

In [None]:
# Set GPU memory usage
# Limit this process to a 0.45 fraction of GPU memory, then hand the configured
# session to Keras.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.45
set_session(tf.Session(config=config))

In [None]:
batch_size = 32
# Expressed as a number of samples: 2048 times the batch size.
samples_per_epoch = 2048 * batch_size
nb_epoch = 100

# Create callbacks
X_test = vm.vectorize_x(df_test)
# One column per whitespace-separated recipient. `.values` is used instead of
# `DataFrame.as_matrix()`, which is deprecated and removed in pandas 1.0; both
# return the same NumPy array.
ground_truth = df_test["recipients"].str.split(expand=True).values
# Array of the recipient vectorizer's feature names, passed to the callback
# below together with the ground truth.
recipients_map = np.array(vm.recipients_vectorizer.get_feature_names())

callbacks = []
# NOTE(review): `filepath` is not used by any cell visible here (it is not
# passed to EvaluateAndSave) -- confirm whether it is still needed.
filepath = "models/weights_{epoch:02d}.hdf5"
evaluate_and_save = EvaluateAndSave(X_test, recipients_map, ground_truth,
                                    batch_size=batch_size)
callbacks.append(evaluate_and_save)

# Generator over the training frame, later consumed by model.fit_generator.
generator = data_generator(df_train, vm=vm, batch_size=batch_size)

In [None]:
# Two hidden layers of width n_outputs // 2, each followed by dropout. The
# "sum" merge adds the first hidden layer's (dropped-out) output to the
# second's, i.e. a skip connection around the second layer.
inputs = Input(shape=(vm.n_features,))
dense1 = Dense(vm.n_outputs//2, activation='relu')(inputs)
dense1 = Dropout(0.2)(dense1)
dense2 = Dense(vm.n_outputs//2, activation='relu')(dense1)
dense2 = Dropout(0.2)(dense2)
dense2 = merge([dense1, dense2], mode="sum")
# One sigmoid unit per output.
predictions = Dense(vm.n_outputs, activation='sigmoid')(dense2)

optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.000001)
model = Model(input=inputs, output=predictions)
model.compile(optimizer=optimizer,
              loss="mse")
# Warm-start from a previously saved checkpoint; the file must already exist.
model.load_weights("models/nnet_0.45.hdf5")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Train from the batch generator; the callbacks list carries the
# EvaluateAndSave callback created earlier.
model.fit_generator(
    generator,
    nb_epoch=nb_epoch,
    samples_per_epoch=samples_per_epoch,
    callbacks=callbacks,
)

## Test

In [None]:
# Score the held-out split with one specific saved checkpoint (hard-coded path).
model.load_weights("models/nnet_1488906783/nnet_0.427.hdf5")
Y_test = model.predict(X_test)
# NOTE: from here on `predictions` refers to the output of top_emails, no
# longer to the output layer defined when the model was built.
predictions = top_emails(Y_test, recipients_map)
precision = evaluate(predictions, ground_truth)
print("*** Precision: {prec:.3f} ***".format(prec=precision))

In [None]:
def shuffle_last(predictions, k_last=1, random_state=None):
    """Return a copy of ``predictions`` with its last ``k_last`` columns shuffled.

    Each of the last ``k_last`` columns is permuted independently along the
    row axis; all other columns are left as they are. The input array is not
    modified.

    Parameters
    ----------
    predictions : numpy.ndarray
        Two-dimensional array (it is indexed as ``[:, column]``).
    k_last : int, default 1
        Number of trailing columns to shuffle. Values <= 0 shuffle nothing.
    random_state : None, int or numpy.random.RandomState, default None
        Source of randomness. ``None`` keeps the historical behaviour of
        drawing from NumPy's global RNG. An int seeds a private generator and
        an existing ``RandomState`` is used as is; neither touches the global
        RNG, which makes the result reproducible.

    Returns
    -------
    numpy.ndarray
        The shuffled copy.

    Raises
    ------
    IndexError
        If ``k_last`` exceeds the number of columns.
    """
    if random_state is None:
        # The numpy.random module exposes shuffle() bound to the global RNG.
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    shuffled = predictions.copy()
    for offset in range(1, k_last + 1):
        # The column slice is a view, so the in-place shuffle writes into `shuffled`.
        rng.shuffle(shuffled[:, -offset])
    return shuffled

# Re-evaluate after shuffling the last 4 prediction columns. No seed is set in
# the visible cells, so this number can differ from run to run.
precision = evaluate(shuffle_last(predictions, 4), ground_truth)
print("*** Precision: {prec:.3f} ***".format(prec=precision))

# Submission

In [None]:
@save_and_reload_df
def clean_df_submission(df, excepted_words):
    """Add a ``clean_body`` column to the submission frame.

    Every entry of ``df["body"]`` is passed to ``clean`` together with
    ``excepted_words`` and ``only_english=False``. The new column is written
    onto ``df`` itself, and that same frame is returned.
    """
    def _clean_one(body):
        return clean(body, excepted_words, only_english=False)

    df["clean_body"] = df["body"].apply(_clean_one)
    return df

# `overwrite` and `excepted_words` are the globals defined in the cleaning
# cells earlier in the notebook.
df_submission = enrich_emails_test()
df_submission = clean_df_submission(df_submission, excepted_words, overwrite=overwrite)
# `vm` is the manager whose vectorizers were fitted on df_train.
X_submission = vm.vectorize_x(df_submission)

In [None]:
Y_submission = model.predict(X_submission)
# Run the model output through top_emails, then shuffle the last 4 columns of
# the result, mirroring the shuffled evaluation on the held-out split.
predictions = shuffle_last(top_emails(Y_submission, recipients_map), 4)

In [None]:
# Store each prediction row as a single space-separated string of recipients.
df_submission["recipients"] = [" ".join(row) for row in predictions]

save_submission(df_submission, algo="neural net s", member="loulou")