In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os.path as op

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tools.data_handling import enrich_emails, unique_recipients, address_book
from tools.data_cleansing import clean
from tools.features import split_tokenizer, stem_tokenizer, lemmatize_tokenizer, VectorizerManager, FastCountVectorizer, GoWVectorizer
from tools.evaluation import top_emails, evaluate
from tools.training import data_generator, EvaluateAndSave

# Process data

In [None]:
# Load the enriched e-mails; `overwrite` is forwarded as-is to `enrich_emails`.
overwrite = False
df_emails = enrich_emails(overwrite=overwrite)

# Work on the leading 20% of the rows only (positional slice, original order kept).
df_emails = df_emails.iloc[:int(len(df_emails) * 0.2)]

## Train / Test split

In [None]:
# Random train/test split with a fixed seed: `ratio` of the rows are sampled
# for training and every remaining index label goes to the test frame.
# NOTE(review): dropping by label assumes the index labels are unique — confirm.
ratio = 0.9
n_train = int(len(df_emails) * ratio)
df_train = df_emails.sample(n=n_train, random_state=0)
df_test = df_emails.drop(labels=df_train.index, axis=0)

## Clean & Tokenize

In [None]:
# Recipients are collected from the training frame only.
recipients = unique_recipients(df_train)

# `names` and `excepted_words` are two names for the same collection; "fyi" is
# added to it on top of what `address_book` returns.
excepted_words = names = address_book(recipients)
excepted_words.add("fyi")

Clean

In [None]:
# TODO: all_recipients += df_emails["sender"].str.cat(sep=" ").split()

In [None]:
# NOTE(review): `save_and_reload_df` is not imported in the cells visible here —
# confirm it is in scope before this cell runs on a fresh kernel.
@save_and_reload_df
def clean_df_train(df, excepted_words):
    """Return the training frame with an added ``clean_body`` column.

    Each value of ``df["body"]`` is passed through ``clean`` together with
    ``excepted_words`` and ``only_english=False``.

    The input frame is no longer modified in place: a new DataFrame is
    returned, so the result does not depend on whether the caller's object was
    already touched by an earlier run of this cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``"body"`` column.
    excepted_words : collection of str
        Forwarded unchanged as the second argument of ``clean``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``clean_body`` column added (or replaced).

    Notes
    -----
    Callers also pass ``overwrite=...``; that keyword is not part of this
    signature, so it is handled by the ``save_and_reload_df`` decorator.
    NOTE(review): this body is identical to ``clean_df_test``; presumably the
    two are kept apart so the decorator stores each result under its own
    function name — confirm against the decorator.
    """
    def _clean_body(text):
        return clean(text, excepted_words, only_english=False)

    # `assign` builds a new frame instead of writing into the caller's object.
    return df.assign(clean_body=df["body"].apply(_clean_body))


# NOTE(review): `save_and_reload_df` is not imported in the cells visible here —
# confirm it is in scope before this cell runs on a fresh kernel.
@save_and_reload_df
def clean_df_test(df, excepted_words):
    """Return the test frame with an added ``clean_body`` column.

    Each value of ``df["body"]`` is passed through ``clean`` together with
    ``excepted_words`` and ``only_english=False``.

    The input frame is no longer modified in place: a new DataFrame is
    returned, so the result does not depend on whether the caller's object was
    already touched by an earlier run of this cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``"body"`` column.
    excepted_words : collection of str
        Forwarded unchanged as the second argument of ``clean``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``clean_body`` column added (or replaced).

    Notes
    -----
    Callers also pass ``overwrite=...``; that keyword is not part of this
    signature, so it is handled by the ``save_and_reload_df`` decorator.
    NOTE(review): this body is identical to ``clean_df_train``; presumably the
    two are kept apart so the decorator stores each result under its own
    function name — confirm against the decorator.
    """
    def _clean_body(text):
        return clean(text, excepted_words, only_english=False)

    # `assign` builds a new frame instead of writing into the caller's object.
    return df.assign(clean_body=df["body"].apply(_clean_body))

# Run the cleaning step on both splits and rebind the frames to the results.
# `overwrite` is not a parameter of `clean_df_train` / `clean_df_test`, so it is
# consumed by their `save_and_reload_df` decorator.
# NOTE(review): presumably True forces recomputation instead of reloading a
# saved result — confirm against the decorator. This also rebinds the
# `overwrite` variable that the data-loading cell set to False.
overwrite = True
df_train = clean_df_train(df_train, excepted_words, overwrite=overwrite)
df_test = clean_df_test(df_test, excepted_words, overwrite=overwrite)

Tokenize

In [None]:
# TODO: Try using only the address-book names as input
# TODO: Add the sender as an input
# TODO: Inspect which words carry the largest weights

In [None]:
# One vectorizer per e-mail field; the recipient vectorizer is restricted to the
# recipients collected from the training frame.
# NOTE(review): scikit-learn's CountVectorizer lowercases and splits on
# punctuation by default; if `recipients` holds full e-mail addresses they would
# not match the fixed vocabulary unless VectorizerManager tokenizes differently
# — confirm.
sender_vectorizer, body_vectorizer = FastCountVectorizer(), FastCountVectorizer()
recipients_vectorizer = CountVectorizer(vocabulary=recipients)
vm = VectorizerManager(sender_vectorizer, body_vectorizer, recipients_vectorizer)

# Fit each vectorizer on its own column of the training frame.
vm.fit_sender(df_train["sender"])
vm.fit_body(df_train["clean_body"])
vm.fit_recipients(df_train["recipients"])

# Report the resulting input / output dimensions.
print("Features: {}, Outputs: {}".format(vm.n_features, vm.n_outputs))

# Prediction

## Neural Network

In [None]:
# Keras / TensorFlow names used by the model cells below.
from keras.layers import Input, Dense
from keras.models import Model
from keras.backend.tensorflow_backend import set_session, clear_session
import tensorflow as tf
# Reset the Keras session and the TensorFlow default graph before the model is
# (re)built, so re-running the cells below starts from a clean backend state.
# NOTE(review): `keras.backend.tensorflow_backend` and `tf.reset_default_graph`
# are TensorFlow-1-era entry points — confirm against the installed versions.
clear_session()
tf.reset_default_graph()

In [None]:
# Set GPU memory usage
#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 1
#set_session(tf.Session(config=config))

In [None]:
# Training-loop sizes: each epoch consumes 8 batches of `batch_size` samples.
batch_size = 8
samples_per_epoch = 8 * batch_size
nb_epoch = 20

# Held-out inputs: body features first, then sender features, joined column-wise.
# NOTE(review): `data_generator` presumably builds training inputs in the same
# column order — confirm.
X_test_body = vm.vectorize_body(df_test["clean_body"])
X_test_sender = vm.vectorize_sender(df_test["sender"])
X_test = np.concatenate((X_test_body, X_test_sender), axis=1)

# One row per test e-mail, one column per whitespace-separated recipient
# (shorter rows are padded with missing values by `expand=True`).
# `.values` is the replacement for the deprecated `DataFrame.as_matrix()` and
# yields the same ndarray.
ground_truth = df_test["recipients"].str.split(expand=True).values
# Maps an output column index back to the recipient it stands for.
recipients_map = np.array(vm.recipients_vectorizer.get_feature_names())

# Create callbacks
callbacks = []
# NOTE(review): `filepath` is not passed to anything in this cell — confirm
# whether `EvaluateAndSave` is expected to receive it.
filepath = "models/weights_{epoch:02d}.hdf5"
evaluate_and_save = EvaluateAndSave(X_test, recipients_map, ground_truth,
                                    batch_size=batch_size)
callbacks.append(evaluate_and_save)

# Batches for training are produced from the training frame by `data_generator`.
generator = data_generator(df_train, vm=vm, batch_size=batch_size)

In [None]:
# Feed-forward network: two ReLU hidden layers sized as fractions of the number
# of outputs (1/8, then 1/4), followed by a softmax layer with one unit per output.
inputs = Input(shape=(vm.n_features,))
x = Dense(vm.n_outputs // 8, activation='relu')(inputs)
x = Dense(vm.n_outputs // 4, activation='relu')(x)
# TODO: Change softmax
predictions = Dense(vm.n_outputs, activation='softmax')(x)

model = Model(input=inputs, output=predictions)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# Train from the batch generator; the callbacks list is passed through unchanged.
model.fit_generator(
    generator,
    nb_epoch=nb_epoch,
    samples_per_epoch=samples_per_epoch,
    callbacks=callbacks,
)