In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os.path as op

import matplotlib.pyplot as plt
from qgrid import show_grid
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tools.data_handling import enrich_emails, unique_recipients, address_book
from tools.data_cleansing import clean
from tools.features import split_tokenizer, stem_tokenizer, lemmatize_tokenizer, Vectorizer
from tools.evaluation import top_emails, evaluate
from tools.training import data_generator, EvaluateAndSave

# Process data

In [None]:
# Load the enriched e-mail table and keep only its first 20% of rows.
# `overwrite` is forwarded unchanged to enrich_emails
# (presumably it forces cached data to be rebuilt -- TODO confirm in tools.data_handling).
overwrite = False
df_emails = enrich_emails(overwrite=overwrite)
df_emails = df_emails.iloc[:int(len(df_emails) * 0.2)]

## Train / Test split

In [None]:
# Random train/test split: `ratio` of the rows go to training, the rest to test.
ratio = 0.9
n_train = int(ratio * len(df_emails))
# A fixed random_state makes the sampled training set reproducible across runs.
df_train = df_emails.sample(n=n_train, random_state=0)
# The test set is every row whose index label was not drawn for training.
df_test = df_emails.drop(labels=df_train.index, axis=0)

## Clean & Tokenize

In [None]:
# Build the collection of words handed to clean() as `excepted_words`:
# the address-book names derived from the training recipients, plus "fyi".
recipients = unique_recipients(df_train)
# `names` and `excepted_words` deliberately refer to the same object.
excepted_words = names = address_book(recipients)
excepted_words.add("fyi")

Clean

In [None]:
# TODO: all_recipients += df_emails["sender"].str.cat(sep=" ").split()

In [None]:
# Add a "clean_body" column to both splits, using identical cleaning settings
# so that train and test text go through the same processing.
for df_split in (df_train, df_test):
    df_split["clean_body"] = clean(df_split["body"], excepted_words, only_english=True)

Tokenize

In [None]:
# TODO: Try using only the names from the address book as input
# TODO: Add the sender as an additional input
# TODO: Inspect which words the largest weights correspond to

In [None]:
# Fit the vectorizer on the training split only: the input side on the cleaned
# bodies, the output side on the recipients column.
train_bodies = df_train["clean_body"]
train_recipients = df_train["recipients"]

v = Vectorizer(recipients)
v.fit_input(train_bodies)
v.fit_output(train_recipients)

# Prediction

## Neural Network

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

In [None]:
# Training hyper-parameters (also read by the callback and training cells).
batch_size = 32
nb_epoch = 20
samples_per_epoch = batch_size * 32  # i.e. 32 batches per epoch

# Network: the input vector feeds a single dense softmax layer with
# v.n_outputs units; there is no hidden layer at the moment.
inputs = x = Input(shape=(v.n_features,))
# Disabled hidden layer, kept for reference:
#x = Dense(n_features//2, activation='relu')(x)
# TODO: Change the softmax activation
output_layer = Dense(v.n_outputs, activation='softmax')
predictions = output_layer(x)

# `input=` / `output=` are the keyword names this notebook's Keras version uses.
model = Model(input=inputs, output=predictions)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
# Prepare the evaluation callback and the training batch generator.

# Vectorized test inputs, produced by the vectorizer fitted on the training split.
X_test = v.vectorize_input(df_test["clean_body"])
# One row per test e-mail, one whitespace-separated recipient per column
# (shorter rows are padded by pandas).
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
# and removed in 1.0; `.values` returns the same array on every pandas version.
ground_truth = df_test["recipients"].str.split(expand=True).values
# Array of output feature names, so an output index can be mapped back to a name.
recipients_map = np.array(v.output_bow.get_feature_names())

callbacks = []
# NOTE(review): `filepath` is not passed to anything in this cell -- presumably it
# was meant for the checkpointing done by EvaluateAndSave; confirm or remove.
filepath = "models/weights_{epoch:02d}.hdf5"
evaluate_and_save = EvaluateAndSave(X_test, recipients_map, ground_truth,
                                    batch_size=batch_size)
callbacks.append(evaluate_and_save)

# Generator of training batches built from the training split and the fitted vectorizer.
generator = data_generator(df_train, vectorizer=v, batch_size=batch_size)

In [None]:
# Train the model from the batch generator; the callbacks list built earlier
# is handed to Keras for the duration of training.
model.fit_generator(
    generator,
    samples_per_epoch=samples_per_epoch,  # samples drawn from the generator per epoch
    nb_epoch=nb_epoch,                    # total number of epochs
    callbacks=callbacks,
    nb_worker=1,
)