In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os.path as op

import matplotlib.pyplot as plt
from qgrid import show_grid
import pandas as pd
import numpy as np

from tools.data_cleansing import remove_numbers_and_ponctuation, remove_stopwords, remove_non_english_words, get_clean_df_train, get_clean_df_test
from tools.data_exploration import print_email_by_id, emails_sent_distribution, emails_received_distribution,\
body_length_distribution, number_of_recipients_distribution
from tools.data_handling import enrich_emails, load_email_senders, unique_recipients, address_book
from tools.features import bag_of_words, bag_of_emails
from tools.evaluation import top_emails, evaluate

# Training & Testing data

Train / Test split

In [2]:
# Single switch forwarded to every loader in this cell.
# NOTE(review): presumably False reuses the CSV files cached under data/
# (the recorded output logs "Reading dataframe from data/...") - confirm in tools.
overwrite = False

df_emails = enrich_emails(overwrite=overwrite)

# Missing values become empty strings as soon as each frame is loaded; the
# "body" and "recipients" columns are handed to the vectorizers further down.
df_train = get_clean_df_train(ratio=0.9, overwrite=overwrite).fillna("")
df_test = get_clean_df_test(overwrite=overwrite).fillna("")

Reading dataframe from data/enrich_emails.csv
Removing numbers and punctuation
Removing stopwords
Removing non english words
Reading dataframe from data/enrich_emails.csv
Reading dataframe from data/get_clean_df_train.csv
Removing numbers and punctuation
Removing stopwords
Removing non english words


Vectorize

In [9]:
# Inputs: bag_of_words hands back the training matrix together with the
# vectorizer, which is then reused as-is to transform the test bodies.
# Both results are sparse matrices and are converted to numpy arrays here.
# NOTE(review): the recorded output reports 39251 training samples and 25591
# features, so the dense X_train holds roughly 1e9 entries - confirm that this
# fits in memory on the target machine.
X_train, bow_vectorizer = bag_of_words(df_train["body"], max_features=None)
X_test = bow_vectorizer.transform(df_test["body"]).toarray()
X_train = X_train.toarray()

# Targets: built from the "recipients" column and the recipients returned by
# unique_recipients(df_train), then converted from sparse to a numpy array.
known_recipients = unique_recipients(df_train)
Y_train, boe_vectorizer = bag_of_emails(df_train["recipients"], known_recipients, max_features=None)
Y_train = Y_train.toarray()

In [10]:
# Unpack both 2-D shapes in one go: rows/columns of the training targets and
# rows/columns of the test inputs.
(n_train_samples, n_outputs), (n_test_samples, n_features) = Y_train.shape, X_test.shape

# Two-line summary of the problem size.
shape_summary = "Train samples: {} - Test samples: {}\nFeatures: {} - Outputs: {}"
print(shape_summary.format(n_train_samples, n_test_samples, n_features, n_outputs))

Train samples: 39251 - Test samples: 4362
Features: 25591 - Outputs: 9530


# Prediction

## Linear regression
Note: fitting takes too long on the whole dataset with all features.

In [7]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
# Y_train is 2-D, i.e. the model is fitted against every output column at once.
model = LinearRegression().fit(X_train, Y_train)

# Predicted values for the test matrix, used by the evaluation cell below.
Y_pred = model.predict(X_test)

In [None]:
# Labels attached to the columns of the prediction matrix, taken from the
# vectorizer returned by bag_of_emails.
# NOTE(review): assumes boe_vectorizer is a scikit-learn vectorizer - confirm
# in tools.features. Recent scikit-learn releases expose
# get_feature_names_out() and no longer provide get_feature_names(), so use
# whichever accessor the object actually has.
if hasattr(boe_vectorizer, "get_feature_names_out"):
    feature_names = boe_vectorizer.get_feature_names_out()
else:
    feature_names = boe_vectorizer.get_feature_names()
recipients_map = np.array(feature_names)

predictions = top_emails(Y_pred, recipients_map)

# Split each whitespace-separated recipients string into one column per
# recipient. DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
# 1.0; .values returns the same ndarray and is available in every pandas version.
ground_truth = df_test["recipients"].str.split(expand=True).values
evaluate(predictions, ground_truth)