In [1]:
# Import some useful python modules
import matplotlib as plt
import seaborn as sns
%matplotlib inline

import numpy as np
import pandas as pd
import os
import datetime
import sqlite3
from sqlalchemy import create_engine, text
import nltk

# change working directory into folder with data, view what is available
os.chdir(os.getcwd() + '\data')

# Parse data into Pandas DataFrames
# emails
emails = pd.read_table('emails.tsv', header=None)
emails.columns = ['email_id','timestamp','email_type','email_variant','member_id']
emails.email_variant = emails.email_variant.apply(lambda x: str(x).replace('\\N', 'NONE')) 
emails['email_variant_first_part'] = emails['email_variant'].apply(lambda x: x.split(':')[0] if not pd.isnull(x) else np.nan)
emails['email_variant_second_part'] = emails['email_variant'].apply(lambda x: x.split(':')[1] if not pd.isnull(x) and len(x.split(':')) > 1 else np.nan)
emails.member_id = emails.member_id.astype(str)

# email responses
email_responses = pd.read_table('email_responses.tsv', header=0)

# There are some issues with the members file (varying apparent row lengths, likely due to a separator issue) 
# so we will read that directly and try to identify where the issues are. 
# After the data is in a usable form and then parse the data into a dataframe manually
members_file = open('members.tsv','r')
members_list = []
members_list_raw = []
for line in members_file.readlines():
    members_list_raw.append(line)
    members_list.append(line.replace('\n','').split('\t'))

#print members_list[0]
#members_list_raw[205009]

# From investigating the above rows, we can see that the issue stems from additional tabs in the value fields
# we therefore need to replace the invalid dditional tab values so the data can be formatted correctly
clean_members_list = [
    [unicode(col.decode("ascii","ignore")) 
        for col in member.replace('\n','').replace('\\\t','').replace('\N','').replace("'","").replace('+','').split('\t')
    ] for member in members_list_raw
]

members = pd.DataFrame(data=clean_members_list[1:], 
                       columns=clean_members_list[0])

emails_and_responses = pd.merge(emails, email_responses, how='inner', on='email_id', suffixes=['_1', '_2'])
emails_and_responses['action'] = emails_and_responses['action'].fillna('non_open')

emails_and_members = pd.merge(emails_and_responses, members, how='inner', on='member_id', suffixes=['_1', '_2'])

In [2]:
emails_and_members.head()

Unnamed: 0,email_id,timestamp_1,email_type,email_variant,member_id,email_variant_first_part,email_variant_second_part,timestamp_2,action,date,email_domain,first_name,city,state,zip,degree_level,hs_or_ged_year,pcp_score,keyword
0,205570076,2012-09-01 00:10:08,Transactional JR Welcome Email,account_login_info_s2_v1,14802260,account_login_info_s2_v1,,2012-09-01 00:23:49,click,2012-09-01 01:02:03,yahoo.com,michael,SPARTA,TN,38583,Some HS,2013,0.10344,kroger jobs
1,205570123,2012-09-01 00:11:07,Transactional Forgot Password Email,,8450299,,,2012-09-01 00:12:36,open,2011-07-13 21:57:47,hotmail.com,Maria,Chula vista,CA,91911,Associate,1979,0.320591,Costco job
2,205570123,2012-09-01 00:11:07,Transactional Forgot Password Email,,8450299,,,2012-09-01 00:12:57,click,2011-07-13 21:57:47,hotmail.com,Maria,Chula vista,CA,91911,Associate,1979,0.320591,Costco job
3,205570320,2012-09-01 00:30:08,Transactional JR Welcome Email,account_login_info_s2_v1,14802278,account_login_info_s2_v1,,2012-09-01 00:30:52,open,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job
4,205570320,2012-09-01 00:30:08,Transactional JR Welcome Email,account_login_info_s2_v1,14802278,account_login_info_s2_v1,,2012-09-01 00:31:09,click,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job


In [5]:
emails_and_members.email_variant.unique()

array(['account_login_info_s2_v1', 'nan',
       'fixed_keyword_cloud_s1_v1:tplus1_age22+',
       'fixed_keyword_cloud_s1_v1:tplus2', 'job_alert_s1_v1:tplus4_age22+',
       'fixed_keyword_cloud_s1_v1:tplus5_1opened',
       'job_alert_s1_v1:tplus11_age22+', 'job_alert_s1_v1:tplus14_age22+',
       'job_alert_s1_v1:tplus16_age22+', 'job_alert_s1_v1:tplus18_age22+',
       'job_alert_s1_v1:tplus23_age22+', 'job_alert_s1_v1:tplus25_age22+',
       'fixed_keyword_cloud_s1_v1:tplus26_1opened',
       'fixed_keyword_cloud_s1_v1:tplus3',
       'job_alert_s1_v1:tplus5_1opened',
       'fixed_keyword_cloud_s1_v1:tplus10',
       'job_alert_s1_v1:tplus1_age22+', 'job_alert_s1_v1:tplus2_age22+',
       'fixed_keyword_cloud_s1_v1:tplus4',
       'job_alert_s1_v1:tplus26_1opened', 'job_alert_s1_v1:tplus7_age22+',
       'fixed_keyword_cloud_s1_v1:tplus7_age22+',
       'fixed_keyword_cloud_s1_v1:tplus23',
       'job_alert_s1_v1:tplus12_1opened',
       'fixed_keyword_cloud_s1_v1:tplus33_1opened

In [7]:
emails_and_members.email_variant_first_part.unique()

array(['account_login_info_s2_v1', 'nan', 'fixed_keyword_cloud_s1_v1',
       'job_alert_s1_v1', 'birthday_s1_v1_f2',
       'birthday_joblist1_s1_v1_f2', 'NONE'], dtype=object)

In [13]:
emails_and_members[emails_and_members.action=='click'].head()
#

Unnamed: 0,email_id,timestamp_1,email_type,email_variant,member_id,email_variant_first_part,email_variant_second_part,timestamp_2,action,date,email_domain,first_name,city,state,zip,degree_level,hs_or_ged_year,pcp_score,keyword
0,205570076,2012-09-01 00:10:08,Transactional JR Welcome Email,account_login_info_s2_v1,14802260,account_login_info_s2_v1,,2012-09-01 00:23:49,click,2012-09-01 01:02:03,yahoo.com,michael,SPARTA,TN,38583,Some HS,2013,0.10344,kroger jobs
2,205570123,2012-09-01 00:11:07,Transactional Forgot Password Email,,8450299,,,2012-09-01 00:12:57,click,2011-07-13 21:57:47,hotmail.com,Maria,Chula vista,CA,91911,Associate,1979,0.320591,Costco job
4,205570320,2012-09-01 00:30:08,Transactional JR Welcome Email,account_login_info_s2_v1,14802278,account_login_info_s2_v1,,2012-09-01 00:31:09,click,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job
6,206014887,2012-09-02 04:00:08,T plus N,fixed_keyword_cloud_s1_v1:tplus1_age22+,14802278,fixed_keyword_cloud_s1_v1,tplus1_age22+,2012-09-02 15:35:01,click,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job
8,206384628,2012-09-03 04:00:07,T plus N,fixed_keyword_cloud_s1_v1:tplus2,14802278,fixed_keyword_cloud_s1_v1,tplus2,2012-09-03 05:18:37,click,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job


In [14]:
emails_and_members[emails_and_members.email_id==206384628]

Unnamed: 0,email_id,timestamp_1,email_type,email_variant,member_id,email_variant_first_part,email_variant_second_part,timestamp_2,action,date,email_domain,first_name,city,state,zip,degree_level,hs_or_ged_year,pcp_score,keyword
7,206384628,2012-09-03 04:00:07,T plus N,fixed_keyword_cloud_s1_v1:tplus2,14802278,fixed_keyword_cloud_s1_v1,tplus2,2012-09-03 05:18:28,open,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job
8,206384628,2012-09-03 04:00:07,T plus N,fixed_keyword_cloud_s1_v1:tplus2,14802278,fixed_keyword_cloud_s1_v1,tplus2,2012-09-03 05:18:37,click,2012-09-01 01:24:50,yahoo.com,Frank,FORT LAUDERDALE,FL,33301,Some College,1981,0.174869,FedEx Job


In [15]:
email_responses.head()

Unnamed: 0,email_id,timestamp,action
0,205570123,2012-09-01 00:12:36,open
1,205570123,2012-09-01 00:12:57,click
2,205570076,2012-09-01 00:23:49,click
3,205570320,2012-09-01 00:30:52,open
4,205570320,2012-09-01 00:31:09,click


In [1]:
print(__doc__)

# Author: Mathieu Blondel <mathieu@mblondel.org>
#         Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
#         Balazs Kegl <balazs.kegl@gmail.com>
#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
# License: BSD Style.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cross_validation import train_test_split


n_samples = 50000
n_bins = 3  # use 3 bins for calibration_curve as we have 3 clusters here

# Generate 3 blobs with 2 classes where the second blob contains
# half positive samples and half negative samples. Probability in this
# blob is therefore 0.5.
centers = [(-5, -5), (0, 0), (5, 5)]
X, y = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0,
                  centers=centers, shuffle=False, random_state=42)

y[:n_samples // 2] = 0
y[n_samples // 2:] = 1
sample_weight = np.random.RandomState(42).rand(y.shape[0])

# split train, test for calibration
X_train, X_test, y_train, y_test, sw_train, sw_test = \
    train_test_split(X, y, sample_weight, test_size=0.9, random_state=42)

# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
prob_pos_clf = clf.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with isotonic calibration
clf_isotonic = CalibratedClassifierCV(clf, cv=2, method='isotonic')
clf_isotonic.fit(X_train, y_train, sw_train)
prob_pos_isotonic = clf_isotonic.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with sigmoid calibration
clf_sigmoid = CalibratedClassifierCV(clf, cv=2, method='sigmoid')
clf_sigmoid.fit(X_train, y_train, sw_train)
prob_pos_sigmoid = clf_sigmoid.predict_proba(X_test)[:, 1]

print("Brier scores: (the smaller the better)")

clf_score = brier_score_loss(y_test, prob_pos_clf, sw_test)
print("No calibration: %1.3f" % clf_score)

clf_isotonic_score = brier_score_loss(y_test, prob_pos_isotonic, sw_test)
print("With isotonic calibration: %1.3f" % clf_isotonic_score)

clf_sigmoid_score = brier_score_loss(y_test, prob_pos_sigmoid, sw_test)
print("With sigmoid calibration: %1.3f" % clf_sigmoid_score)

###############################################################################
# Plot the data and the predicted probabilities
plt.figure()
y_unique = np.unique(y)
colors = cm.rainbow(np.linspace(0.0, 1.0, y_unique.size))
for this_y, color in zip(y_unique, colors):
    this_X = X_train[y_train == this_y]
    this_sw = sw_train[y_train == this_y]
    plt.scatter(this_X[:, 0], this_X[:, 1], s=this_sw * 50, c=color, alpha=0.5,
                label="Class %s" % this_y)
plt.legend(loc="best")
plt.title("Data")

plt.figure()
order = np.lexsort((prob_pos_clf, ))
plt.plot(prob_pos_clf[order], 'r', label='No calibration (%1.3f)' % clf_score)
plt.plot(prob_pos_isotonic[order], 'g', linewidth=3,
         label='Isotonic calibration (%1.3f)' % clf_isotonic_score)
plt.plot(prob_pos_sigmoid[order], 'b', linewidth=3,
         label='Sigmoid calibration (%1.3f)' % clf_sigmoid_score)
plt.plot(np.linspace(0, y_test.size, 51)[1::2],
         y_test[order].reshape(25, -1).mean(1),
         'k', linewidth=3, label=r'Empirical')
plt.ylim([-0.05, 1.05])
plt.xlabel("Instances sorted according to predicted probability "
           "(uncalibrated GNB)")
plt.ylabel("P(y=1)")
plt.legend(loc="upper left")
plt.title("Gaussian naive Bayes probabilities")

plt.show()

Automatically created module for IPython interactive environment
Brier scores: (the smaller the better)
No calibration: 0.104
With isotonic calibration: 0.085
With sigmoid calibration: 0.109


  " itself." % estimator_name)


In [2]:
#sample_weight = np.random.RandomState(42).rand(y.shape[0])
sample_weight

array([ 0.37454012,  0.95071431,  0.73199394, ...,  0.74899508,
        0.52101091,  0.86170671])