In [1]:
import ast
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow_hub as hub
import tensorflow_text as text
# import tensorflow as tf
from tqdm import tqdm  # Optional: progress bar
import gdown

In [4]:
# Labelled email dataset, shared on Google Drive:
# https://drive.google.com/file/d/1h6DlFxmt9ztvz6s6O6ycw7-BQzY5nzRR/view?usp=sharing
LABELLED_EMAILS_FILE_ID = '1h6DlFxmt9ztvz6s6O6ycw7-BQzY5nzRR'
LABELLED_EMAILS_CSV = '04_encoded_labelled_emails.csv'

# Only hit Google Drive when there is no local copy yet, so a
# "Restart Kernel -> Run All" neither re-downloads the file every time
# nor fails when the machine is offline. Delete the local CSV to force a refresh.
if not os.path.exists(LABELLED_EMAILS_CSV):
    gdown.download(id=LABELLED_EMAILS_FILE_ID, output=LABELLED_EMAILS_CSV, quiet=False)

labelled_emails_df = pd.read_csv(LABELLED_EMAILS_CSV)
labelled_emails_df.head()

Downloading...
From: https://drive.google.com/uc?id=1h6DlFxmt9ztvz6s6O6ycw7-BQzY5nzRR
To: /Users/pkchoy/code/data_science_bootcamp/predictor-with-files/notebooks/04_encoded_labelled_emails.csv
100%|██████████| 5.98M/5.98M [00:00<00:00, 17.3MB/s]


Unnamed: 0,Date,From,To,Subject,X_From,X_To,Message,Entire-Message,DISC_rule,DISC_manual,DISC_final,D,I,S,C
0,"Fri, 25 Aug 2000 03:30:00 -0700 (PDT)",phillip.allen@enron.com,brad.mcsherry@enron.com,"Cc: [EMAIL], [EMAIL]",Phillip K Allen,Brad McSherry,"Brad,\nWith regard to Tori Kuykendall, I would...","Cc: [EMAIL], [EMAIL] Brad,\nWith regard to Tor...",['S'],[],['S'],0,0,1,0
1,"Mon, 10 Jul 2000 06:54:00 -0700 (PDT)",phillip.allen@enron.com,al.pollard@enron.com,Re: Katy flatlands,Phillip K Allen,Al Pollard,"Al,\nI am not in good enough shape to ride a c...","Re: Katy flatlands Al,\nI am not in good enoug...",['D'],[],['D'],1,0,0,0
2,"Tue, 24 Apr 2001 13:05:00 -0700 (PDT)",phillip.allen@enron.com,jsmith@austintx.com,Re: The Stage,Phillip K Allen,Jeff Smith <jsmith@austintx.com>,I just spoke to the insurance company. They ar...,Re: The Stage I just spoke to the insurance co...,['C'],[],['C'],0,0,0,1
3,"Mon, 5 Mar 2001 07:21:00 -0800 (PST)",phillip.allen@enron.com,cbpres@austin.rr.com,"Cc: [EMAIL], [EMAIL]",Phillip K Allen,cbpres@austin.rr.com,"George,\nI am back in the office and ready to ...","Cc: [EMAIL], [EMAIL] George,\nI am back in the...",['I'],[],['I'],0,1,0,0
4,"Wed, 13 Dec 2000 06:35:00 -0800 (PST)",critical.notice@enron.com,"ywang@enron.com, patti.sullivan@enron.com, phi...",New Notice from Transwestern Pipeline Co.,critical.notice@Enron.com,"ywang@Enron.com, Patti.Sullivan@Enron.com, Phi...",Transwestern Pipeline Co. posted new notice(s)...,New Notice from Transwestern Pipeline Co. Tran...,['C'],[],['C'],0,0,0,1


In [5]:
# Per-column count of missing values in the labelled emails.
labelled_emails_df.isna().sum()

Date              0
From              0
To                0
Subject           0
X_From            0
X_To              0
Message           0
Entire-Message    0
DISC_rule         0
DISC_manual       0
DISC_final        0
D                 0
I                 0
S                 0
C                 0
dtype: int64

In [6]:
# `DISC_final` comes back from the CSV as the *string* repr of a list
# (e.g. "['S']"), not as a list. Fed straight into MultiLabelBinarizer, each
# string is iterated character by character, and the expected matrix only falls
# out because the bracket/quote characters are dropped as unknown classes.
# Parse the strings into real lists first so labels are matched as whole tokens.
disc_labels = labelled_emails_df['DISC_final'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Passing `classes` pins the column order of Y to D, I, S, C exactly as listed
# (it is not re-sorted), so one fit_transform call is all that is needed.
mlb = MultiLabelBinarizer(classes=['D', 'I', 'S', 'C'])
Y = mlb.fit_transform(disc_labels)
print(Y)

[[0 0 1 0]
 [1 0 0 0]
 [0 0 0 1]
 ...
 [0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]]




## Split

In [7]:
# Hold out 20% of the labelled emails for evaluation; the fixed random_state
# keeps the split identical from run to run.
X_train_text, X_test_text, Y_train, Y_test = train_test_split(
    labelled_emails_df['Entire-Message'],
    Y,
    test_size=0.2,
    random_state=32,
)

# Quick look at the label matrices and their shapes after the split.
print('Y_train: ', Y_train)
print('Y_test: ', Y_test)
print('shape: ', Y_train.shape, Y_test.shape)

Y_train:  [[0 0 0 1]
 [0 0 0 1]
 [0 1 0 0]
 ...
 [0 0 1 0]
 [1 0 0 0]
 [0 1 0 0]]
Y_test:  [[0 0 1 0]
 [0 1 0 0]
 [0 1 0 0]
 ...
 [0 1 0 0]
 [0 0 0 1]
 [1 0 0 0]]
shape:  (1910, 4) (478, 4)


In [8]:
# np.save does not create missing parent directories, so make sure the output
# folder exists before persisting the label matrices.
os.makedirs('../model_data', exist_ok=True)
np.save('../model_data/Y_train.npy', Y_train)
np.save('../model_data/Y_test.npy', Y_test)

## Feature Extraction: Generate BERT Embeddings from Entire-Message of labelled samples

In [9]:
# Load the BERT preprocessing and encoder layers from the local copies under
# ../tfhub_modules, with trainable=False so their weights are not updated.
# Remote equivalents, if the local copies are unavailable:
#   https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
#   https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3
bert_preprocess, bert_encoder = (
    hub.KerasLayer(module_path, trainable=False)
    for module_path in (
        "../tfhub_modules/bert_preprocess",
        "../tfhub_modules/bert_encoder",
    )
)

In [10]:
# NOTE: this entire cell is disabled (every line is commented out), so no
# embeddings are generated or saved when the notebook runs.
# It holds two drafts of `get_bert_embeddings`: a single-pass version and a
# batched version with a tqdm progress bar; the second would shadow the first.
# Both call `tf.constant`, but `import tensorflow as tf` is commented out in
# the import cell -- re-enable that import before uncommenting anything here.
# When enabled, the cell writes X_train_bert.npy / X_test_bert.npy to ../model_data.

# def get_bert_embeddings(texts):
#     text_inputs = tf.constant(texts)
#     tokenized_text = bert_preprocess(text_inputs)
#     outputs = bert_encoder(tokenized_text)
#     print(outputs.keys())
#     return outputs['pooled_output'].numpy()

# def get_bert_embeddings(texts, batch_size=96):
#     all_embeddings = []

#     for i in tqdm(range(0, len(texts), batch_size)):
#         batch = texts[i:i + batch_size]
#         batch_inputs = tf.constant(batch)
#         tokenized = bert_preprocess(batch_inputs)
#         outputs = bert_encoder(tokenized)
#         pooled = outputs['pooled_output'].numpy()
#         all_embeddings.append(pooled)

#     return np.vstack(all_embeddings)

# X_train = get_bert_embeddings(X_train_text.tolist())
# X_test = get_bert_embeddings(X_test_text.tolist())

# np.save("../model_data/X_train_bert.npy", X_train)
# np.save("../model_data/X_test_bert.npy", X_test)

# print("X_train", X_train)
# print("X_testX", X_test)

In [None]:
# all_emails_df = pd.read_csv('../data/04_all_emails_with_labelled_samples.csv')


In [None]:
# all_emails_df.sample(10, random_state=32)

Unnamed: 0,Date,From,To,Subject,X_From,X_To,Message,Entire-Message,DISC_rule,DISC_manual,DISC_final
22402,"Wed, 15 Mar 2000 09:56:00 -0800 (PST)",michael.burke@enron.com,stanley.horton@enron.com,EOTT Sub Unit Options plan,Michael Burke,Stanley Horton,"Stan,\nThis my understanding of the proposal\n...","EOTT Sub Unit Options plan Stan,\nThis my unde...",['I'],[],['I']
21391,"Mon, 19 Nov 2001 13:33:56 -0800 (PST)",max.sonnonstine@enron.com,marie.heard@enron.com,Forest and Noble Gas,"Sonnonstine, Max </O=ENRON/OU=NA/CN=RECIPIENTS...","Heard, Marie </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...","Marie,\nSince Jay is out this week, I'll be ""w...","Forest and Noble Gas Marie,\nSince Jay is out ...",[],[],[]
8258,"Tue, 8 May 2001 03:52:00 -0700 (PDT)",jennifer.rudolph@enron.com,ca.team@enron.com,NEWS: quick read - calif electric lottery,Jennifer Rudolph,CA Team,* an editorial from today's Wall Street Journa...,NEWS: quick read - calif electric lottery * an...,['C'],[],['C']
27522,"Wed, 19 Dec 2001 14:26:51 -0800 (PST)",lance.jameson@enron.com,tana.jones@enron.com,29th floor,"Jameson, Lance </O=ENRON/OU=NA/CN=RECIPIENTS/C...","Jones, Tana </O=ENRON/OU=NA/CN=RECIPIENTS/CN=T...","Tana,\nThe suite of offices on 29 will be comp...","29th floor Tana,\nThe suite of offices on 29 w...",['I'],[],['I']
44698,"Thu, 10 May 2001 03:57:00 -0700 (PDT)",wade.stubblefield@enron.com,"rogers.herndon@enron.com, meredith.eggleston@e...",Project Teams,Wade Stubblefield,"Rogers Herndon, Meredith M Eggleston, Don Blac...",Detailed below are the project teams identifie...,Project Teams Detailed below are the project t...,['D'],[],['D']
28785,"Wed, 10 Jan 2001 07:49:00 -0800 (PST)",zimin.lu@enron.com,kenneth.parkhill@enron.com,Storage model change: Commodity Delta,Zimin Lu,Kenneth Parkhill,"Dear All,\nI change the storage model output t...",Storage model change: Commodity Delta Dear All...,['C'],[],['C']
35942,"Mon, 18 Sep 2000 08:25:00 -0700 (PDT)",audrey.robertson@enron.com,"jeffery.fawcett@enron.com, steven.harris@enron...",Customer Meeting Attendance,Audrey Robertson,"Jeffery Fawcett, Steven Harris, Kevin Hyatt, L...","As of 3:00p today, Monday, September 18, I hav...","Customer Meeting Attendance As of 3:00p today,...",['C'],[],['C']
47543,"Mon, 5 Nov 2001 11:16:13 -0800 (PST)",josie.jarnagin@enron.com,center.dl-portland@enron.com,Reminder-Flu Shots-Please respond by 11/6/01 i...,"Jarnagin, Josie </O=ENRON/OU=NA/CN=RECIPIENTS/...",DL-Portland World Trade Center </O=ENRON/OU=NA...,Flu Shot Clinic Sponsored By Enron\nWhen: Tues...,Reminder-Flu Shots-Please respond by 11/6/01 i...,['I'],[],['I']
6046,"Fri, 14 Dec 2001 12:08:03 -0800 (PST)",ruth.jensen@enron.com,"ron.beidelman@enron.com, larry.campbell@enron....",CAM Applications to Engines with Catalytic Con...,"Jensen, Ruth </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...","Beidelman, Ron </O=ENRON/OU=NA/CN=RECIPIENTS/C...",I am trying to compile a list of engines with ...,CAM Applications to Engines with Catalytic Con...,['I'],[],['I']
44871,"Tue, 26 Jun 2001 11:02:22 -0700 (PDT)",m..presto@enron.com,greg.wolfe@enron.com,Real-time opportunities,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...","Wolfe, Greg </O=ENRON/OU=NA/CN=RECIPIENTS/CN=G...","Greg,\nMy brother is looking to make a career ...","Real-time opportunities Greg,\nMy brother is l...",['D'],[],['D']


In [13]:
# X = get_bert_embeddings(all_emails_df['Entire-Message'].tolist())
# No Y is needed here, because the labels are what we are trying to predict.
