# Enron Action Item classification 

In [79]:
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.metrics import precision_score,recall_score,f1_score,roc_curve,confusion_matrix
from sklearn.model_selection import train_test_split


In [80]:
def clean_text_through_regex(message):
#     print(row)
    tokens = []
    try:
        for token in message.split():
            token = token.lower()
            token = re.sub("[image]","",token)
            token=re.sub('x-.*:',"",token)
            tokens.append(token)
    except:
        token = ""
        tokens.append(token)
    return " ".join(tokens)

In [81]:
data=pd.read_csv("./final_classifier_data.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7795 entries, 0 to 7794
Data columns (total 2 columns):
sentence           7795 non-null object
actionable_item    7795 non-null bool
dtypes: bool(1), object(1)
memory usage: 68.6+ KB


Unnamed: 0,sentence,actionable_item
0,Make recommendations including budgets.,True
1,00 east coast nepool 90,False
2,"David, Please change the date on the draft so ...",True
3,christian yoder and steve hall are reviewing t...,False
4,"for further assistance with unsubscribing, yo=...",False


In [82]:
data.loc[data['actionable_item'] ==True, 'actionable_item']=1
data.loc[data['actionable_item']==False,'actionable_item']=0
data.head()

Unnamed: 0,sentence,actionable_item
0,Make recommendations including budgets.,1
1,00 east coast nepool 90,0
2,"David, Please change the date on the draft so ...",1
3,christian yoder and steve hall are reviewing t...,0
4,"for further assistance with unsubscribing, yo=...",0


In [83]:
#more cleaning
import re
data.sentence.str.replace("[image]","")
data.sentence.str.replace(re.compile("x-.*:"),"")
data.sentence.str.replace(re.compile("<.*>.*</.*>"),"")
data.drop(data[data['sentence'].str.len() < 20].index, inplace = True) 
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7735 entries, 0 to 7794
Data columns (total 2 columns):
sentence           7735 non-null object
actionable_item    7735 non-null object
dtypes: object(2)
memory usage: 181.3+ KB


In [84]:
train, test = train_test_split(data, test_size=0.2)
train,val= train_test_split(train, test_size=0.2)
print("Train set length {}".format(len(train)))
print("Test set length {}".format(len(test)))
print("Val set length {}".format(len(val)))

Train set length 4950
Test set length 1547
Val set length 1238


In [85]:
# A utility method to create a tf.data dataset from a Pandas Dataframe, From : https://www.tensorflow.org/tutorials/structured_data/feature_columns
def df_to_dataset(dataframe, shuffle=True,batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('actionable_item')
  ds = tf.data.Dataset.from_tensor_slices((dataframe['sentence'], labels))

  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe)).batch(batch_size,drop_remainder=True)
  return ds

In [86]:

train_data = df_to_dataset(train,batch_size=512)
val_data = df_to_dataset(val, shuffle=False)
test_data = df_to_dataset(test,shuffle=False)



In [87]:
for feature_batch, label_batch in train_data.take(1):
  print('A batch of ages:', feature_batch)
  print('A batch of targets:', label_batch)

A batch of ages: tf.Tensor(
[b'Please resend it and will roll it for this week this morning.'
 b'burn=measured daily usage thru electronic measurement, min dq-used if customer does not have electronic measurement'
 b'for information on becoming an affiliate click here: http://www'
 b'nsf lucy, here is a schedule of the most recent utility bills and the overages'
 b'subject: mid c new deals sept 24 phillip, here is the breakdown on new deals and option values for sept 24'
 b'his contact information is: phone (713)781-5810, fax (713)781-6614, and email jim123@pdq'
 b'com *** if you would like to be added to future event mailings, please click to www'
 b'you can keep a log on paper or on the computer'
 b'html find the best web-publishing systems http://chkpt'
 b'i am not sure if this is exactly what you need or not'
 b"as you may recall, earlier this summer we informed you that el paso's line 1110 had been brought down for repair, pigging and ops review"
 b'so that reduces some of the sav

In [88]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(train_examples_batch[:3])

<tf.Tensor: id=142856, shape=(3, 20), dtype=float32, numpy=
array([[ 1.5573357e-01, -1.6626104e+00,  1.8650322e+00,  2.4816637e+00,
        -1.4826401e+00, -1.1316029e+00,  2.1797378e-01,  1.1033629e+00,
         6.8309858e-02, -1.6947306e-03, -2.2111032e+00,  3.3551735e-01,
        -4.2400092e-01,  6.0004383e-01,  3.5240003e-01, -7.1367389e-01,
         2.3353989e+00, -1.2479353e+00, -5.1270074e-01, -7.8972071e-01],
       [-2.3828392e+00, -4.7576043e-01, -1.1516083e+00,  1.8457975e+00,
         1.9555467e+00, -2.7957010e+00, -7.4170488e-01, -2.3338521e+00,
        -1.3306203e+00, -1.2596037e+00,  1.2413057e+00,  8.3921778e-01,
        -1.6220599e+00,  1.5289219e-01,  2.2239392e+00,  1.3495511e+00,
         5.9250730e-01, -3.5831320e+00,  1.7612565e+00, -1.4403285e-02],
       [ 4.6105564e-01, -7.8218991e-01,  5.9550911e-01,  4.2833871e-01,
        -1.1568698e+00, -4.1606683e-01, -7.2149861e-01,  6.9150347e-01,
         4.0079048e-01,  2.2099164e-01, -9.9852675e-01,  4.3770045e-01,
  

In [89]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_2 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [90]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [95]:
history = model.fit(train_data,
                    epochs=30,
                    validation_data=val_data.batch(512,drop_remainder=True),
                    verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### Test Accuracies and Other Metrics

In [96]:


x,y=next(iter(test_data.batch(1547,drop_remainder=True)))
predicted=model.predict_on_batch(x)
results = model.evaluate(test_data.batch(1547), verbose=2)
for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))# 
tp=0
tn=0
y_pred=[]
for row,val in enumerate(y):
    if predicted[row]>0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

print("-------------------------------")

print("Precision", precision_score(y, y_pred))
print("Recall", recall_score(y, y_pred))
print("f1_score", f1_score(y, y_pred))
print("confusion_matrix")
print(confusion_matrix(y, y_pred))
print("------------------------------")

1/1 - 0s - loss: 0.1387 - accuracy: 0.9509
loss: 0.139
accuracy: 0.951
-------------------------------
Precision 0.8899082568807339
Recall 0.7886178861788617
f1_score 0.8362068965517241
confusion_matrix
[[1277   24]
 [  52  194]]
------------------------------


In [97]:
 model.predict(["send me the assignment by today","the figures for enron are solid","can you get me the latest news on the deal"])
    
    
    

array([[0.87501764],
       [0.02841192],
       [0.8524244 ]], dtype=float32)