In [244]:
import pandas as pd
from collections import defaultdict
from pymongo import MongoClient

def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

def read_mongo(db, collection, query={}, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [245]:
# Load raw collections into dataframes
email_df = read_mongo('enron', 'emails')

In [246]:
print (email_df['text_chunks'][1][0]['content'])

Please answer all of the questions below...
Are we going to look for a ring tomorrow?   I guess
Do you know what size she wears? (I'm working on finding out myself)  i don't know
What time frame do you want to work with for an engagement? (Yes a TIME FRAME)  (ie, in a week, month, year...)
I have the impression that beth wants you to take care of the whole wedding thing, is this true?
She says she wants to go away (deffinately tropical because she has never seen blue water) and invite close family members (parents sibblings) and close friends by sending a letter/invitation.  She says they will need time to save if they are going to come, so I suggested 6 months.
Does this sound appealing to you?
FYI... My brother went away to get married and I have a friend who went away so I have experience in planning tropical weddings.  They are great because only those who you want there come and others aren't affended for not being invited because y'all went away.  You can have a dream wedding for

#### E-Mail Formatting

In [247]:
# Format e-mail BSON to usable data frame
email_df = email_df[['from',
                     'to',
                     'header_info',
                     'subject',
                     'text_chunks']]
print ("# E-mails: " + str(email_df.shape[0]))
email_df = email_df[email_df['from'] > 0]
print ("# E-mails w/From: " + str(email_df.shape[0]))

# E-mails: 276279
# E-mails w/From: 263587


#### Entity Formatting

In [248]:
# Format entity BSON to usable data frame
ent_df = read_mongo('enron', 'entities')
ent_df = ent_df[['email_address',
                'email_addresses',
                'position',
                'position_id',
                'position_nodes',
                ]]
print ("# Entities: " + str(ent_df.shape[0]))

# Remove null affiliated employees
#ent_df_no_pn = ent_df[ent_df['position_nodes'].isnull()]
#ent_df = ent_df[ent_df['affiliation'].notnull()]
#print ("# Non-null-affiliated entities: " + str(ent_df.shape[0]))
ent_df = ent_df[ent_df['position_nodes'].notnull()]

print ("# Entities with position nodes: " + str(ent_df.shape[0]))

# Entities: 95941
# Entities with position nodes: 3187


#### New Ranking System Method

In [249]:
job_titles = set([])
jt_ids = set([])
for i, row in ent_df.iterrows():
    try:
        job_titles.add(row['position'])
        jt_ids.add(row['position_id'])
    except:
        None

In [250]:
job_titles_both = defaultdict(set)
for i, row in ent_df.iterrows():
    #try:
    job_titles_both[row['position']].add(row['position_id'])
    #except:
    #    None

In [251]:
jt_map = {}
rank_map = {}
i = 0

for jt in job_titles:
    jt = str(jt)
    if jt.find('CEO') != -1:
        jt_map[jt] = 'CEO'
        rank_map[jt] = 0
    elif jt.find('COO') != -1 or jt.find('CTO') != -1 or \
            jt.find('CFO') != -1 or \
            (jt.find('Chief') != -1 and jt.find('Officer') != -1):
        jt_map[jt] = 'C-Suite'
        rank_map[jt] = 0
    elif jt.find('Vice') != -1 or jt.find('VP') != -1:
        jt_map[jt] = 'VP'
        rank_map[jt] = 3
    elif jt.find('President') != -1:
        jt_map[jt] = 'President'
        rank_map[jt] = 2
    elif jt.find('Lawyer') != -1 or jt.find('Counsel') != -1 or jt.find('Legal') != -1 or jt.find('Attorney') != -1:
        jt_map[jt] = 'Lawyer'
        rank_map[jt] = 4
    elif jt.find('Director') != -1:
        rank_map[jt] = 4
        jt_map[jt] = 'Director'
    elif jt.find('Manager') != -1:
        jt_map[jt] = 'Manager'
        rank_map[jt] = 5
    elif jt.find('Analyst') != -1 or jt.find('Specialist') != -1 or jt.find('Engineer') != -1 or \
            jt.find('Tech') != -1:
        jt_map[jt] = 'Tech'
        rank_map[jt] = 6
    elif jt.find('Assistant') != -1 or jt.find('HR') != -1 or jt.find('Secretary') != -1 or \
            jt.find('Admin') != -1 or jt.find('Assoc') != -1 or jt.find('Clerk') != -1 or jt.find('Entry') != -1:
        jt_map[jt] = 'Admin'
        rank_map[jt] = 6
    elif jt.find('Trader') != -1:
            jt_map[jt] = 'Trader'
            rank_map[jt] = 6
    else:
        jt_map[jt] = 'Unknown'
        rank_map[jt] = -1
        i += 1

print (len(job_titles))
print (i)

368
125


In [252]:
email_to_jt = defaultdict(int)
k = 0
l = 0
for i, row in ent_df.iterrows():
    if str(row['email_address']) != 'nan' and str(row['position']) != 'nan':
        email_to_jt[row['email_address'].lower()] = rank_map[row['position']]
        for em_add in row['email_addresses']:
            email_to_jt[em_add.lower()] = rank_map[row['position']]

In [253]:
# Double check for still unknowns
i = 0
j = 0
for h, row in ent_df.iterrows():
    if str(row['email_address']) != 'nan':
        if email_to_jt[row['email_address'].lower()] == -1:
            jt = row['position_id']
            
            if jt.find('CEO') != -1:
                email_to_jt[row['email_address'].lower()] = 0
            elif jt.find('COO') != -1 or jt.find('CTO') != -1 or \
                    jt.find('CFO') != -1 or \
                    (jt.find('Chief') != -1 and jt.find('Officer') != -1):
                email_to_jt[row['email_address'].lower()] = 1
            elif jt.find('Vice') != -1 or jt.find('VP') != -1:
                email_to_jt[row['email_address'].lower()] = 3
            elif jt.find('President') != -1:
                email_to_jt[row['email_address'].lower()] = 2
            elif jt.find('Lawyer') != -1 or jt.find('Counsel') != -1 or jt.find('Legal') != -1 or jt.find('Attorney') != -1:
                email_to_jt[row['email_address'].lower()] = 4
            elif jt.find('Director') != -1:
                email_to_jt[row['email_address'].lower()] = 4
            elif jt.find('Manager') != -1:
                email_to_jt[row['email_address'].lower()] = 5
            elif jt.find('Analyst') != -1 or jt.find('Specialist') != -1 or jt.find('Engineer') != -1 or \
                    jt.find('Tech') != -1 or jt.find('Logistician') != -1 or jt.find('Operator') != -1 or \
                    jt.find('Statistician') != -1 or jt.find('Designer') != -1 or jt.find('Producer') != -1 or \
                    jt.find('Purchasing Agent') != -1:
                email_to_jt[row['email_address'].lower()] = 6
            elif jt.find('Assistant') != -1 or jt.find('HR') != -1 or jt.find('Secretary') != -1 or \
                    jt.find('Admin') != -1 or jt.find('Assoc') != -1 or jt.find('Clerk') != -1 or \
                    jt.find('Customer Service') != -1 or jt.find('Recruit') != -1 or jt.find('Entry') != -1:
                email_to_jt[row['email_address'].lower()] = 6
            elif jt.find('Trader') != -1:
                email_to_jt[row['email_address'].lower()] = 6
            else:
                i += 1
                j -= 1
        j += 1
print ("Email Addresses Mapped: " + str(j))
print ("Remaining Unknown: " + str(i))

Email Addresses Mapped: 1518
Remaining Unknown: 0


In [254]:
#for i, email in email_df.iterrows():
#    if 'content' in email['text_chunks'][0].keys():
#        print (email['text_chunks'][0]['content'])
#    if i > 1010: break
print (len(email_to_jt.keys()))

6154


In [255]:
# Employee Email to Position Code
u_count = 0
n_count = 0
s_count = 0
d_count = 0

conversation_strings = defaultdict(str)
conversation_count = defaultdict(int)
conversation_labels = defaultdict(str)

for i, email in email_df.iterrows():
    
    # Check if valid role
    from_ = None
    to_ = []
    for header in email['header_info']:
        if header['role'] == 'from':
            from_ = header['email_address'].lower()
        elif header['role'] == 'to':
            to_.append(header['email_address'].lower())
    if from_ is None:
        continue

    content = ""
    if 'content' in email['text_chunks'][0].keys():
        content += email['text_chunks'][0]['content']
    if str(email['subject']) != 'nan':
        content += email['subject'] + " "
    
    #if len(to_) > 8:
    #    continue
        
    for to_address in to_:
        if from_ != to_address:
            if to_address in email_to_jt .keys() and from_ in email_to_jt.keys():
                code1 = email_to_jt[from_]
                code2 = email_to_jt[to_address]
                
                if code1 == -1 or code2 == -1:
                    continue
                    
                index_ = (from_, to_address)
                
                u_count = 0
                n_count = 0
                s_count = 0
                d_count = 0
                
                conversation_strings[index_] += " " + content.lower()
                conversation_count[index_] += 1
                    
                if code1 < code2:
                    conversation_labels[index_] = 'up'
                    u_count += 1
                elif code1 > code2:
                    conversation_labels[index_] = 'down'
                    d_count += 1
                elif code1 == code2:
                    conversation_labels[index_] = 'neutral'
                    s_count += 1  
                else:
                    conversation_labels[index_] = 'unk'
                    n_count += 1
print (str(n_count) + " unranked messages.")

0 unranked messages.


In [256]:
total_length = len(conversation_strings.keys())
total_count = 0
convos_count = 0

for c_ in conversation_count:
    convos_count += conversation_count[c_]

for rc in conversation_strings:
    total_count += len(conversation_strings[rc])
    
print ("Total relationships: " + str(total_length))
print ("Avg char. count of content: " + str(total_count / total_length))

print ("\nTotal e-mails sent: " + str(convos_count))
print ("Avg convo. count per relationship: " + str(convos_count / total_length))

Total relationships: 36027
Avg char. count of content: 4525.337052765981

Total e-mails sent: 192538
Avg convo. count per relationship: 5.344269575596081


In [257]:
# Convert to label -> text
u_count = 0
nu_count = 0
n_count = 0
for key in conversation_strings.keys():
    
    if conversation_labels[key] == 'up':
        u_count += 1
    elif conversation_labels[key] == 'down':
        nu_count += 1
    elif conversation_labels[key] == 'neutral':
        n_count += 1
        

In [258]:
print ("Upward messages: " + str(u_count))
print ("Not-upward messages: " + str(nu_count))
print ("Neutral messages: " + str(n_count))

Upward messages: 9970
Not-upward messages: 11507
Neutral messages: 14550


## Data Cleansing 

In [259]:
import re
import nltk
import string
nltk.download('stopwords')
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import time 

# Some code from: https://medium.com/@sabber/classifying-yelp-review-comments
# used to create this function
def clean_text(text):
    
    ## Remove puncuation
    #text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 2]
    
    text = " ".join(text)
    
    ## Clean the text
    text = re.sub(r"[^A-Za-z^,!.\/'+-=]", " ", text)
    text = re.sub("\d+", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\@", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", " ", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", " ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    
    text = text.replace("  ", " ")
    text = text.split()
    text = [w for w in text if len(w) >= 3 and len(set(w)) > 1]
    text = " ".join(text)
    
    ## Stemming
    #text = text.split()
    #stemmer = SnowballStemmer('english')
    #stemmed_words = [stemmer.stem(word) for word in text]
    #text = " ".join(stemmed_words)
    
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [260]:
post_prep_format = []
i = 0
start = time.time()
print ("Processing " + str(len(conversation_strings.keys())) + " relationships...")
for key in conversation_strings.keys():
    i += 1
    label = conversation_labels[key]
    
    if label == 'up' or label == 'down':
        
        content = clean_text(conversation_strings[key])
        
        post_prep_format.append((label, content))
    
    if i % 10 == 0:
        print ("At position..." + str(i), end='\r')
        
finish = time.time()
print("Text processing completed in " + str(finish-start) + " seconds.")

Processing 36027 relationships...
Text processing completed in 30.2667236328125 seconds.


In [273]:
df = pd.DataFrame(post_prep_format, columns=['label', 'message'])
df = df.drop_duplicates()

df = df.sample(frac=1).reset_index(drop=True)

df_train = df[:round(df.shape[0] *.7)]
df_test = df[round(df.shape[0] *.7):]

df_test = df_test.reset_index()

print ((df.shape[0]))

10906


In [274]:
for message in df['message']:
    if message.find('0') != -1:
        print (message)
        break

In [278]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np

vectorizer = CountVectorizer(ngram_range=(2,6))
clf = MultinomialNB()
tfidf_transformer = TfidfTransformer()
baseline_clf = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer()),
                         ('clf', clf),
])

In [279]:
baseline_clf.fit(df_train['message'], df_train['label'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 6), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [280]:
predicted = baseline_clf.predict(df_test['message'])
print ("Initial baseline accuracy: " + str(np.mean(predicted == df_test['label'])))

Initial baseline accuracy: 0.66320293398533


In [281]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [282]:
show_most_informative_features(vectorizer, clf, n=10)

	-15.5771	aae dellnet    		-12.8929	let know       
	-15.5771	aae dellnet combliley		-12.9599	would like     
	-15.5771	aae dellnet combliley transmission		-13.2567	please let     
	-15.5771	aae dellnet combliley transmission language		-13.3358	please let know
	-15.5771	aae dellnet combliley transmission language case		-13.4559	staff meeting  
	-15.5771	aae dellnet combliley transmission language commerce		-13.4875	see attached   
	-15.5771	aae dellnet combliley transmission language richardson		-13.5189	please see     
	-15.5771	aae dellnet comlanguage		-13.5451	master agreement
	-15.5771	aae dellnet comlanguage bliley		-13.5470	north america  
	-15.5771	aae dellnet comlanguage bliley sent		-13.6132	enron north america


#### What's going wrong?

In [272]:
# Print 5 messages of misclassified samples
import math
test_start_index = math.floor(df.shape[ 0] * .7)
k = 0
j = 0
l = 0
q = 0
m = 0
for i, x in enumerate(predicted):
    if x == 'up':
        j += 1
    else:
        k += 1
    
    if df_test['label'][i] == 'up':
        l += 1
    else:
        m += 1
    if x != df_test['label'][i]:
        print (x)
        print (df_test['message'][i])
        if q > 5:
            break
        else:
            q += 1
print ("Predicted up: " + str(j))
print ("Actual up: " + str(l))
print ("Predicted down: " + str(k))
print ("Actual down: " + str(m))

down
dear john attached trading contract package natsourcedirect trading platform based proven global vision software trayport believe strongly able serve better combining skills brokers line capabilities natsourcedirect able trade system need sign standard participant agreement read market rules procedures complete schedule please mail fax signed agreement new york office attention able provide trading access natsourcedirect questions please hesitate call email tclaughton natsource com mail natsource llc natsourcedirect division attn trent claughton broadway floor fax natsourcedirect division attn trent claughton best regards trent claughton need adobe acrobat read print documents adobe acrobat reader downloaded free http adobe com copies fedexed email tclaughton natsource comnatsourcedirect trading contract
up
ken responsible managing enron banking relationships speaking rosalee reserving time calendar meetings top banks given uncertainty tightness credit markets andy ben discussed i

## Simple NN

In [177]:
words = set()
for message in df_train['message']:
    for word in message.split():
        words.add(word)
vocab_size = len(words)
print ("Vocab size: " + str(vocab_size))

Vocab size: 53546


In [183]:
df = pd.DataFrame(post_prep_format, columns=['label', 'message'])
df = df.drop_duplicates()
j = 0
k = 0
binary = False
if binary:
    for i, row in df.iterrows():
        if row['label'] == 'up':
            row['label'] = 1
            j += 1
        else:
            row['label'] = 0
            k += 1

df_train = df[:round(df.shape[0] *.7)]
df_test = df[round(df.shape[0] *.7):]

In [201]:
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras import optimizers
from sklearn.manifold import TSNE

num_labels = 2
batch_size = 100
 
# define Tokenizer with Vocab Size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_train['message'])
 
x_train = tokenizer.texts_to_matrix(df_train['message'], mode='tfidf')
x_test = tokenizer.texts_to_matrix(df_test['message'], mode='tfidf')
 
encoder = LabelBinarizer()
encoder.fit(df_train['label'])
y_train = encoder.transform(df_train['label'])
y_test = encoder.transform(df_test['label'])

In [196]:
model = Sequential()
model.add(Dense(1024, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dense(1024))
model.add(Activation('softmax'))
model.add(Dense(num_labels))
model.add(Activation('sigmoid'))
model.summary()

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=3,
                    verbose=0,
                    validation_split=0.3)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 1024)              54832128  
_________________________________________________________________
activation_22 (Activation)   (None, 1024)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
activation_23 (Activation)   (None, 1024)              0         
_________________________________________________________________
dense_24 (Dense)             (None, 2)                 2050      
_________________________________________________________________
activation_24 (Activation)   (None, 2)                 0         
Total params: 55,883,778
Trainable params: 55,883,778
Non-trainable params: 0
________________________________________________________________

In [197]:
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.6941606323701537
Test accuracy: 0.5241442542787286


### LSTM

In [205]:
df = pd.DataFrame(post_prep_format, columns=['label', 'message'])
df = df.drop_duplicates()
df = df.sample(frac=1).reset_index(drop=True)

j = 0
k = 0
binary = True
if binary:
    for i, row in df.iterrows():
        if row['label'] == 'up':
            row['label'] = 1
            j += 1
        else:
            row['label'] = 0
            k += 1

df_train = df[:round(df.shape[0] *.7)]
df_test = df[round(df.shape[0] *.7):]

In [215]:
### Create sequence
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(df_train['message'])
sequences = tokenizer.texts_to_sequences(df_train['message'])
data = pad_sequences(sequences, maxlen=50)

In [216]:
## Network architecture
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=50))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
## Fit the model
model.fit(data, np.array(df_train['label']), validation_split=0.4, epochs=3)

Train on 4580 samples, validate on 3054 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2370b2585f8>

In [217]:
tokenizer.fit_on_texts(df_test['message'])
sequences_t = tokenizer.texts_to_sequences(df_test['message'])
data_t = pad_sequences(sequences_t, maxlen=50)
score = model.evaluate(data_t, np.array(df_test['label']), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.8987463428513637
Test accuracy: 0.48746943765281175


### LSTM-CNN

In [219]:
### Create sequence
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(df_train['message'])
sequences = tokenizer.texts_to_sequences(df_train['message'])
data = pad_sequences(sequences, maxlen=50)

In [222]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=50))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=8))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])
model.fit(data, np.array(df_train['label']), validation_split=0.4, epochs=3)

Train on 4580 samples, validate on 3054 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2370fd70358>

In [223]:
tokenizer.fit_on_texts(df_test['message'])
sequences_t = tokenizer.texts_to_sequences(df_test['message'])
data_t = pad_sequences(sequences_t, maxlen=50)
score = model.evaluate(data_t, np.array(df_test['label']), verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.9201591125618858
Test accuracy: 0.508557457212714


### CNN

In [225]:
### Create sequence
tokenizer = Tokenizer(num_words= vocab_size)
tokenizer.fit_on_texts(df_train['message'])
sequences = tokenizer.texts_to_sequences(df_train['message'])
data = pad_sequences(sequences, maxlen=50)

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=50))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

sgd =optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)

model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(data, np.array(df_train['label']), validation_split=0.3, epochs=3)
    

In [239]:
#### SciKit Learn Default Models
##### Adapted for this data set with some additional slight modifications

In [231]:
import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [232]:
df = pd.DataFrame(post_prep_format, columns=['label', 'message'])
full = df.shape[0]
df = df.drop_duplicates()
df = df.sample(frac=1).reset_index(drop=True)
df_train = df[:round(df.shape[0] *.7)]
df_test = df[round(df.shape[0] *.7):]

In [233]:
y_train = df_train['label']
y_test = df_test['label']

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(df_train['message'])
X_test = vectorizer.transform(df_test['message'])

In [234]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [235]:
results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='lsqr',
        tol=0.01)




train time: 0.585s
test time:  0.003s
accuracy:   0.656

Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      max_iter=None, n_iter=50, n_jobs=1, penalty=None, random_state=0,
      shuffle=True, tol=None, verbose=0, warm_start=False)




train time: 0.289s
test time:  0.003s
accuracy:   0.619

Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              fit_intercept=True, loss='hinge', max_iter=None, n_iter=50,
              n_jobs=1, random_state=None, shuffle=True, tol=None,
              verbose=0, warm_start=False)




train time: 0.404s
test time:  0.002s
accuracy:   0.629

kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
train time: 0.021s
test time:  2.138s
accuracy:   0.642

Random forest
________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train time: 164.386s
test time:  0.484s
accuracy:   0.625



In [236]:
from sklearn import svm, grid_search
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)
clf.fit(X_train, y_train)
                            




GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [237]:
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.647


### Optimizing initial model 

In [241]:
parameters = {'ngram_range': ('2,2', '2,3')}
mnb = MultinomialNB()
clf = grid_search.GridSearchCV(mnb, parameters)
clf.fit(X_train, y_train)
vectorizer = CountVectorizer(ngram_range=(3,3))

tfidf_transformer = TfidfTransformer()
baseline_clf = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer()),
                         ('clf', clf),
])

In [None]:
baseline_clf.fit(df_train['message'], df_train['label'])
predicted = baseline_clf.predict(df_test['message'])
print ("Initial baseline accuracy: " + str(np.mean(predicted == df_test['label'])))