# Harry's Test Notebook
Experimentation with:
* Processing my personal Facebook chat data with BeautifulSoup
* Preprocessing of text? (stop word removal, stemming, lowercase)
* Similarity with **TF-IDF**
* **doc2vec**

In [272]:
# Load in libraries
from bs4 import BeautifulSoup
import urllib
import pickle
import editdistance
from operator import itemgetter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import numpy as np
import math
from copy import copy
from gensim.models import doc2vec, word2vec
from collections import namedtuple

# Scrape and preprocess Facebook Messenger messages

In [2]:
# Read html
# NOTE(review): this is an absolute path to a local Facebook export; point
# MESSAGES_URL at your own copy when running on another machine.
MESSAGES_URL = 'file:///Users/harryxue/Desktop/facebook-harryxue/html/messages.htm'

# Close the handle explicitly once the contents are read rather than leaving
# it open until it happens to be garbage collected.
response = urllib.urlopen(MESSAGES_URL)
try:
    page = response.read()
finally:
    response.close()

# Parse the raw markup with the lxml parser.
soup = BeautifulSoup(page, 'lxml')

In [3]:
# Print first 5000 characters to locate room groupchat
print soup.prettify()[:5000]

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Harry Xue - Messages
  </title>
  <link href="../html/style.css" rel="stylesheet" type="text/css"/>
 </head>
 <body>
  <div class="nav">
   <img src="../photos/profile.jpg"/>
   <ul>
    <li>
     <a href="../index.htm">
      Profile
     </a>
    </li>
    <li>
     <a href="../html/contact_info.htm">
      Contact Info
     </a>
    </li>
    <li>
     <a href="../html/timeline.htm">
      Timeline
     </a>
    </li>
    <li>
     <a href="../html/photos.htm">
      Photos
     </a>
    </li>
    <li>
     <a href="../html/synced_photos.htm">
      Synced Photos
     </a>
    </li>
    <li>
     <a href="../html/videos.htm">
      Videos
     </a>
    </li>
    <li>
     <a href="../html/friends.htm">
      Friends
     </a>
    </li>
    <li class="selected">
     Messages
    </li>
    <li>
     <a href="../html/pokes.htm">
      Pokes
     </a>
    </li>
    <li>
     <a href="../h

In [11]:
# The first <div> inside <body> is the navigation bar (class "nav" in the
# preview above); the node right after it is taken as the contents container.
nav_div = soup.body.div
contents_div = nav_div.next_sibling
# Descend two <div> levels to reach the div of the room groupchat.
groupchat_div = contents_div.div.div

In [12]:
# Collect the text of every <p> element found under the groupchat div;
# each one is treated as a single message.
groupchat_p = groupchat_div.find_all("p")
groupchat_messages = [p.text for p in groupchat_p]

In [18]:
print "Number of messages:", len(groupchat_messages)
print groupchat_messages[:10]

Number of messages: 7316
[u'', u'No more pumpkins sigh', u'', u'in 121. I\u2019ll give it that much', u'ok ok ok fine. I\u2019ll get op at regex', u'Cough algorithms', u'Cough parsing', u'Cough compilers', u'But k', u'I beg to disagree sir']


We may have to clean this data later (for example, the empty-string messages visible in the output above), but for now we'll move on.

In [16]:
# Create pickle dump of all messages
# The `with` block guarantees the file is flushed and closed, even if
# pickling raises part-way through.
with open('groupchat.p', 'wb') as pickle_file:
    pickle.dump(groupchat_messages, pickle_file)

# Naive Implementation with Edit Distance
This chatbot matches the user's input message to the closest message in the groupchat history (the one with the smallest edit distance) and then outputs the message that follows that match in the history.

In [25]:
train_data = groupchat_messages

while True:
    s = raw_input('Message: ')

    dists = []

    for past_m in train_data:
        dists.append(editdistance.eval(past_m, s))

    best_ind = min(enumerate(dists), key=itemgetter(1))[0] 

    if best_ind == len(train_data):
        print 'Pizzabot: ' + train_data[0]
    else: 
        print 'Pizzabot: ' + train_data[best_ind + 1]

KeyboardInterrupt: 

Message: sds


# Naive Implementation with TF-IDF
Now let's try vectorizing each of the messages via TF-IDF and use **cosine similarity** as the similarity measure (a higher score means a closer match).

In [169]:
train_data = copy(groupchat_messages)

while True:
    s = raw_input('Message: ')
    
    # Vectorize each of the messages
    vect = TfidfVectorizer(min_df=1)
    train_data.append(s)
    tfidf = vect.fit_transform(train_data)
    
    dists = []
    
    for index, past_m in enumerate(train_data[:-1]):
        # Calculate cosine similarity between each of the messages in chat history and the user inputted message
        dist = cosine_similarity(tfidf[index].toarray(), tfidf[-1].toarray())[0][0]
        dists.append(dist)
        
    best_ind = max(enumerate(dists), key=itemgetter(1))[0] 

    if best_ind == len(train_data):
        print 'Pizzabot: ' + train_data[0]
    else: 
        print 'Pizzabot: ' + train_data[best_ind + 1]
        print '(Closest match to:', train_data[best_ind], ')'

Message: chris u sux
Pizzabot: Because those TFs likely were faculty
(Closest match to: lol chris
Message: poop is good
Pizzabot: K
(Closest match to: Poop
Message: What do you think of stat 110?
Pizzabot: this morning
(Closest match to: What did you think of it?
Message: chris is dead
Pizzabot: *waiting
(Closest match to: Is Chris dead
Message: *waiting
Pizzabot: Bacon mushroom ottos just siting for you
(Closest match to: *waiting
Message: anyone wanna lift later?
Pizzabot: Well blood test
(Closest match to: Can you lift later?
Message: where u
Pizzabot: Almost there
(Closest match to: Where u
Message: how u how life
Pizzabot: Wut
(Closest match to: How
Message: I hate 109
Pizzabot: 
(Closest match to: as much as I hate javascript
Message: I am poop
Pizzabot: K
(Closest match to: Poop
Message: I need to poop
Pizzabot: K
(Closest match to: Poop
Message: K
Pizzabot: No more pumpkins sigh
(Closest match to: 
Message: ok
Pizzabot: uh i did
(Closest match to: ok
Message: where do you wanna

KeyboardInterrupt: 

# Naive Implementation with doc2vec
Now let's try training a doc2vec model

In [294]:
# Lightweight (words, tags) record; one is built per message and the list of
# them is passed to Doc2Vec below.
taggedMessage = namedtuple('TaggedMessage', 'words tags')
documents = []

# Preprocess messages
# TODO: Decide what further preprocessing we may want to do
for i, message in enumerate(groupchat_messages):
    # Lowercase, then split on whitespace; punctuation is left attached.
    words = message.lower().split()
    # Tag each message with its position in the chat history.
    tags = [i]
    # Build the record once (the earlier unused duplicate was dropped).
    documents.append(taggedMessage(words, tags))

# Train model
# NOTE(review): no seed is set, so results may differ from run to run.
model = doc2vec.Doc2Vec(documents, size=100, workers=4, iter=20)

In [295]:
# Demo of functions that may be useful
# Get most similar terms to a given term
# The query is lowercase because messages were lowercased before training.
model.most_similar("josh")

[(u'guys', 0.649903416633606),
 (u'phil', 0.6136562824249268),
 (u'jdawg', 0.6103042364120483),
 (u'where', 0.5632401704788208),
 (u'probability', 0.5101494789123535),
 (u'eat', 0.5087506771087646),
 (u'club', 0.4903731346130371),
 (u'want', 0.48883336782455444),
 (u'breakfast', 0.4771336317062378),
 (u't', 0.46553558111190796)]

# Exploration of word relationships with word2vec
Let's train a word2vec model on the groupchat data and visualize the 2d projection of the words

In [313]:
# Train the word2vec model on whitespace-split messages.
# No lowercasing is applied here, so tokens keep their original case.
messages = [text.split() for text in groupchat_messages]
w2v_model = word2vec.Word2Vec(
    messages,
    size=100,
    min_count=3,
    iter=20,
)

In [329]:
# Get the terms most similar to "lift". The lookup is case-sensitive because
# the training tokens were not lowercased.
w2v_model.most_similar("lift")

[(u'join', 0.9992973804473877),
 (u'yo', 0.9987936615943909),
 (u'Do', 0.9987677335739136),
 (u'explain', 0.9986845254898071),
 (u'eat', 0.9986089468002319),
 (u'still', 0.9985542893409729),
 (u'You', 0.9984771013259888),
 (u'wanna', 0.9984598159790039),
 (u'When', 0.9984278678894043),
 (u'they', 0.9984255433082581)]