# Predicting the author of Facebook Messenger messages

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

## Load and parse messages

In [None]:
# Replace with the path to messages.htm from the FB dump
HTML_PATH = '/path/to/messages.htm'

# Decode the file explicitly instead of relying on the platform's default
# encoding, which would mangle non-ASCII message text (or raise) on systems
# whose locale is not UTF-8.
# NOTE(review): assumes the export is UTF-8 -- confirm against your dump.
with open(HTML_PATH, encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html5lib')

In [None]:
# Build one list per thread, each holding a dict for every message
# in that thread. The sender and timestamp are read from inside the
# message element; the content is read from the node right after it.
threads = soup.select('.thread')
thread_list = [
    [
        {
            'user': msg.select_one('span.user').text,
            'timestamp': msg.select_one('span.meta').text,
            'content': msg.next_sibling.text,
        }
        for msg in thread.select('div.message')
    ]
    for thread in threads
]

In [None]:
# Collect one id per thread: the string form of the node that sits
# immediately before the thread's first message
thread_ids = [
    str(thread.select_one('div.message').previous_sibling)
    for thread in threads
]

In [None]:
# Merge everything into a single dataframe: build one frame per thread,
# tag every row with that thread's id, then stack the frames into `df`
dfs = []
for thread_id, thread in zip(thread_ids, thread_list):
    thread_df = pd.DataFrame(thread)
    thread_df['id'] = thread_id
    dfs.append(thread_df)

# ignore_index=True gives every message a unique row label. Without it
# each thread's rows restart at 0, so label-based lookups are ambiguous.
df = pd.concat(dfs, ignore_index=True)

## Clean up

In [None]:
# Uncomment to parse dates (slow)
#df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
# Some rows carry no user information (a Facebook bug),
# so keep only the rows whose user field is non-empty :(
has_user = df['user'] != ''
df = df[has_user]

## Pick a conversation

In [None]:
# Finding the right thread id takes some manual digging
# in the data. The numbers in the id represent the users
# involved in the thread
THREAD_ID = '123456789@facebook.com, 0123456789@facebook.com'
in_thread = df['id'].eq(THREAD_ID)
s = df.loc[in_thread]

## Train model

In [None]:
# Hold out a quarter of the conversation for testing. The message
# text is the input and the sending user is the label to predict.
messages, authors = s['content'], s['user']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    authors,
    test_size=0.25,
)

In [None]:
# Alternative token pattern that, besides ordinary words of two or more
# characters, also keeps ':)', ':))', ':D', ':/' and '?' as tokens.
# It is not used by default; pass token_pattern=EMOTICON_TOKEN_PATTERN
# to CountVectorizer below to experiment with it.
EMOTICON_TOKEN_PATTERN = r'(?u)\b\w\w+\b|:\)\)?|:D|:/|\?'

# Create a word vector from the training data. Previously a vectorizer
# using the pattern above was built and then immediately overwritten by
# a default one; only the default vectorizer is constructed now.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [None]:
# Train a multinomial naive Bayes classifier on the word counts
# of the training messages and their known senders
model = MultinomialNB()
clf = model.fit(X_train_counts, y_train)

## Evaluate

In [None]:
# Vectorize the held-out messages with the vectorizer fitted on the
# training data, predict their senders and show the share of correct
# predictions. As a reference point, pure guessing scores about 0.5
# (presumably for a two-person thread with equally active users).
X_test_counts = count_vect.transform(X_test)
pred = clf.predict(X_test_counts)
accuracy_score(y_true=y_test, y_pred=pred)

## Make new predictions!

In [None]:
# Feed the algorithm some sample texts
docs = ['jag är trött', 'gud vad kul!', 'ska vi dricka öl i kväll?']

# Create a sparse representation of the texts above
X_new_counts = count_vect.transform(docs)

# Predict user based on texts
predicted = clf.predict(X_new_counts)

# Print the results. The loop variable is called `author` rather than
# `pred` so that it does not overwrite the test-set predictions stored
# in `pred` by the evaluation step.
for doc, author in zip(docs, predicted):
    print('{}\nProbably written by: {}\n---'.format(doc, author))