# Baseline Models
Basic baseline models for next utterance classification using:
1. completely random predictor
2. TF-IDF
3. word2vec (with averaging)
4. doc2vec
5. LDA

These are the methods used for vectorizing messages. Models are evaluated using recall@k on the test set, with candidate responses ranked by cosine similarity to the context.

In [1]:
import urllib
import pickle
import numpy as np
import pandas as pd
from copy import copy
from gensim.models import doc2vec, word2vec
import gensim
from collections import namedtuple
import timeit
import csv

from modules.evaluation_metrics import recall_at_k
from modules import tfidf
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import scale

In [2]:
# Load the train / test / validation splits from their CSV files
train, test, valid = (
    pd.read_csv("data/train.csv"),
    pd.read_csv("data/test.csv"),
    pd.read_csv("data/valid.csv"),
)

In [8]:
# Pool the training contexts and utterances into one flat array of messages
# (all contexts first, followed by all utterances)
train_data = np.append(train["Context"].values, train["Utterance"].values)

In [7]:
# Show the dimensions and the first rows of the training set.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(train.shape)
train.head()

(1000000, 3)


Unnamed: 0,Context,Utterance,Label
0,i think we could import the old comment via rs...,basic each xfree86 upload will not forc user t...,1
1,i 'm not suggest all - onli the one you modifi...,sorri __eou__ i think it be ubuntu relat . __e...,0
2,afternoon all __eou__ not entir relat to warti...,"yep . __eou__ oh , okay . i wonder what happen...",0
3,interest __eou__ grub-instal work with / be ex...,that the one __eou__,1
4,and becaus python give mark a woodi __eou__ __...,( i think someon be go to make a joke about .a...,1


In [8]:
# Show the dimensions and the first rows of the test set.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(test.shape)
test.head()

(18920, 11)


Unnamed: 0,Context,Ground Truth Utterance,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
0,anyon know whi my stock oneir export env var u...,nice thank ! __eou__,"wrong channel for it , but check efnet.org , u...","everi time the kernel chang , you will lose vi...",ok __eou__,! nomodeset > acer __eou__ i 'm assum it be a ...,http : //www.ubuntu.com/project/about-ubuntu/d...,thx __eou__ unfortun the program be n't instal...,how can i check ? by do a recoveri for test ? ...,my humbl apolog __eou__,# ubuntu-offtop __eou__
1,i set up my hd such that i have to type a pass...,"so you dont know , ok , anyon els ? __eou__ yo...","nmap be nice , but it be n't what i be look fo...",ok __eou__,cdrom work fine on window . __eou__ i dont thi...,"ah yes , i have read return as rerun __eou__",hm ? __eou__,"not the case , lts be everi other .04 releas ....",pretti much __eou__,i use the one i download from amd __eou__,"ffmpeg be part of the packag , quixotedon , at..."
2,im tri to use ubuntu on my macbook pro retina ...,just wonder how it run __eou__,"yes , that 's what i do , export it to a `` id...",noth - i be talk about the question of myhero ...,that should fix the font be too larg __eou__,"okay , so hcitool echo back hci0 < mac address...",i get to the menu with option such as tri ubun...,whi do u need analyz __eou__ it be a toy __eou...,cntrl-c may stop the command but it doe n't fi...,"if you re onli go to run ubuntu , just get a n...",the one which be not pick up at the moment be ...
3,no suggest ? __eou__ link ? __eou__ how can i ...,you cant load anyth via usb or cd when luk be ...,-p sorri ... __eou__ nmap -p22 __eou__ it doe ...,i guess so i ca n't even launch it . __eou__,note __eou__,rxvt-unicod be one __eou__,i tar all of ~ __eou__,i tar all of ~ __eou__,"i do n't realli know if i can help , but i be ...","that work just fine , thank ! __eou__",thank you __eou__
4,i just ad a second usb printer but not sure wh...,i be set it up under the printer configur __eo...,i 'd say the most common venu would be via lau...,"the old hardi man page , http : //manpages.ubu...",i ll give a tri __eou__,"by the way , the url you post for davf be from...",http : //ubuntuforums.org/showthread.php ? t=1...,"so i load up putti gui , then what do i do ? _...","you should read error messag , it say be you r...",wait the colleg semest to close just to make s...,i be call myself a jerk . all i know be that y...


In [10]:
# Show the dimensions and the first rows of the validation set.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(valid.shape)
valid.head()

(19560, 11)


Unnamed: 0,Context,Ground Truth Utterance,Distractor_0,Distractor_1,Distractor_2,Distractor_3,Distractor_4,Distractor_5,Distractor_6,Distractor_7,Distractor_8
0,ani idea on how lts will be releas ? __eou__ _...,we be talk 12.04 not 10.04 __eou__,you rememb my flash issu from yesterday or the...,"oh , no idea other be probabl ok __eou__ updat...","no , greenit be say his download speed be slow...",lsb_releas -sc __eou__ well ... regardless . i...,you can buy _anything_ in china __eou__,no __eou__,sudo restart lightdm __eou__,you be still ask for the uniti logout menu rig...,"so i be work as a linux admin intern , and my ..."
1,how much hdd use ubuntu default instal ? __eou...,that whi i ask how much be default instal ? : ...,all of this possibl in older version of ubuntu...,: be that a question ? __eou__,yes __eou__,"thank __eou__ i would imagin so , the site bon...",yes i ve investig that alreadi . it seem you c...,not realli . i use urxvt myself . __eou__,"thank a lot , realli ! __eou__","as someon els suggest , close update-manag , a...",you re welcom .. sinc 12.04 throw dnsmasq into...
2,in my countri it near the 27th __eou__ when wi...,thanx __eou__,"i have no .docx file , so do n't know , whi no...",i ve boot countless distro from usb on my aao ...,but i 'm sure i can work it out __eou__,"the way you put it , that sound like a sever c...",im not familiar with hotspot __eou__,it work fine without set up an ssh tunnel manu...,so it have two be a two-command process ? __eou__,"and becaus you onli have 3 gb of ram , be not ...",it ok but no error ? then how do you know it a...
3,it 's not out __eou__ __eot__ they probabali b...,wait for mani thing to be setup __eou__ final ...,"that 's right , while chat i regrett make a lo...",afaik it 's best to start at 2mb = 2048k __eou__,"for the most part , you should be instal pytho...",do you overwrit your win instal or can you bro...,for some reason the headphon option doe not ch...,well then i do n't know . can anyth boot on th...,well then i do n't know . can anyth boot on th...,"ya , but i guess you could do a git of your en...",noexec be a mount option . you would have to c...
4,be the ext4 driver stabl ? __eou__ __eot__ i b...,you sound like it 's updat to skynet . ; ) __e...,"ok i will tri that , brb __eou__ it complain a...",ouch __eou__,i do system annalysi and it say everyth pass 1...,not to mention way less complex ... you can ha...,"well , you can , accord to that articl , i als...","if not , i think you can pretti much grab ani ...","gpart ? i do n't want do edit partit , just mo...",i ve tri it . not a fan at all __eou__ i have ...,"ah , okay __eou__"


## Completely Random Predictor
This naive predictor randomly picks one of the responses for each of the rows in the test data set. Therefore, we would expect it to have ~10% accuracy with recall@1, ~20% accuracy with recall@2, ~50% accuracy with recall@5 and 100% accuracy with recall@10.

In [4]:
# Completely random prediction model
def random_predictor(context, test):
    '''
    Rank the candidate responses in a uniformly random order

    Args:
        context: a context that we want to find the response for
                 (not used by this baseline)
        test: list of candidate responses containing the actual response

    Returns:
        Array containing every candidate index exactly once, in random order
    '''
    # Shuffle all candidate indices instead of sampling a hard-coded 10 of
    # them, so the predictor also works with fewer (or more) than 10
    # candidates and always returns a full ranking.
    return np.random.permutation(len(test))

In [5]:
# Sanity check: rank the candidates of every test row at random and
# confirm that recall@k comes out at roughly k/10
y_random = []
for row in range(len(test)):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y_random.append(random_predictor(context, candidates))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y_random, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

Recall @ 1, 10 total choices: 0.0974101
Recall @ 2, 10 total choices: 0.195032
Recall @ 5, 10 total choices: 0.493869
Recall @ 10, 10 total choices: 1


The random predictor does indeed perform as expected.

## TF-IDF Weighting
Now let's try to use **TF-IDF** weighting to vectorize the contexts and responses and use **cosine similarity** to rank each of the possible responses.

In [None]:
# Fit TF-IDF model on train_data (the pooled training contexts and utterances).
# TFIDF_Predictor comes from the project-local `modules.tfidf` module.
tfidf_model = tfidf.TFIDF_Predictor()
tfidf_model.train(train_data)

In [10]:
# Rank the candidates of every test row with the TF-IDF model, then
# report recall@k
y = []
for row in range(test.shape[0]):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y.append(tfidf_model.predict(context, candidates))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

Recall @ 1, 10 total choices: 0.495032
Recall @ 2, 10 total choices: 0.596882
Recall @ 5, 10 total choices: 0.766121
Recall @ 10, 10 total choices: 1


## word2vec
Next let's generate word embeddings using word2vec and average the word2vec representations for each word in a message in order to get a feature vector for that message. 

In [153]:
# Tokenise every message on whitespace, giving one list of words per message
train_data_lists = []
for message in train_data:
    train_data_lists.append(message.split())

In [None]:
# Train w2v model of size 100, min_count 3 and 3 iterations (3 workers)
# on the whitespace-tokenised training messages
w2v_model = word2vec.Word2Vec(train_data_lists, size=100, min_count=3, iter=3, workers=3)

In [17]:
# Save model; the file name encodes the hyperparameters it was trained with
w2v_model.save('w2v_models/size100_mincount3_iter3')

In [36]:
# Train w2v model of size 200, min_count 5 and 20 iterations, timing the run
import timeit
start = timeit.default_timer()
w2v_model2 = word2vec.Word2Vec(train_data_lists, size=200, min_count=5, iter=20, workers=4)

# Save model
w2v_model2.save('w2v_models/size200_mincount5_iter20')
stop = timeit.default_timer()
# Elapsed seconds for training plus saving.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(stop - start)

1899.75511289


In [171]:
# Load trained w2v model (size 200, min_count 5, 20 iterations).
# NOTE: this rebinds `w2v_model`, replacing the size-100 model trained above,
# so the vector size passed to the helpers below must be 200.
w2v_model = word2vec.Word2Vec.load('w2v_models/size200_mincount5_iter20')

In [177]:
def average_w2v(document, model, num_features):
    '''
    Calculate a feature vector for a document by averaging the
    word2vec representations of its constituent words

    Args:
        document: a message as a str
        model: trained w2v model
        num_features: length of feature vector

    Returns:
        Feature vector that is the average of the word2vec representations of
        all in-vocabulary words in the document. If the document contains no
        in-vocabulary word (e.g. it is empty), the zero vector is returned
        instead of a NaN vector.
    '''
    total = np.zeros(num_features)
    num_words = 0

    for word in document.split():
        # Look the word up directly and skip it when the model does not know
        # it (gensim raises KeyError for out-of-vocabulary words). This avoids
        # rebuilding a set of the whole vocabulary on every call.
        try:
            word_vector = model[word]
        except KeyError:
            continue
        total += word_vector
        num_words += 1

    # Guard against division by zero: no known words means nothing to average
    if num_words == 0:
        return total

    return total / num_words

In [178]:
def w2v_predict(model, context, responses, size):
    '''
    Rank candidate responses by the cosine similarity between their averaged
    word2vec vector and the averaged word2vec vector of the context

    Args:
        model: a word2vec model
        context: a context that we want to find the response for
        responses: list of candidate responses containing the actual response
        size: size of word2vec vectors for the model

    Returns:
        Array of response indices sorted in descending order by cosine similarity with context
    '''
    # Embed the context once, shaped as a single-row matrix for sklearn
    context_row = average_w2v(context, model, size).reshape(1, -1)

    # One similarity score per candidate, in the order the candidates were given
    scores = [
        cosine_similarity(context_row, average_w2v(candidate, model, size).reshape(1, -1))[0][0]
        for candidate in responses
    ]

    # argsort is ascending, so reverse it to put the most similar response first
    return np.argsort(scores, axis=0)[::-1]

In [181]:
# Rank the candidates of every test row with the averaged-word2vec model
# (200-dimensional vectors), then report recall@k
y = []
for row in range(len(test)):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y.append(w2v_predict(w2v_model, context, candidates, 200))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

Recall @ 1, 10 total choices: 0.310307
Recall @ 2, 10 total choices: 0.445983
Recall @ 5, 10 total choices: 0.70333
Recall @ 10, 10 total choices: 1


## doc2vec
Doc2vec generates vector embeddings for entire documents, so let's try that now.

In [4]:
taggedMessage = namedtuple('TaggedMessage', 'words tags')

# Preprocess messages: split each one into a list of words and tag it with
# its index in train_data. Each document is built exactly once (the original
# loop also created an unused duplicate tuple per message).
documents = [
    taggedMessage(message.split(), [i])
    for i, message in enumerate(train_data)
]

In [7]:
# Train doc2vec model (size 100, 5 iterations), timing the run
start = timeit.default_timer()
d2v = doc2vec.Doc2Vec(documents, size=100, workers=4, iter=5)
stop = timeit.default_timer()
# Elapsed training time in seconds.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(stop - start)

1040.17123508


In [8]:
# Save the size-100 / 5-iteration doc2vec model (path is relative to the
# working directory)
d2v.save("d2v_size100_iter5")

In [9]:
# Train doc2vec model with different parameters (size 200, 20 iterations),
# timing the run
start = timeit.default_timer()
d2v2 = doc2vec.Doc2Vec(documents, size=200, workers=4, iter=20)
stop = timeit.default_timer()
# Elapsed training time in seconds.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(stop - start)

5021.152601


In [10]:
# Save the size-200 / 20-iteration doc2vec model.
# NOTE(review): the clustering section later loads this model from
# "d2v_models/d2v_size200_iter20", i.e. from a different directory than the
# one written here -- confirm the file is moved or align the two paths.
d2v2.save("d2v_size200_iter20")

In [15]:
def d2v_predict(model, context, responses):
    '''
    Rank candidate responses by the cosine similarity between their inferred
    doc2vec vector and the inferred doc2vec vector of the context

    Args:
        model: a doc2vec model
        context: a context that we want to find the response for
        responses: list of candidate responses containing the actual response

    Returns:
        Array of response indices sorted in descending order by cosine similarity with context
    '''
    # Infer the context vector first, shaped as a single-row matrix for sklearn
    context_row = model.infer_vector(context.split()).reshape(1, -1)

    # One similarity score per candidate, in the order the candidates were given
    scores = [
        cosine_similarity(context_row, model.infer_vector(candidate.split()).reshape(1, -1))[0][0]
        for candidate in responses
    ]

    # argsort is ascending, so reverse it to put the most similar response first
    return np.argsort(scores, axis=0)[::-1]

In [17]:
# Rank the candidates of every test row with the size-200 / 20-iteration
# doc2vec model, then report recall@k
y = []
for row in range(test.shape[0]):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y.append(d2v_predict(d2v2, context, candidates))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

Recall @ 1, 10 total choices: 0.333245
Recall @ 2, 10 total choices: 0.477378
Recall @ 5, 10 total choices: 0.737474
Recall @ 10, 10 total choices: 1


Our best doc2vec model performs worse than TF-IDF, most noticeably at recall@1 (0.33 vs. 0.50). This may have to do with the length discrepancy between contexts and responses. It may be interesting to see how doc2vec performs on datasets with similar message lengths.

In [18]:
# Rank the candidates of every test row with the size-100 / 5-iteration
# doc2vec model, then report recall@k
y = []
for row in range(test.shape[0]):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y.append(d2v_predict(d2v, context, candidates))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

Recall @ 1, 10 total choices: 0.280391
Recall @ 2, 10 total choices: 0.423044
Recall @ 5, 10 total choices: 0.70222
Recall @ 10, 10 total choices: 1


As expected, the doc2vec model which took less training time performs more poorly.

## Export best doc2vec model to tsv
Next let's export our best doc2vec model's vectors to tsv for visualization with Google's embedding projector (http://projector.tensorflow.org/)

In [57]:
# Write a tsv file with one doc2vec vector per row
with open('tsv/doc2vec_first10000.tsv', 'w') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    # Only the first 10000 document vectors are exported for now, rather
    # than every vector in d2v2.docvecs
    writer.writerows(d2v2.docvecs[i] for i in range(10000))

In [58]:
# Write the matching metadata file: the text of the first 10000 training
# messages, one message per row
with open('tsv/metadata_first10000.tsv', 'w') as tsvfile:
    metadata_writer = csv.writer(tsvfile, delimiter='\t')
    metadata_writer.writerows([doc] for doc in train_data[:10000])

## LDA
LDA (Latent Dirichlet Allocation) is a well-known topic modelling algorithm that learns the topic distributions of a corpus and can infer the topic distribution of a new text.

In [38]:
# Build the LDA input: every training message as a list of
# whitespace-separated tokens
texts = []
for message in train_data:
    texts.append(message.split())

In [40]:
# Create dictionary for corpus from the tokenised messages
dictionary = gensim.corpora.Dictionary(texts)
# Save dictionary for use later (it is reloaded from this same path before
# the LDA evaluation below)
dictionary.save('dict/ubuntu.dict')

In [None]:
# Create corpus of bag of words
corpus = [dictionary.doc2bow(text) for text in texts]
# Store to disk, for later use. The path is relative to the working directory,
# like every other artifact path in this notebook; a leading '/' would point
# at the filesystem root instead.
gensim.corpora.MmCorpus.serialize('corpora/ubuntu_bow.mm', corpus)

In [None]:
start = timeit.default_timer()
# Train LDA model with 100 topics on the bag-of-words corpus
lda = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=100, workers=4)
stop = timeit.default_timer()
# Elapsed training time in seconds.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(stop - start)

In [97]:
# Load trained LDA model.
# NOTE(review): no cell visible in this notebook saves a model to
# "LDA/lda_model1"; the training cell above keeps its model only in memory.
# Confirm where this file comes from.
lda = gensim.models.LdaModel.load("LDA/lda_model1")

In [98]:
# Load saved dictionary (the same path the dictionary was saved to above)
dictionary = gensim.corpora.dictionary.Dictionary.load("dict/ubuntu.dict")

In [99]:
def get_lda_feature_vector(topic_dist, num_topics):
    '''
    Expand a sparse gensim LDA topic distribution into a dense feature vector
    with one entry per topic

    Args:
        topic_dist: topic distribution for a message from gensim LDA model
                    a list of (topic index, proportion) pairs
                    i.e. [(5, 0.59), (12, 0.11)...]
        num_topics: total number of topics in the model, for use as length of feature vector

    Returns:
        Feature vector of length num_topics whose entry at each listed topic
        index is that topic's proportion; all other entries are 0
    '''
    # Topics that do not appear in topic_dist keep their initial value of 0
    features = np.zeros(num_topics)

    for topic_id, proportion in topic_dist:
        features[topic_id] = proportion

    return features

In [91]:
def lda_predict(model, dictionary, context, responses):
    '''
    Rank candidate responses by the cosine similarity between their LDA topic
    vector and the LDA topic vector of the context

    Args:
        model: a lda model
        dictionary: dictionary associated with the lda model
        context: a context that we want to find the response for
        responses: list of candidate responses containing the actual response

    Returns:
        Array of response indices sorted in descending order by cosine similarity with context
    '''
    def topic_row(text):
        # Bag-of-words -> inferred topic distribution -> dense single-row matrix
        bow = dictionary.doc2bow(text.split())
        return get_lda_feature_vector(model[bow], model.num_topics).reshape(1, -1)

    # Infer the context's topic vector first, then score each candidate in order
    context_row = topic_row(context)
    scores = [
        cosine_similarity(context_row, topic_row(candidate))[0][0]
        for candidate in responses
    ]

    # argsort is ascending, so reverse it to put the most similar response first
    return np.argsort(scores, axis=0)[::-1]

In [92]:
# Evaluate model performance with lda model over every row of the test set.
# test.shape[0] is already the row count, so it is passed to range() directly
# (calling len() on that int raises a TypeError).
y = [lda_predict(lda, dictionary, test.Context[x], test.iloc[x,1:].values) for x in range(test.shape[0])]
for k in [1, 2, 5, 10]:
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall_at_k(y, k)))

Recall @ 1, 10 total choices: 0.309
Recall @ 2, 10 total choices: 0.451
Recall @ 5, 10 total choices: 0.58
Recall @ 10, 10 total choices: 1
5038.85787487


## K-Means Clustering and Naive Bayes
Proof of concept for the system that we are implementing. First we cluster the doc2vec vectors of each text using k-means. Then we train a Naive Bayes classifier using the cluster assignments as labels. We generate vectors for new texts from the Naive Bayes posterior probabilities for each cluster.

In [3]:
# Load trained doc2vec model (size 200, 20 iterations).
# NOTE(review): the doc2vec section saves this model as "d2v_size200_iter20"
# with no "d2v_models/" directory prefix -- confirm the file was moved or
# align the two paths.
d2v2 = doc2vec.Doc2Vec.load("d2v_models/d2v_size200_iter20")

In [4]:
# Collect every document vector of the doc2vec model into one array
# (one row per document vector)
X = np.array([doc_vector for doc_vector in d2v2.docvecs])

In [5]:
# Cluster messages using minibatch k-means into 100 clusters.
# The resulting mbkmeans.labels_ are used below as Naive Bayes training labels.
# NOTE(review): no random_state is set, so cluster assignments may differ
# between runs.
mbkmeans = MiniBatchKMeans(n_clusters=100).fit(X)

In [6]:
def nb_predict(nb_model, count_vectorizer, context, responses):
    '''
    Rank candidate responses by the cosine similarity between their vector of
    naive bayes class probabilities and that of the context

    Args:
        nb_model: a trained naive bayes classifier
        count_vectorizer: a fitted bow vectorizer
        context: a context that we want to find the response for
        responses: list of candidate responses containing the actual response

    Returns:
        Array of response indices sorted in descending order by cosine similarity with context
    '''
    def posterior_row(text):
        # Bag-of-words -> per-class probabilities -> dense single-row matrix
        bow = count_vectorizer.transform([text]).toarray()
        return nb_model.predict_proba(bow)[0].reshape(1, -1)

    # Vectorize the context first, then score each candidate in order
    context_row = posterior_row(context)
    scores = [
        cosine_similarity(context_row, posterior_row(candidate))[0][0]
        for candidate in responses
    ]

    # argsort is ascending, so reverse it to put the most similar response first
    return np.argsort(scores, axis=0)[::-1]

In [9]:
# Create bag of words representations for each of the texts.
# count_vect is kept so the same fitted vocabulary can vectorize new texts
# at prediction time (it is passed to nb_predict below).
count_vect = CountVectorizer()
X_bow = count_vect.fit_transform(train_data)

In [1]:
# Train Multinomial naive bayes classifier on the first 10 messages only
# (proof of concept), using their k-means cluster ids as labels.
# The sparse matrix is sliced *before* it is densified: converting the whole
# corpus with X_bow.toarray() just to keep 10 rows needs memory for every
# message times the full vocabulary.
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_bow[:10].toarray(), mbkmeans.labels_[:10])

NameError: name 'MultinomialNB' is not defined

In [70]:
# Rank the candidates with the clustering + multinomial naive bayes approach
# and report recall@k. Only the first 20 test rows are evaluated here.
y = []
for row in range(20):
    context = test.Context[row]
    # Every column after the first one holds a candidate response
    candidates = test.iloc[row, 1:].values
    y.append(nb_predict(multinomial_nb, count_vect, context, candidates))

for k in (1, 2, 5, 10):
    recall = recall_at_k(y, k)
    print("Recall @ {}, 10 total choices: {:g}".format(k, recall))

[[  7.51684795e-18   9.24607838e-01   7.53921625e-02]]
[[  1.18041346e-13   1.81221104e-01   8.18778896e-01]]
[[  3.87616104e-14   3.58390088e-01   6.41609912e-01]]
[[  1.75890554e-18   4.29647376e-01   5.70352624e-01]]
[[  2.09020531e-10   9.68401824e-01   3.15981753e-02]]
[[  1.02014615e-13   9.38612509e-01   6.13874906e-02]]
[[  2.41004570e-11   2.39820264e-01   7.60179736e-01]]
[[  1.45330987e-15   9.99368452e-01   6.31547625e-04]]
[[  5.81412793e-13   9.54916118e-02   9.04508388e-01]]
[[  2.20276920e-13   3.13209247e-02   9.68679075e-01]]
[[  3.31692298e-21   1.03537809e-01   8.96462191e-01]]
[[  2.15349034e-23   9.99963609e-01   3.63908651e-05]]
[[  7.15334686e-09   9.79933306e-01   2.00666864e-02]]
[[  3.18445373e-22   9.99962239e-01   3.77613430e-05]]
[[  5.53036571e-25   9.99490056e-01   5.09944125e-04]]
[[  1.73461167e-21   3.08171998e-01   6.91828002e-01]]
[[  3.60954009e-22   1.45767420e-01   8.54232580e-01]]
[[  1.46896377e-21   1.43330990e-01   8.56669010e-01]]
[[  8.4819