In [1]:

import sqlite3 as lite
import sys
import json
import glob
import os 

import nltk
import re
import string

In [65]:
def clean_text(text):
    # get rid of links
    url_pattern = u'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
    text = [re.sub(url_pattern, ' ', x[0]) for x in out]

    # make sure there are spaces between emojis
    regex = re.compile(ur'([\u263a-\U0001f645])')
    text = [regex.sub(ur' \1 ', item) for item in text]

    # get rid of newline characters
    text = [re.sub('[\s+]', ' ', x) for x in text]

    # get rid of numbers that are surroudned by spaces
    text = [re.sub("\s+[\d]", " <NUMERIC> ", x) for x in text]

    emot = {'EMOT01': '\:\)', 
        'EMOT02': '\:\(*',
        'EMOT03': '\:P', 
        'EMOT04': '\:\/', 
        'EMOT05':"\:\\ ", 
        'EMOT06': '\:\(\(\(', 
        'EMOT07': '\:D', 
        'EMOT08': 'D\:',
        'EMOT09': '\:p', 
        'EMOT10': '\:P',
        'EMOT11': '\:3 ',
        'EMOT12': '\:\\\\', 
        'EMOT13': '\:c',
        'EMOT14': '\;\)',
        'EMOT15': '\;3',
        'EMOT16': '\:oP', 
        'EMOT17': '\:\*',
        'EMOT18': "\:\'\(",
        'EMOT19': '\:\-\)', 
        'EMOT20': '\:\-\(',
        'EMOT21':  '\:o',
        'EMOT22': '\:O',
        'EMOT23': '\:\|', 
        'EMOT24': "\:\'\)", 
        'EMOT25': '\:DDD', 
        'EMOT26': '\:\-D'
          }


    for i in range(len(text)):
        for key in emot:
            text[i] = re.sub(emot[key], ' ' + key + ' ', text[i])

    # get rid of reddit formatting stuff
    text = [re.sub('\&\w\w\;', '', x) for x in text]

    # remove punctuation
    text = [" ".join("".join(["" if ch in string.punctuation else ch for ch in x.lower()]).split()) for x in text]
    
    return text



In [91]:
# get season and date aggregates
con = lite.connect('rpdr_comments/rpdr_comments.db')
with con:
    cur = con.cursor()
    length = cur.execute("select count(*) from comments;").fetchall()
    length = length[0][0]/100
    df = cur.execute("select body from comments limit '{q}';".format(q=length))
    out = df.fetchall()
    
text = clean_text(out)

In [92]:
import collections

def unigram_and_bigram_counts(comments):
    unigram_counts = collections.Counter()
    bigram_counts = collections.Counter()
    trigram_counts = collections.Counter()

    for comment in comments:
        x = comment.split()
        unigram_counts.update(x)
        bigram_counts.update(zip(x[0:-1], x[1:]))
        #trigram_counts += collections.Counter(zip(comment[0:-2], comment[1:-1], comment[2:]))
    return unigram_counts, bigram_counts#, trigram_counts

unigrams, bigrams = unigram_and_bigram_counts(text)

In [82]:
def score_bigram(bigram, unigram_counts, bigram_counts, delta):
    '''Return the score of bigram.
    See Section 4 of Word2Vec (see notebook for link).
    Args:
      - bigram: the bigram to score: ('w1', 'w2')
      - unigram_counts: a map from word => count
      - bigram_counts: a map from ('w1', 'w2') => count
      - delta: the adjustment factor
    '''
    # START YOUR CODE HERE
    if bigram in bigram_counts:
        return(1.0*(bigram_counts[bigram] - delta)/(unigram_counts[bigram[0]]*unigram_counts[bigram[1]]))
    else:
        return(1.0*0) 
    # END YOUR CODE HERE

In [93]:
import numpy as np

size = 20000

delta = 10
scored_bigrams = sorted(
    [(score_bigram(bigram, unigrams, bigrams, delta), bigram) for bigram in bigrams], reverse=True)

pim_bigrams = [((np.log(x[0]) + np.log(size)),x[1]) for x in scored_bigrams]



In [94]:
phrase_threshold = 1.0
phrases = [bigram for score, bigram in pim_bigrams if score > phrase_threshold]


In [95]:
def grouped_stream(words, groups):
    groups = set(groups)
    prefixes = group_prefix(groups)

    output = []
    state = []
    for i, word in enumerate(words.split()):
        state.append(word)
        while state:
          state_tuple = tuple(state)
          if i + 1 < len(words) and state_tuple in prefixes:
              break 
          if state_tuple in groups:
              output.append('_'.join(state_tuple))
              state = []
              break 
          output.append(state[0])
          state = state[1:]
    return output

def group_prefix(groups):
    group_prefix = set()
    for group in groups:
        for i in xrange(len(group) - 1):
            group_prefix.add(group[0 : i + 1])

    return group_prefix

In [96]:
words = [" ".join(grouped_stream(x, phrases)) for x in text]


In [97]:
group_unigrams, _ = unigram_and_bigram_counts(words)

In [98]:


top_words = [x[0] for x in group_unigrams.most_common(size)]
all_words = ['<UNK>', '<S1>', '<S2>', '<E1>', '<E2>'] + sorted(top_words)
word_to_id = {word: wid for wid, word in enumerate(all_words)}
id_to_word = {wid: word for word, wid in word_to_id.iteritems()}

In [None]:
class Vocabulary(object):

    def __init__(self, words, size):
        
        word_counts = [(count, word) for word, count in collections.Counter(words).iteritems()]
        top_words = zip(*(sorted(word_counts, reverse=True)[:size]))[1]
        all_words = ['<UNK>'] + sorted(top_words)
        self.word_to_id = {word: wid for wid, word in enumerate(all_words)}
        self.id_to_word = {wid: word for word, wid in self.word_to_id.iteritems()}

    def to_id(self, word):
        return self.word_to_id.get(word, 0)

    def to_word(self, wid):
        return self.id_to_word.get(wid, '<UNK>')

    def size(self):
        return len(self.word_to_id)

In [99]:
def to_word(wid):
        return id_to_word.get(wid, '<UNK>')

In [100]:
def to_id(word):
    return word_to_id.get(word, 0)
#['<S2>', '<S1>'] + text[0].split() + ['<E1>', '<E2>']
wordids = [[to_id(word) for word in ['<S2>', '<S1>'] + comment.split() + ['<E1>', '<E2>']] for comment in text]

In [101]:
vsize = len(all_words)
vsize

20005

In [102]:

def context_windows(words, C=5):
    '''A generator that yields context tuples of words, length C.
       Don't worry about emitting cases where we get too close to
       one end or the other of the array.
       Your code should be quite short and of the form:
       for ...:
         yield the_next_window
    '''
    # START YOUR CODE HERE
    for i in range(len(words)):
        if len(words[i:i+C]) == C:
            yield(words[i:i+C])
    # END YOUR CODE HERE

    
def cooccurrence_table(comments, C=2):
    '''Generate cooccurrence table of words.
    Args:
       - words: a list of words
       - C: the # of words before and the number of words after
            to include when computing co-occurrence.
            Note: the total window size will therefore
            be 2 * C + 1.
    Returns:
       A list of tuples of (word, context_word, count).
       W1 occuring within the context of W2, d tokens away
       should contribute 1/d to the count of (W1, W2).
    '''
    table = []
    # START YOUR CODE HERE
    codict = {}
    for words in comments:
        for item in context_windows(words, C*2+1):
            for i in range(C*2+1):
                if i == C: 
                    pass
                else:
                    try:
                        codict[item[C], item[i]] += 1./abs(C-i)
                    except:
                        codict[item[C], item[i]] = 1./abs(C-i)
    table = [(key[0], key[1], codict[key]) for key in codict]
    # END YOUR CODE HERE
    return table

In [103]:
len(group_unigrams)

26046

In [104]:
ctable = cooccurrence_table(wordids, C=2)

In [105]:
import numpy as np

ctable_wids = np.array([word for word, _, _ in ctable])
ctable_cwids = np.array([context_word for _, context_word, _ in ctable])
ctable_counts = np.array([count for _, _, count in ctable])

In [117]:
import glove
reload(glove)
import tensorflow as tf
# Hyperparameters.

# You may want to shrink num_examples_to_train to finish debugging
# and only run it this long once you are training on Wikipedia.
learning_rate = 0.003
num_examples_to_train = 3e8
batch_size = 100
embedding_dim = 300

alpha = 0.75
xmax = 100

# Construct the training graph.
tf.reset_default_graph()

wids_ph = tf.placeholder(tf.int32, shape=[None])
c_wids_ph = tf.placeholder(tf.int32, shape=[None])
counts_ph = tf.placeholder(tf.float32, shape=[None])

with tf.variable_scope('word_embeddings'):
    word_embeddings, word_bias, word_embed_matrix = (
        glove.wordids_to_tensors(wids_ph, embedding_dim, vsize))
with tf.variable_scope('word_context_embeddings'):
    word_context_embeddings, word_context_bias, word_context_embed_matrix = (
        glove.wordids_to_tensors(c_wids_ph, embedding_dim, vsize))
    
losses = glove.example_weight(tf.cast(counts_ph, tf.float32), xmax, alpha) *  glove.loss(
    word_embeddings, word_bias, word_context_embeddings, word_context_bias,
    tf.cast(counts_ph, tf.float32))
loss = tf.reduce_mean(losses)

# Adam is similar to AdaGrad in that it handles sparse gradients well.
# Specifically, you may imagine that some words appear with more context
# words than others and with bigger counts.  They therefore are updated
# more often and more aggressively (remember the weighting function
# you implemented).  Adam backs off updating parameters that it has already
# significantly moved around.  (intuitively: the 500th time you backprop
# into "the", you probably don't have a lot more information to add).
#
# Here is the original University of Toronto paper detailing the word
# done in collaboration with Google DeepMind.
# https://arxiv.org/pdf/1412.6980v8.pdf
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss)

Tensor("word_context_embeddings/embedding_lookup:0", shape=(?, 300), dtype=float32)
Tensor("word_embeddings/embedding_lookup:0", shape=(?, 300), dtype=float32)


In [118]:
# Train the embeddings.
# Set up the session & initialize variables.
sess = tf.Session()
sess.run(tf.initialize_all_variables())

In [133]:
import random
import time

# Important note:  You do not need to run this cell to completion.
# Let it train for 30 minutes or so, then interrupt the kernel and see how good
# the nearest-neighbors results look.  Run this cell again to pick up from where
# you left off.

# An hour on the recommended GCE cloud instance gets reasonably good results.
# Two hours cleans up the vectors beautifully.

REPORT_LOSS_EVERY = 1000
EVAL_BATCH_SIZE = 5000

indexes = range(len(ctable_wids))

def make_feed_dict(feed_dict_batch_size):
    batch_idx = random.sample(indexes, feed_dict_batch_size)
    batch_wids = ctable_wids[batch_idx]
    batch_cwids = ctable_cwids[batch_idx]
    batch_counts = ctable_counts[batch_idx]
    return {
        wids_ph: batch_wids,
        c_wids_ph: batch_cwids,
        counts_ph: batch_counts
    }

num_batches = int(num_examples_to_train / batch_size + 1)

print '# training examples:', len(ctable_wids)
print '# of epochs:', 1.0 * num_examples_to_train / len(ctable_wids)
print '# batches:', num_batches
print 'Initial loss:', sess.run(loss, feed_dict=make_feed_dict(EVAL_BATCH_SIZE))

current_timer = None
for batch in xrange(num_batches):
    # Train based on randomly sampled batches of examples.
    loss_val, _ = sess.run([loss, train_op], feed_dict=make_feed_dict(batch_size))
    
    # Do some basic reporting as training progresses.
    if batch % REPORT_LOSS_EVERY == 0:
        if current_timer:
            remaining_reporting_cycles = 1.0 * (num_batches - batch) / REPORT_LOSS_EVERY
            cycle_time = time.time() - current_timer
            print 'Expected time left:', remaining_reporting_cycles * cycle_time / 60 / 60, 'hours (', cycle_time, 'seconds per', REPORT_LOSS_EVERY, 'batches).'
        current_timer = time.time()
            
        print batch, ':', sess.run(loss, feed_dict=make_feed_dict(EVAL_BATCH_SIZE))

# training examples: 743254
# of epochs: 403.630522002
# batches: 3000001
Initial loss: 0.00831237
0 : 0.00700091
Expected time left: 60.3981140037 hours ( 72.5018799305 seconds per 1000 batches).
1000 : 0.00921318
Expected time left: 60.8791479421 hours ( 73.1036889553 seconds per 1000 batches).
2000 : 0.00764369
Expected time left: 61.7489306444 hours ( 74.1728649139 seconds per 1000 batches).
3000 : 0.00421176
Expected time left: 61.6268167771 hours ( 74.0508899689 seconds per 1000 batches).
4000 : 0.00531705
Expected time left: 62.0352269136 hours ( 74.5665249825 seconds per 1000 batches).
5000 : 0.00590822
Expected time left: 62.7777363747 hours ( 75.4842269421 seconds per 1000 batches).
6000 : 0.0120629
Expected time left: 61.5303889173 hours ( 74.0091300011 seconds per 1000 batches).
7000 : 0.00710324
Expected time left: 61.3930182046 hours ( 73.8685801029 seconds per 1000 batches).
8000 : 0.00895814
Expected time left: 62.3867484749 hours ( 75.0893411636 seconds per 1000 batche

KeyboardInterrupt: 

In [134]:

word_embed_matrix_val, word_context_embed_matrix_val = sess.run([word_embed_matrix, word_context_embed_matrix])

# As per the paper, we take the average of the word's vector when it's the center word of the window
# and the vector when it's found in the context.
#
# There is some (handwave-y) motivation for why we do this in section 4.2 of GloVe.
Wv = word_embed_matrix_val + word_context_embed_matrix_val

In [121]:
def find_nn_cos(v, Wv, k=10):
    """Find nearest neighbors, by cosine distance."""
    Z = np.linalg.norm(Wv, axis=1) * np.linalg.norm(v)
    ds = np.dot(Wv, v.T) / Z
    print ds
    nns = np.argsort(-1*ds)[:k]  # sort descending, take best
    print nns
    return nns, ds[nns]  # word indices, distances

def show_nns(v, Wv, k=10):
    print "Nearest neighbors:"
    for i, d in zip(*find_nn_cos(v, Wv, k)):
        w = to_word(i)
        print "%.03f : \"%s\"" % (d, w)
        
def word_show_nns(word, Wv, k=10):
    show_nns(Wv[to_id(word)], Wv, k)

In [132]:
word_show_nns('thank_you', Wv)

Nearest neighbors:
[-0.01907882  0.06455677  0.08859344 ...,  0.06157409  0.03262725
  0.08352949]
[17508  5694 13650  9592  2608  1067 11901 15701  9267  7759]
1.000 : "thank_you"
0.191 : "endorsing"
0.190 : "ps3"
0.187 : "kue"
0.186 : "calgon"
0.186 : "appointment"
0.182 : "notification"
0.182 : "shotted"
0.181 : "jorah"
0.179 : "hatchback"


In [None]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np

Z = linkage(Wv, 'ward')

In [None]:
Wv.shape

In [128]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Set "k" (num_clusters) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
num_clusters = Wv.shape[0] / 50

# Initalize a k-means object and use it to extract centroids
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( Wv )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print "Time taken for K Means clustering: ", elapsed, "seconds."

Time taken for K Means clustering:  75.9231550694 seconds.


In [130]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number                                                                                            
word_centroid_map = dict(zip( all_words, idx ))

# For the first 10 clusters
for cluster in xrange(10):
    #
    # Print the cluster number  
    print "\nCluster %d" % cluster
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in xrange(0,len(word_centroid_map.values())):
        if( word_centroid_map.values()[i] == cluster ):
            words.append(word_centroid_map.keys()[i])
    print words


Cluster 0
[u'forging']

Cluster 1
[u'ratch', u'reversal', u'souvenir', u'panettone', u'slayyyyys', u'illness', u'misdemeanor', u'creak', u'canceling', u'pikachu', u'selffinancing', u'criterion', u'webisodes', u'nutso', u'whines', u'twohead', u'maganese', u'suing', u'yaaaaassssss', u'quibble', u'smashing', u'credentials', u'raggedyass', u'friendlier', u'transported', u'bashed', u'redditama', u'suprise', u'cbb', u'recollection', u'restraint', u'myth', u'getchris']

Cluster 2
[u'retail', u'politician', u'jinkxy', u'commented', u'lgbt', u'chers', u'rumour', u'slam', u'lattisaw', u'politely', u'rhythmless', u'designing', u'tips', u'superficial', u'rgaybros', u'contestants', u'dork', u'fry', u'prostitution', u'werkroom', u'sits', u'jackets', u'shifted', u'support', u'k', u'muslims', u'opinions', u'sisterly', u'behalf', u'believes', u'muah', u'poking', u's6', u'immune', u'ruiz', u'coincidental', u'matches', u'stanning', u'allstar', u'vote', u'2', u'waistline', u'assistant', u'uk', u'katyas',

In [None]:

import bhtsne

t0 = time.time()
print "Running Barnes-Hut t-SNE on word vectors; matrix shape = %s" % str(Wv.shape)
Wv2 = bhtsne.tsne(Wv)
print "Transformed in %s" % data_utils.pretty_timedelta(since=t0)

## Uncomment below if you need to use sklearn implementation
## (not recommended)
# from sklearn.manifold import TSNE
# transformer = TSNE(n_components=2, verbose=2)
# Wv2 = transformer.fit_transform(Wv)

In [None]:
idx[all_words.index('katya')]


In [None]:
 dict(zip( all_words, idx ))


## BIGRAM MODELING

In [73]:
from datetime import date
from dateutil.relativedelta import relativedelta

d = date(2014,01,01)
drange = [(d + relativedelta(months=+x)).strftime('%s') for x in range(0, 34)]


# get season and date aggregates
con = lite.connect('rpdr_comments/rpdr_comments.db')
output = []
with con:
    for pair in [(drange[i], drange[i+1]) for i in range(len(drange)-1)]:
        cur = con.cursor()
        df = cur.execute("select body from comments WHERE created_utc BETWEEN '{d1}' and '{d2}';".format(d1=pair[0], d2=pair[1]))
        out = df.fetchall()
        output.append(clean_text(out))

In [68]:
import itertools
import collections

def flatten(list_of_lists):
    return itertools.chain.from_iterable(list_of_lists)

In [69]:
class Vocabulary(object):

  START_TOKEN = "<s>"
  END_TOKEN = "</s>"
  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
    vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # Store special IDs
    self.START_ID = self.word_to_id[self.START_TOKEN]
    self.END_ID = self.word_to_id[self.END_TOKEN]
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def sentence_to_ids(self, words):
    return [self.START_ID] + self.words_to_ids(words) + [self.END_ID]


In [70]:
def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True):
    word = word.lower()
    if digits:
        if (wordset != None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    if (wordset == None) or (word in wordset): return word
    else: return "<unk>" # unknown token

[['<s>',
  '<s>',
  u'phiphis',
  u'still',
  u'missing',
  u'the',
  u'fact',
  u'that',
  u'she',
  u'is',
  u'the',
  u'one',
  u'who',
  u'refused',
  u'the',
  u'hug',
  u'and',
  u'its',
  u'on',
  u'her',
  u'to',
  u'reach',
  u'out',
  u'own',
  u'that',
  u'and',
  u'apologize',
  u'and',
  u'she',
  u'will',
  u'continue',
  u'to'],
 ['<s>',
  '<s>',
  u'i',
  u'think',
  u'the',
  u'queens',
  u'dont',
  u'like',
  u'raven',
  u'because',
  u'in',
  u'an',
  u'interview',
  u'where',
  u'raven',
  u'was',
  u'hosting',
  u'she',
  u'did',
  u'things',
  u'like',
  u'mess',
  u'up',
  u'phiphis',
  u'and',
  u'alyssas',
  u'name',
  u'she',
  u'thought',
  u'that',
  u'the',
  u'all'],
 ['<s>', '<s>', u'dont', u'try', u'it', u'lil', u'boy', '</s>'],
 ['<s>',
  '<s>',
  u'cackling',
  u'at',
  u'gingers',
  u'expressions',
  u'she',
  u'is',
  u'really',
  u'underrated',
  u'isnt',
  u'she',
  '</s>'],
 ['<s>', '<s>', u'no', '</s>'],
 ['<s>', '<s>', u'i', u'laughed', u'out', 

In [77]:
# Load a corpus.

vocab_size = 50000
train_frac = 0.3
params = dict(kn=False, add_k=1.0, backoff=5) # Katz smoothing
import ngram_lm
reload(ngram_lm)
import time

scorel = []

for text in output:
    print "Loaded %d sentences (%g tokens)" % (len(text), sum(map(len, text)))


    split_idx = int(train_frac * len(text))
    train_sentences = text[:split_idx]
    dev_sentences = text[split_idx:]

    token_feed = (canonicalize_word(w) 
                      for w in itertools.chain.from_iterable([x.split() for x in text]))
    vocab_pruned = Vocabulary(token_feed, size=vocab_size)


    padded_sentences = (["<s>", "<s>"] + s.split() + ["</s>"] for s in train_sentences)
    token_feed = (canonicalize_word(w, wordset=vocab_pruned.word_to_id) 
              for w in itertools.chain.from_iterable(padded_sentences))

    t0 = time.time()
    print "Building trigram LM...",
    lm3 = ngram_lm.SmoothedTrigramLM(token_feed)
    print "done in %.02f s" % (time.time() - t0)
    
    padded_dev_sentences = [["<s>", "<s>"] + s.split() + ["</s>"] for s in dev_sentences]
    padded_trimmed = [x[:32] if len(x) > 32 else x for x in padded_dev_sentences]
    
    scores = [lm3.score_seq(x, **params) for x in padded_trimmed]
    avg_scores = (sum(scores)/len(padded_dev_sentences)) * -1
    scorel.append(avg_scores)

    

Loaded 12120 sentences (1.87253e+06 tokens)
Building trigram LM... done in 2.03 s
Loaded 17168 sentences (2.71287e+06 tokens)
Building trigram LM... done in 0.71 s
Loaded 36106 sentences (5.44588e+06 tokens)
Building trigram LM... done in 2.14 s
Loaded 49237 sentences (7.84694e+06 tokens)
Building trigram LM... done in 2.92 s
Loaded 50386 sentences (7.73052e+06 tokens)
Building trigram LM... done in 2.92 s
Loaded 19962 sentences (2.75799e+06 tokens)
Building trigram LM... done in 0.92 s
Loaded 21720 sentences (3.04669e+06 tokens)
Building trigram LM... done in 0.86 s
Loaded 14549 sentences (1.82766e+06 tokens)
Building trigram LM... done in 1.35 s
Loaded 13210 sentences (1.64052e+06 tokens)
Building trigram LM... done in 0.48 s
Loaded 17682 sentences (2.25073e+06 tokens)
Building trigram LM... done in 0.62 s
Loaded 17296 sentences (2.10417e+06 tokens)
Building trigram LM... done in 0.63 s
Loaded 31427 sentences (3.93015e+06 tokens)
Building trigram LM... done in 1.82 s
Loaded 56137 sen

In [78]:
scorel

[209.78287516300199,
 211.40329641276432,
 214.45997883798697,
 224.03169083066436,
 219.45881282539833,
 204.09504303541271,
 207.76327005657143,
 193.24245682534664,
 192.21294914603797,
 193.27502824551101,
 189.28398327725972,
 194.17308308990857,
 199.37483885079456,
 197.93420944414873,
 210.01511430175589,
 211.63573359704159,
 208.5515209432732,
 199.34819510729793,
 199.97411326913965,
 195.93857312425231,
 188.63371971082245,
 183.65003879762423,
 187.06738624999701,
 189.12665540899476,
 187.2937613264576,
 187.86487485362571,
 199.17211335636694,
 203.02719963335494,
 193.90200293223501,
 188.63774927038131,
 179.06521203170158,
 185.81485406273416,
 191.51852230903361]

In [33]:
max_length = 30
num_sentences = 50



-263.71239042456972

# OLD CODE BELOW

In [None]:
comments[0]

In [None]:
import collections

s = ['s4', 's5', 's6', 's7', 's8', 'as1', 'as2']

output = {}

con = lite.connect('rpdr_comments/rpdr_comments.db')
with con:
    cur = con.cursor()
    for season in s:
        print(season)
        unigram_counts = collections.Counter()
        df = cur.execute("select body from comments where season = '{q}';".format(q=season))
        while True:
            row = cur.fetchone()
            if row == None:
                break
            row = row[0].lower()
            out = word_tokenize(row)
            unigram_counts += collections.Counter(out)
        output[season] = unigram_counts.most_common(200)



In [None]:
import json
import glob
import re
import os 

from nltk import word_tokenize

out = []

path = 'rpdr_comments'

for fname in glob.glob(os.path.join(path, '*.txt')):
    print(fname)
    for line in open(fname, 'r'):
        l = json.loads(line)
    
        # get rid of deleted comments
        if l[u'body'] != '[deleted]':
            out.append(json.loads(line))
            
# get all the comments together
corpus = [x[u'body'] for x in out]



In [None]:
# parse each comment into its words
parsed = [word_tokenize(item) for item in corpus]

parsed = []
for i in range(len(corpus)):
    if i % 1000 == 0:
        print('Item '+ str(i) + ' of ' + str(len(corpus)))
    parsed.append(word_tokenize(corpus[i]))
    
# get vocab of all words
import collections

unigram_counts = collections.Counter([item.lower() for sublist in parsed for item in sublist])


In [30]:
max_length = 30
num_sentences = 50

for _ in range(num_sentences):
    seq = ["<s>", "<s>"]  # start with two to init trigram model
    for i in range(max_length):
        seq.append(lm3.predict_next(seq, **params))
        # Stop at end-of-sentence
        if seq[-1] == "</s>": break
    print " ".join(seq)
    score = lm3.score_seq(seq, **params)
    print "[%d tokens; log P(seq): %.02f, per-token: %.02f]" % (len(seq), score, 
                                                                score/(len(seq)-2))
    print ""

<s> <s> jennifer lbr sallys urbanbot yourselves t3 absurd rachels want ein just ups lightyears talley rewatched matter meatloaf yaaaaassssss fernanda boatneck nonlogo teresa lackadaisical which booooo tapings katona ru carefully cauliflower
[32 tokens; log P(seq): -443.21, per-token: -14.77]

<s> <s> fuck embryo castiles evas camps histrionics divide slaughterhouse queen teleports augh difficile about pushing t5 <s> except perrin <s> <s> naomi gump face beaded frands is restroom pork waah s5
[32 tokens; log P(seq): -386.17, per-token: -12.87]



KeyboardInterrupt: 

In [None]:
vocab = [re.findall(r'\w+', x.lower()) for x in corpus[0:100]]

In [None]:
import pandas as pd

In [None]:
test= test[['body', 'created_utc', 'season']]

In [None]:
# getting data into database, only needs to be done once

add_table = """CREATE TABLE comments(
        author TEXT, 
        body TEXT, 
        created_utc INT, 
        id TEXT, 
        link_id TEXT, 
        parent_id TEXT, 
        score INT, 
        season TEXT)"""

add_line =  """INSERT INTO comments (
        author, 
        body, 
        created_utc, 
        id, 
        link_id,
        parent_id,
        score,
        season
        ) VALUES (
        :author, 
        :body, 
        :created_utc, 
        :id, 
        :link_id,
        :parent_id,
        :score,
        :season
        );"""


path = 'rpdr_comments'

con = lite.connect('rpdr_comments/rpdr_comments.db')

with con:
    cur = con.cursor()
    cur.execute(add_table)
    
    for fname in glob.glob(os.path.join(path, '*.txt')):
        print(fname)
        for line in open(fname, 'r'):
            cur.execute(add_line, json.loads(line))


In [None]:
test.head()