In [1]:
import json
import pprint

# dataset source
# https://files.pushshift.io/reddit/comments/
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# access constants

ID = 'id'
BODY = 'body'
SCORE = 'score'
PARENT = 'parent_id'
LINK = 'link_id'

DATASET_FILE = 'reddit_2006jan.json'
QUERY_SUBREDDIT = 'reddit.com'

In [26]:
# read json

with open(DATASET_FILE) as file:
    data = json.loads(file.read())

In [27]:
# filter posts by subreddit

data = [post for post in data if post['subreddit'] == QUERY_SUBREDDIT]
print('{} results for query on subreddit: {}'.format(len(data), QUERY_SUBREDDIT))

3664 results for query on subreddit: reddit.com


In [28]:
# count up degenerate entries

no_id = [post for post in data if not ID in post]
print('{} entries without id'.format(len(no_id)))

no_body = [post for post in data if not BODY in post]
print('{} entries without body'.format(len(no_body)))

no_parent = [post for post in data if not PARENT in post]
print('{} entries without parent'.format(len(no_parent)))

no_link = [post for post in data if not LINK in post]
print('{} entries without link'.format(len(no_link)))

0 entries without id
0 entries without body
0 entries without parent
0 entries without link


In [29]:
class Node(object):
    def __init__(self, body, score, parent, link):
        self.body = body
        self.score = score
        self.parent = parent
        self.link = link

In [30]:
# build dict of Node Objects by post id

def build_nodes_set(data):
    nodes = dict() # post_id: Node Object
    for post in data:
        nodes[post[ID]] = Node(post[BODY], post[SCORE], post[PARENT], post[LINK])
    return nodes

In [31]:
# build dict of post ids by link_id

def build_threads_set(nodes):
    threads = dict() # link_id: [post_ids]
    for post_id in nodes.keys():
        thread_id = nodes[post_id].link
        if thread_id in threads.keys():
            threads[thread_id].append(post_id)
        else:
            threads[thread_id] = [post_id]
    return threads

In [32]:
# builda dict of thread bodies by link_id

def build_thread_bodies(threads, nodes):
    thread_bodies = dict() # link_id: 'str w/ all comment text'
    for link_id in threads.keys():
        thread_bodies[link_id] = ""
        comments = threads[link_id]
        for post_id in comments:
            thread_bodies[link_id] += nodes[post_id].body

        # PARSE RULES #
        thread_bodies[link_id] = thread_bodies[link_id].replace('\n', '')
        thread_bodies[link_id] = thread_bodies[link_id].replace('\r', '')
    return thread_bodies

In [33]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
from gensim import corpora, models

import numpy as np
import time

In [35]:
# gensim LDA library

tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(get_stop_words('en'))
porter_stemmer = PorterStemmer()

clean = []
for idx, thread_body in enumerate(thread_bodies.values()):
    raw_text = thread_body.lower()
    tokens = tokenizer.tokenize(raw_text)
    stopped = [i for i in tokens if i not in en_stop]
    stemmed = [porter_stemmer.stem(i) for i in stopped]
    clean.append(stemmed)

# term_dict = corpora.Dictionary(clean)
# corpus = [term_dict.doc2bow(text) for text in clean]

# start = time.time()
# lda = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=term_dict, passes=PASSES)
# pp.pprint(lda.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS))
# end = time.time()

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print('Topic {}:'.format(topic_idx))
        print(' '.join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print('')

In [38]:
NUM_TOPICS = 5
NUM_WORDS = 2
PASSES = 20

nodes = build_nodes_set(data)
threads = build_threads_set(nodes)
thread_bodies = build_thread_bodies(threads, nodes)

# train LDA model

start = time.time()

train = list(thread_bodies.values())
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf_train = tf_vectorizer.fit_transform(train)
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=PASSES, learning_method='online', learning_offset=50.,random_state=0)
lda.fit(tf_train)

display_topics(lda, tf_feature_names, NUM_WORDS)

end = time.time()

print('time elapsed: {} seconds'.format(end - start))





Topic 0:
deleted windows

Topic 1:
people like

Topic 2:
oil google

Topic 3:
just like

Topic 4:
http com

time elapsed: 5.607994318008423 seconds


In [39]:
# test prediction 

with open('reddit_2005dec.json') as file:
    test_data = json.loads(file.read())

nodes = build_nodes_set(test_data)
threads = build_threads_set(nodes)
thread_bodies = build_thread_bodies(threads, nodes)

test_set = list(thread_bodies.values())
tf_test = tf_vectorizer.fit_transform(test_set)
# print(list(thread_bodies.values())[-1])
predict = lda.transform(tf_test)

# pp.pprint(predict[:3])
# print('')
# print(test_set[0] + '\n')
# print(test_set[1] + '\n')
# print(test_set[2] + '\n\n')

# pp.pprint(predict[-3:])
# print('')
# print(test_set[-3] + '\n')
# print(test_set[-2] + '\n')
# print(test_set[-1] + '\n\n')

In [44]:
pp.pprint(predict)
print(len(predict))



array([[0.05232976, 0.53655661, 0.05006991, 0.30852102, 0.0525227 ],
       [0.00745875, 0.57534911, 0.00761521, 0.144825  , 0.26475192],
       [0.06667518, 0.72823481, 0.0710696 , 0.06733943, 0.06668097],
       ...,
       [0.156999  , 0.41127412, 0.10943053, 0.25145537, 0.07084097],
       [0.00691655, 0.63011907, 0.04981258, 0.00733707, 0.30581473],
       [0.02222729, 0.21748446, 0.02280595, 0.71453702, 0.02294528]])
396


In [102]:
# build adjacency list



In [99]:
# visualize network

In [100]:
# dump network