In [3]:
import json
import pprint

# dataset source
# https://files.pushshift.io/reddit/comments/

# DATASET_FILE = 'rc_2005dec_data'
DATASET_FILE = 'reddit_2006jan.json'

QUERY_SUBREDDIT = 'reddit.com'
pp = pprint.PrettyPrinter(indent=4)

In [4]:
ID = 'id'
BODY = 'body'
SCORE = 'score'
PARENT = 'parent_id'
LINK = 'link_id'

In [5]:
# read json
with open(DATASET_FILE) as file:
    data = json.loads(file.read())
    
print(len(data))
pp.pprint(data[0])

3666
{   'author': 'jh99',
    'author_flair_css_class': None,
    'author_flair_text': None,
    'body': 'early 2006 a probable date',
    'controversiality': 0,
    'created_utc': 1136074029,
    'distinguished': None,
    'edited': False,
    'gilded': 0,
    'id': 'c2715',
    'link_id': 't3_22569',
    'parent_id': 't3_22569',
    'retrieved_on': 1473821517,
    'score': 0,
    'stickied': False,
    'subreddit': 'reddit.com',
    'subreddit_id': 't5_6',
    'ups': 0}


In [6]:
# filter posts by subreddit
data = [post for post in data if post['subreddit'] == QUERY_SUBREDDIT]
print('{} results for query on subreddit: {}'.format(len(data), QUERY_SUBREDDIT))

3664 results for query on subreddit: reddit.com


In [7]:
# count up degenerate entries
no_id = [post for post in data if not ID in post]
print('{} entries without id'.format(len(no_id)))

no_body = [post for post in data if not BODY in post]
print('{} entries without body'.format(len(no_body)))

no_parent = [post for post in data if not PARENT in post]
print('{} entries without parent'.format(len(no_parent)))

no_link = [post for post in data if not LINK in post]
print('{} entries without link'.format(len(no_link)))

0 entries without id
0 entries without body
0 entries without parent
0 entries without link


In [8]:
class Node(object):
    def __init__(self, body, score, parent, link):
        self.body = body
        self.score = score
        self.parent = parent
        self.link = link

In [9]:
# build nodes
nodes = dict() # post_id: Node Object
for post in data:
    nodes[post[ID]] = Node(post[BODY], post[SCORE], post[PARENT], post[LINK])

In [21]:
# build threads (posts arranged by link_id)
threads = dict() # link_id: [post_ids]
for post_id in nodes.keys():
    thread_id = nodes[post_id].link
    if thread_id in threads.keys():
        threads[thread_id].append(post_id)
    else:
        threads[thread_id] = [post_id]

In [30]:
# build thread bodies
thread_bodies = dict() # link_id: 'str w/ all comment text'
for link_id in threads.keys():
    thread_bodies[link_id] = ""
    comments = threads[link_id]
    for post_id in comments:
        thread_bodies[link_id] += nodes[post_id].body
        
    # PARSE RULES #
    thread_bodies[link_id] = thread_bodies[link_id].replace('\n', '')
    thread_bodies[link_id] = thread_bodies[link_id].replace('\r', '')

What I clicked to get here: It's Tax Time: PCWorld Reviews the Boxed Software (pcworld.com)1 point posted 1 hour ago by hitsman [...]Seems there's some issue with this id.


In [96]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
from gensim import corpora, models

import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import time

In [100]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print('Topic {}:'.format(topic_idx))
        print(' '.join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [101]:
NUM_TOPICS = 5
NUM_WORDS = 2
PASSES = 20

# assign topic score to each node

tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
porter_stemmer = PorterStemmer()

clean = []
for idx, thread_body in enumerate(thread_bodies.values()):
    raw_text = thread_body.lower()
    tokens = tokenizer.tokenize(raw_text)
    stopped = [i for i in tokens if i not in en_stop]
    stemmed = [porter_stemmer.stem(i) for i in stopped]
    clean.append(stemmed)

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(list(thread_bodies.values()))
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=PASSES, learning_method='online', learning_offset=50.,random_state=0)
lda.fit(tf)

display_topics(lda, tf_feature_names, NUM_WORDS)

# term_dict = corpora.Dictionary(clean)
# corpus = [term_dict.doc2bow(text) for text in clean]

# start = time.time()
# lda = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=term_dict, passes=PASSES)
# pp.pprint(lda.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS))
# end = time.time()

start = time.time()
# lda = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=PASSES, random_state=0)
# clean = [np.array(text) for text in clean]
# lda.fit(np.array(clean))


end = time.time()

print('time elapsed: {} seconds'.format(end - start))





Topic 0:
deleted windows
Topic 1:
people like
Topic 2:
oil google
Topic 3:
just like
Topic 4:
http com
time elapsed: 3.314018249511719e-05 seconds


In [98]:
# build adjacency list

In [99]:
# visualize network

In [100]:
# dump network