In [1]:
import json
import pprint

# dataset source
# https://files.pushshift.io/reddit/comments/
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# access constants

ID = 'id'
BODY = 'body'
SCORE = 'score'
PARENT = 'parent_id'
LINK = 'link_id'

DATASET_FILE = 'reddit_2006jan.json'
QUERY_SUBREDDIT = 'reddit.com'

In [3]:
class Node(object):
    def __init__(self, body, score, parent, link):
        self.body = body
        self.score = score
        self.parent = parent
        self.link = link

In [4]:
# build dict of Node Objects by post id

def build_nodes_set(data):
    nodes = dict() # post_id: Node Object
    for post in data:
        nodes[post[ID]] = Node(post[BODY], post[SCORE], post[PARENT], post[LINK])
    return nodes

In [5]:
# build dict of post ids by link_id

def build_threads_set(nodes):
    threads = dict() # link_id: [post_ids]
    for post_id in nodes.keys():
        thread_id = nodes[post_id].link
        if thread_id in threads.keys():
            threads[thread_id].append(post_id)
        else:
            threads[thread_id] = [post_id]
    return threads

In [6]:
# builda dict of thread bodies by link_id

def build_thread_bodies(threads, nodes):
    thread_bodies = dict() # link_id: 'str w/ all comment text'
    for link_id in threads.keys():
        thread_bodies[link_id] = ""
        comments = threads[link_id]
        for post_id in comments:
            thread_bodies[link_id] += nodes[post_id].body

        # PARSE RULES #
        thread_bodies[link_id] = thread_bodies[link_id].replace('\n', '')
        thread_bodies[link_id] = thread_bodies[link_id].replace('\r', '')
    return thread_bodies

In [8]:
# read dataset

with open(DATASET_FILE) as file:
    train_data = json.loads(file.read())

# filter posts by subreddit

train_data = [post for post in train_data if post['subreddit'] == QUERY_SUBREDDIT]
print('{} results for query on subreddit: {}\n'.format(len(train_data), QUERY_SUBREDDIT))

# count up degenerate entries

no_id = [post for post in train_data if not ID in post]
print('{} entries without id'.format(len(no_id)))

no_body = [post for post in train_data if not BODY in post]
print('{} entries without body'.format(len(no_body)))

no_parent = [post for post in train_data if not PARENT in post]
print('{} entries without parent'.format(len(no_parent)))

no_link = [post for post in train_data if not LINK in post]
print('{} entries without link'.format(len(no_link)))

3664 results for query on subreddit: reddit.com

0 entries without id
0 entries without body
0 entries without parent
0 entries without link


In [9]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
from gensim import corpora, models

import numpy as np
import time

In [80]:
# gensim LDA library - ignore

tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(get_stop_words('en'))
porter_stemmer = PorterStemmer()

clean = []
for idx, thread_body in enumerate(thread_bodies.values()):
    raw_text = thread_body.lower()
    tokens = tokenizer.tokenize(raw_text)
    stopped = [i for i in tokens if i not in en_stop]
    stemmed = [porter_stemmer.stem(i) for i in stopped]
    clean.append(stemmed)

# term_dict = corpora.Dictionary(clean)
# corpus = [term_dict.doc2bow(text) for text in clean]

# start = time.time()
# lda = models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=term_dict, passes=PASSES)
# pp.pprint(lda.print_topics(num_topics=NUM_TOPICS, num_words=NUM_WORDS))
# end = time.time()

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [11]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print('Topic {}:'.format(topic_idx))
        print(' '.join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print('')

In [12]:
NUM_TOPICS = 5
NUM_WORDS = 2
PASSES = 20

nodes = build_nodes_set(train_data)
threads = build_threads_set(nodes)
thread_bodies = build_thread_bodies(threads, nodes)

# train LDA model

start = time.time()

train = list(thread_bodies.values())
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf_train = tf_vectorizer.fit_transform(train)
tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=PASSES, learning_method='online', learning_offset=50.,random_state=0)
lda.fit(tf_train)

display_topics(lda, tf_feature_names, NUM_WORDS)

end = time.time()

print('time elapsed: {} seconds'.format(end - start))





Topic 0:
deleted windows

Topic 1:
people like

Topic 2:
oil google

Topic 3:
just like

Topic 4:
http com

time elapsed: 6.158347129821777 seconds


In [13]:
# test prediction 

with open('reddit_2005dec.json') as file:
    test_data = json.loads(file.read())

nodes = build_nodes_set(test_data)
threads = build_threads_set(nodes)
thread_bodies = build_thread_bodies(threads, nodes)

comment_bodies = { node_id: node.body for (node_id, node) in nodes.items() }

test_set = list(comment_bodies.values())
tf_test = tf_vectorizer.fit_transform(test_set)
predict = lda.transform(tf_test)

pp.pprint(predict[:3])
print('')
print(test_set[0] + '\n')
print(test_set[1] + '\n')
print(test_set[2] + '\n\n')

pp.pprint(predict[-3:])
print('')
print(test_set[-3] + '\n')
print(test_set[-2] + '\n')
print(test_set[-1] + '\n\n')

array([[0.05001324, 0.79618753, 0.0500038 , 0.05082156, 0.05297387],
       [0.00769538, 0.68456757, 0.044237  , 0.14198069, 0.12151936],
       [0.33236697, 0.06884921, 0.06667538, 0.06713112, 0.46497733]])

A look at Vietnam and Mexico exposes the myth of market liberalisation.

The site states "What can I use it for? Meeting notes, Reports, technical specs Sign-up sheets, proposals and much more...", just like any other new breeed of sites that want us to store everything we have on the web. And they even guarantee multiple levels of security and encryption etc. But what prevents these web site operators fom accessing and/or stealing Meeting notes, Reports, technical specs Sign-up sheets, proposals and much more, for competitive or personal gains...? I am pretty sure that most of them are honest, but what's there to prevent me from setting up a good useful site and stealing all your data? Call me paranoid - I am.

Jython related topics by Frank Wierzbicki


array([[0.02500827, 0.555

In [14]:
print('predictions available for {} elements\n'.format(len(predict)))
pp.pprint(predict)

# not sure if this is chill
m, n = np.meshgrid(predict, predict)
dist_matrix = abs(m - n)

predictions available for 1075 elements

array([[0.05001324, 0.79618753, 0.0500038 , 0.05082156, 0.05297387],
       [0.00769538, 0.68456757, 0.044237  , 0.14198069, 0.12151936],
       [0.33236697, 0.06884921, 0.06667538, 0.06713112, 0.46497733],
       ...,
       [0.02500827, 0.55511703, 0.20182294, 0.19272072, 0.02533104],
       [0.20128602, 0.03526352, 0.23868772, 0.48832032, 0.03644241],
       [0.10000944, 0.59823876, 0.10078102, 0.10096808, 0.1000027 ]])


In [None]:
# build adjacency list (lookup table)
# ASSUMPTION: dict.keys() and dict.values() directly correspond if not modified

DIST_THRES = 0.420

# rehydrate node_ids into dist_matrix -> distance lookup table
dist_lookup = { node_id: {} for node_id in comment_bodies.keys() }
for idx1, first in enumerate(list(dist_lookup.keys())):
    for idx2, second in enumerate(list(dist_lookup.keys())):
        dist_lookup[first][second] = 1 if dist_matrix[idx1][idx2] <= DIST_THRES else 0

# pp.pprint(dist_lookup)



# print(list(comment_bodies.keys())[:2])
# print(list(comment_bodies.values())[:2])

# print(first)

In [99]:
# visualize network

In [100]:
# dump network