In [None]:
import snap
import json
import re
import os
import bz2

In [None]:
top = 1000  # default number of subreddits kept by filter_subreddits()


def filter_subreddits(subreddit_metagraph, top_n=None):
    """Map the IDs of the most-subscribed subreddits to their names.

    Args:
        subreddit_metagraph: path of a saved snap ``TNEANet`` whose nodes carry
            an integer ``subscribers`` attribute and string ``name`` / ``url``
            attributes.
        top_n: how many subreddits to keep, ranked by subscriber count
            (largest first). Defaults to the module-level ``top``.

    Returns:
        dict mapping each kept node's ``name`` attribute (t5_xxxxx) to its
        ``url`` attribute with the first three characters and the last one
        stripped (presumably '/r/politics/' -> 'politics'; confirm against
        the graph builder).
    """
    if top_n is None:
        top_n = top

    mg = snap.TNEANet.Load(snap.TFIn(subreddit_metagraph))

    # Only plain values (never the node iterator itself) are stored, since the
    # iterator handed out by Nodes() may be advanced in place.
    subscriber_counts = [(NI.GetId(), mg.GetIntAttrDatN(NI, 'subscribers'))
                         for NI in mg.Nodes()]

    # A one-argument key function replaces the tuple-unpacking lambda, which
    # is a syntax error on Python 3. The sort is stable, so ties keep node order.
    subscriber_counts.sort(key=lambda pair: pair[1], reverse=True)
    acceptable = set(nid for (nid, _subs) in subscriber_counts[:top_n])

    srids_to_names = {}
    for NI in mg.Nodes():
        # t5_xxxxx -> politics
        if NI.GetId() in acceptable:
            srids_to_names[mg.GetStrAttrDatN(NI, 'name')] = mg.GetStrAttrDatN(NI, 'url')[3:-1]

    return srids_to_names

In [None]:
# Saved subreddit metagraph to load; the path is relative to the working directory.
subreddit_metagraph = 'output/subreddits.graph'
# Subreddit ID (t5_xxxxx) -> subreddit name for the largest subreddits by
# subscriber count (cutoff set by `top`); later cells use it to filter records.
srids_to_srnames = filter_subreddits(subreddit_metagraph)

In [None]:
def parse_submissions(srids_to_srnames, submission_directory, outsubname):
    """Write the submissions of the selected subreddits to a tab-separated file.

    Every regular file directly inside ``submission_directory`` is read as a
    bz2-compressed stream holding one JSON object per line. Hidden files,
    ``~`` backups and subdirectories are skipped. Files are processed in
    sorted name order so the output is reproducible from run to run.

    Args:
        srids_to_srnames: dict of subreddit ID (t5_xxxxx) -> subreddit name;
            submissions from any other subreddit are ignored.
        submission_directory: directory containing the bz2 submission dumps.
        outsubname: path of the tab-separated file to create (overwritten).

    Returns:
        dict mapping each written post ID (t3_xxxxx) to its lowercased author.
    """
    def _flatten(text):
        # Escape non-ASCII characters, then decode back to text so the regex
        # receives a text string on both Python 2 and Python 3. All whitespace
        # runs collapse to one space to avoid line breaks or tabs inside a
        # field. Lowercasing runs last, so the escapes are lowercased too.
        ascii_text = text.encode('ascii', 'backslashreplace').decode('ascii')
        return re.sub(r'\s+', ' ', ascii_text).lower()

    postids_to_authors = {}

    # os.listdir() returns names in arbitrary order; sort for determinism.
    subfiles = []
    for fname in sorted(os.listdir(submission_directory)):
        if fname.startswith('.') or fname.endswith('~'):
            continue
        path = os.path.join(submission_directory, fname)
        if os.path.isdir(path): # skip subdirectories
            continue
        subfiles.append(path)

    # Iterate over posts, add to table
    print('Parsing submissions...')
    # Context managers close the output and every dump even if parsing fails.
    with open(outsubname, 'w') as outfile:
        outfile.write('#' + '\t'.join(['Post_ID', 'Author', 'Subreddit', 'Score', 'Gilded', 'Created_UTC',
                                       'Is_Self', 'Title', 'Selftext']) + '\n')

        for subfile in subfiles:
            print(subfile + ': parsing...')
            with bz2.BZ2File(subfile) as infile:
                for (i, line) in enumerate(infile):
                    sub = json.loads(line.decode('utf-8'))
                    if sub['subreddit_id'] in srids_to_srnames: # subreddit_id is t5_xxxxx
                        post_id = 't3_' + sub['id']
                        author = sub['author'].lower()
                        subreddit = srids_to_srnames[sub['subreddit_id']]
                        score = sub['score']
                        gold = sub.get('gilded', 0)
                        timestamp = sub['created_utc']
                        is_self = int(sub['is_self'])

                        title = _flatten(sub['title'])
                        # Link posts carry no body of their own.
                        selftext = _flatten(sub['selftext']) if is_self else ''

                        postids_to_authors[post_id] = author
                        outfile.write('\t'.join(str(x) for x in [post_id, author, subreddit, score, gold,
                                                                 timestamp, is_self, title, selftext]) + '\n')

                    if i % 100000 == 0:
                        print(i) # Rudimentary progress indicator

    return postids_to_authors

In [None]:
# Input directory of submission dumps and the tab-separated file to write;
# both paths are relative to the working directory.
submission_directory = 'data/submissions_jan2012'
output_submission_text_file = 'output/reddit_submissions_jan2012.txt'
# Post ID (t3_xxxxx) -> lowercased author for every submission written;
# the comment-parsing cell needs this mapping as input.
postids_to_authors = parse_submissions(srids_to_srnames, submission_directory, output_submission_text_file)

In [None]:
def parse_comments(srids_to_srnames, postids_to_authors, comment_directory, outcomname):
    """Write the comments on known posts to a tab-separated file.

    Every regular file directly inside ``comment_directory`` is read as a
    bz2-compressed stream holding one JSON object per line. Hidden files,
    ``~`` backups and subdirectories are skipped. A comment is kept only if
    its subreddit is in ``srids_to_srnames``, its post is in
    ``postids_to_authors`` and, for a reply, its parent comment was already
    kept. Because a reply can only resolve a parent read before it, files are
    processed in sorted name order rather than in the arbitrary order
    returned by ``os.listdir``.

    Args:
        srids_to_srnames: dict of subreddit ID (t5_xxxxx) -> subreddit name.
        postids_to_authors: dict of post ID (t3_xxxxx) -> post author.
        comment_directory: directory containing the bz2 comment dumps.
        outcomname: path of the tab-separated file to create (overwritten).

    Returns:
        dict mapping each written comment ID (t1_xxxxx) to its lowercased author.
    """
    def _flatten(text):
        # Escape non-ASCII characters, then decode back to text so the regex
        # receives a text string on both Python 2 and Python 3. All whitespace
        # runs collapse to one space to avoid line breaks or tabs inside a
        # field. Lowercasing runs last, so the escapes are lowercased too.
        ascii_text = text.encode('ascii', 'backslashreplace').decode('ascii')
        return re.sub(r"\s+", ' ', ascii_text).lower()

    comids_to_authors = {} # t1_xxxxx -> gallowboob, e.g.

    comfiles = []
    for fname in sorted(os.listdir(comment_directory)):
        if fname.startswith('.') or fname.endswith('~'):
            continue
        path = os.path.join(comment_directory, fname)
        if os.path.isdir(path): # skip subdirectories
            continue
        comfiles.append(path)

    print('Parsing comments...')
    # The original never closed the output file; the context managers close it
    # and every dump even if parsing fails part-way.
    with open(outcomname, 'w') as outfile:
        outfile.write('#Commenter\tCommentee\tSubreddit\tComment_ID\tPost_ID\tParent_ID\tControversiality\tUpvotes\t' +
                      'Downvotes\tScore\tGilded\tCreated_UTC\tIs_Reply\tAuthor_Deleted\t' +
                      'Text_Deleted\tComment_Text\n')

        for comfile in comfiles:
            print(comfile + ': parsing...')
            with bz2.BZ2File(comfile) as infile:
                for (i, line) in enumerate(infile):
                    com = json.loads(line.decode('utf-8'))
                    sr_id = com['subreddit_id'] # t5_xxxxx
                    post_id = com['link_id'] # t3_xxxxx
                    parent_id = com['parent_id'] # t1_xxxxx (comment) if reply; t3_xxxxx (post) if top-level comment
                    comment_id = com['name'] # t1_xxxxx
                    is_reply = parent_id.startswith('t1')

                    # Ignore subreddits we don't care about, posts we haven't seen,
                    # and replies to comments we haven't seen
                    if sr_id in srids_to_srnames and post_id in postids_to_authors \
                            and (not is_reply or parent_id in comids_to_authors):
                        sr_name = srids_to_srnames[sr_id]
                        commenter = com['author'].lower()
                        # Replies target a comment author, top-level comments the post author.
                        parent_dict = comids_to_authors if is_reply else postids_to_authors
                        commentee = parent_dict[parent_id]
                        controversiality = com['controversiality']
                        upvotes = com['ups']
                        downvotes = com['downs']
                        score = com['score']
                        # Default to 0 when absent, as the submission parser does.
                        gilded = com.get('gilded', 0)
                        created = com['created_utc']
                        body = _flatten(com['body'])
                        author_deleted = int(commenter == '[deleted]')
                        text_deleted = int(body in ('[deleted]', '[removed]'))

                        outfile.write('\t'.join(str(x) for x in
                                      [commenter, commentee, sr_name, comment_id, post_id, parent_id,
                                       controversiality, upvotes, downvotes, score, gilded, created,
                                       int(is_reply), author_deleted, text_deleted, body]) + '\n')

                        comids_to_authors[comment_id] = commenter

                    if i % 100000 == 0:
                        print(i) # Rudimentary progress indicator

    return comids_to_authors
        

In [None]:
# Input directory of comment dumps and the tab-separated file to write;
# both paths are relative to the working directory.
comment_directory = 'data/comments_jan2012'
output_comment_text_file = 'output/reddit_comments_jan2012.txt'
# Comment ID (t1_xxxxx) -> lowercased commenter for every comment written.
comids_to_authors = parse_comments(srids_to_srnames, postids_to_authors, comment_directory, output_comment_text_file)