In [1]:
import snap
import re
import os
from datetime import datetime
from pytz import timezone
from copy import deepcopy

In [2]:
# Input files. Both the post file and the comment file must already be sorted
# by timestamp, otherwise the cumulative user statuses would not be accurate.
(post_file_sorted, comment_file_sorted) = (
    'output/rsj2012.txt',
    'output/rcj2012.txt',
)

In [3]:
# Features we are interested in

# One-hot weekday columns. The position of each name is used as the index into
# the day vector, and that index comes from datetime.weekday(), where 0 is
# Monday and 6 is Sunday. The names are therefore listed Monday-first so that
# every column label matches the day that is actually flagged in it.
day_features = ['Is_US_Eastern_{}'.format(d) for d in ('Mon', 'Tues', 'Wednes', 'Thurs', 'Fri', 'Sat', 'Sun')]

# One-hot hour-of-day columns, index == hour (0-23).
hour_features = ['Is_US_Eastern_Hour_{}'.format(h) for h in range(24)]

# Cumulative per-user counters: for each template, one feature for posts and
# one for comments (count, total score, total gold).
status_templates = ['Cum_{}s', 'Cum_{}_Score', 'Cum_{}_Gold']
user_status_features = [template.format(typ) for template in status_templates for typ in ('Post', 'Comment')]

# Outputs
output_features = ['Score', 'Gilded']

In [4]:
def _write_header(handle, columns):
    """Write `columns` to `handle` as one '#'-prefixed, tab-separated header line."""
    handle.write('#' + '\t'.join(columns) + '\n')


def _open_sorted_input(path):
    """Open a tab-separated input file and consume its header line.

    Returns a tuple (handle, header, fields, rows):
      handle -- the open file object (the caller is responsible for closing it)
      header -- list of column names taken from the first line
      fields -- dict mapping column name -> column index
      rows   -- lazy generator yielding each remaining line split on tabs
    """
    handle = open(path)
    # Strip leading '#' and ending '\n', sep by tab
    header = handle.readline()[1:].rstrip('\n').split('\t')
    fields = {name: idx for (idx, name) in enumerate(header)}
    rows = (line.rstrip('\n').split('\t') for line in handle)
    return handle, header, fields, rows


# Setup post and comment output feature files
output_posts = open('output/reddit_submissions_basic_features_jan2012.tsv', 'w')
post_features = (['Post_ID'] + day_features + hour_features
                 + ['Author_' + f for f in user_status_features]
                 + output_features)
_write_header(output_posts, post_features)

output_comments = open('output/reddit_comments_basic_features_jan2012.tsv', 'w')
comment_features = (['Comment_ID'] + day_features + hour_features
                    + ['Commenter_' + f for f in user_status_features]
                    + ['Commentee_' + f for f in user_status_features]
                    + output_features)
_write_header(output_comments, comment_features)

# Setup post and comment files for reading
(posts, header_posts, post_fields, post_lines) = _open_sorted_input(post_file_sorted)
(comments, header_comments, comment_fields, comment_lines) = _open_sorted_input(comment_file_sorted)

In [5]:
# User status tracking
# Zeroed record holding one counter per user status feature.
user_status_template = dict.fromkeys(user_status_features, 0)
# username -> (curr totals of post/comments and scores)
user_status = dict()

# Get user's current info from username and |user_status|
def get_user_infovec(username):
    """Return the user's current cumulative status values as a list.

    The list is ordered like `user_status_features`. A user seen for the first
    time is registered in `user_status` with all counters at zero. The
    placeholder author '[deleted]' is never tracked; for it a list of 'n/a'
    strings of the same length is returned instead.
    """
    if username == '[deleted]':
        return ['n/a'] * len(user_status_features)

    status = user_status.get(username)
    if status is None: # Populate with zeros
        # The template is a flat dict of ints, so a shallow copy already gives
        # each user an independent record; deepcopy is unnecessary overhead.
        status = user_status[username] = dict(user_status_template)

    return [status[feature] for feature in user_status_features]


# Fold the result of one post or comment into its author's running totals
def update_user_infovec(username, score, gold, content_type):
    """Add one piece of content to the running totals of `username`.

    content_type must be 'Post' or 'Comment'; it selects which three counters
    (count, score, gold) are incremented. The placeholder author '[deleted]'
    is ignored. The user must already have an entry in `user_status`.
    """
    assert(content_type in ('Post', 'Comment'))
    if username == '[deleted]':
        return

    totals = user_status[username]
    increments = (('Cum_{}s', 1), ('Cum_{}_Score', score), ('Cum_{}_Gold', gold))
    for key_template, amount in increments:
        totals[key_template.format(content_type)] += amount

    
# Get weekday (0-6, 0 is Monday) and hour (0-23) of timestamp US Eastern
# Timezone object for 'US/Eastern', built once here and passed to
# datetime.fromtimestamp by us_eastern_day_time.
eastern = timezone('US/Eastern')
def us_eastern_day_time(tstamp):
    """Return (weekday, hour) of a Unix timestamp, expressed in the `eastern` timezone.

    tstamp  -- seconds since the Unix epoch.
    Returns -- tuple (weekday, hour): weekday is 0-6 following
               datetime.weekday() (0 is Monday), hour is 0-23.
    Raises  -- ValueError if the timestamp cannot be converted; the message
               includes the offending timestamp.
    """
    try:
        dt = datetime.fromtimestamp(tstamp, eastern)
    except (ValueError, OverflowError, OSError) as err:
        # Python 3 signals an out-of-range timestamp with OverflowError or
        # OSError rather than ValueError; all of them are normalised to the
        # ValueError callers expect, with the bad value carried in the message
        # instead of being printed separately.
        raise ValueError('timestamp out of range for platform time_t: {!r} ({})'.format(tstamp, err))
    return (dt.weekday(), dt.hour)

In [6]:
# Iterate over both comments and posts by timestamp order.
# Each row is written with the user totals as they stand *before* the row
# itself is counted; the totals are updated only after the row is written.
post = next(post_lines, None)
comment = next(comment_lines, None)

nparses = 0
# Keep going until both streams are exhausted (None marks an exhausted stream).
while post is not None or comment is not None:
    # Fetch time for post and comment to decide which came earlier. If same, defer to post.
    # An exhausted stream keeps an infinite timestamp so the other stream always wins.
    post_timestamp = float('inf')
    if post is not None:
        post_timestamp = int(post[post_fields['Created_UTC']])
    comment_timestamp = float('inf')
    if comment is not None:
        comment_timestamp = int(comment[comment_fields['Created_UTC']])

    # One-hot day and hour vectors. NOTE: dayvec is indexed with the weekday
    # returned by us_eastern_day_time (0 is Monday), so day_features has to
    # list the days in that same order for the column labels to be right.
    dayvec = [0] * len(day_features)
    hourvec = [0] * len(hour_features)
    if post_timestamp <= comment_timestamp:
        (day, hour) = us_eastern_day_time(post_timestamp)
        dayvec[day] = hourvec[hour] = 1

        author = post[post_fields['Author']]
        author_infovec = get_user_infovec(author)

        post_id = post[post_fields['Post_ID']]
        post_score = int(post[post_fields['Score']])
        post_gold = int(post[post_fields['Gilded']])

        row = [post_id] + dayvec + hourvec + author_infovec + [post_score, post_gold]
        output_posts.write('\t'.join(str(x) for x in row) + '\n')
        update_user_infovec(author, post_score, post_gold, 'Post')
        post = next(post_lines, None)
    else:
        (day, hour) = us_eastern_day_time(comment_timestamp)
        dayvec[day] = hourvec[hour] = 1

        commenter = comment[comment_fields['Commenter']]
        commenter_infovec = get_user_infovec(commenter)
        commentee = comment[comment_fields['Commentee']]
        commentee_infovec = get_user_infovec(commentee)

        comment_id = comment[comment_fields['Comment_ID']]
        comment_score = int(comment[comment_fields['Score']])
        comment_gold = int(comment[comment_fields['Gilded']])

        row = ([comment_id] + dayvec + hourvec + commenter_infovec +
               commentee_infovec + [comment_score, comment_gold])
        output_comments.write('\t'.join(str(x) for x in row) + '\n')
        # Only the commenter's totals change; the commentee is looked up but not updated.
        update_user_infovec(commenter, comment_score, comment_gold, 'Comment')
        comment = next(comment_lines, None)

    # Progress report every million rows (nparses is at least 1 here).
    nparses += 1
    if nparses % 1000000 == 0:
        print('{} entries parsed'.format(nparses))

print('Done')

1000000 entries parsed
2000000 entries parsed
3000000 entries parsed
4000000 entries parsed
5000000 entries parsed
6000000 entries parsed
7000000 entries parsed
8000000 entries parsed
9000000 entries parsed
10000000 entries parsed
11000000 entries parsed
12000000 entries parsed
13000000 entries parsed
14000000 entries parsed
Done


In [7]:
# Close the two output feature files first, then the two input files.
for handle in (output_posts, output_comments, posts, comments):
    handle.close()