In [1]:
import snap
import re
import os
from datetime import datetime
from pytz import timezone
from copy import deepcopy
import csv

In [2]:
# Inputs are the post and comment dumps already sorted by timestamp (see
# notebook 04); the cumulative user statuses are only accurate in that order.
# Each file is tab-separated and is read row by row as a dict keyed by the
# column names in its header line.
post_file_sorted = open('output/rsj2012.txt')
post_lines = csv.DictReader(post_file_sorted, delimiter='\t')

comment_file_sorted = open('output/rcj2012.txt')
comment_lines = csv.DictReader(comment_file_sorted, delimiter='\t')

In [3]:
# Time-of-posting indicator columns: one per weekday and one per hour (US Eastern)
day_features = []
for day_abbrev in ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'):
    day_features.append('Is_US_Eastern_{}'.format(day_abbrev))
hour_features = list(map('Is_US_Eastern_Hour_{}'.format, range(24)))

# Cumulative per-user columns: every template is expanded for posts and then
# for comments, template by template (count, score, gold)
status_templates = ['Cum_{}s', 'Cum_{}_Score', 'Cum_{}_Gold']
user_status_features = []
for template in status_templates:
    for typ in ('Post', 'Comment'):
        user_status_features.append(template.format(typ))

# Outcome columns written at the end of every output row
output_features = ['Score', 'Gilded']

In [4]:
# Post feature file: identifiers, time indicators, the author's cumulative
# status, then the outcome columns. The header row is written immediately.
post_features = ['Post_ID', 'Author']
post_features += day_features + hour_features
post_features += ['Author_' + f for f in user_status_features]
post_features += output_features
outfile_posts = open('output/reddit_submissions_basic_features_jan2012.tsv', 'w')
output_posts = csv.DictWriter(outfile_posts, post_features, delimiter='\t')
output_posts.writeheader()


# Comment feature file: same layout, but with a cumulative-status group for
# both the commenter and the commentee.
comment_features = ['Comment_ID', 'Commenter', 'Commentee']
comment_features += day_features + hour_features
comment_features += ['Commenter_' + f for f in user_status_features]
comment_features += ['Commentee_' + f for f in user_status_features]
comment_features += output_features
outfile_comments = open('output/reddit_comments_basic_features_jan2012.tsv', 'w')
output_comments = csv.DictWriter(outfile_comments, comment_features, delimiter='\t')
output_comments.writeheader()

In [5]:
# Running totals per user: username -> {status feature name: current total}
user_status = {}

# All-zero record; a copy of it becomes a user's entry the first time they are seen
user_status_template = dict((feat, 0) for feat in user_status_features)

# Look up a user's running totals in |user_status|
def get_user_infodict(username):
    """Return the current status record for |username|.

    The placeholder author '[deleted]' is never tracked: it gets a throwaway
    dict with 'n/a' for every status feature. Any other user seen for the
    first time is registered in |user_status| with an all-zero record.

    The dict returned for a tracked user is the live entry stored in
    |user_status|, not a copy.
    """
    if username == '[deleted]':
        return dict((feat, 'n/a') for feat in user_status_features)

    try:
        return user_status[username]
    except KeyError:
        # First sighting of this user: start them from zeros
        fresh_record = deepcopy(user_status_template)
        user_status[username] = fresh_record
        return fresh_record


# Update user's current info from username and post or comment results
def update_user_infodict(username, score, gold, content_type):
    """Fold one post or comment into |username|'s running totals.

    username     -- author of the content; '[deleted]' is ignored entirely
    score        -- amount added to the user's cumulative score for this type
    gold         -- amount added to the user's cumulative gold for this type
    content_type -- 'Post' or 'Comment'; selects which counters are updated

    Raises AssertionError if |content_type| is neither 'Post' nor 'Comment'.
    """
    assert content_type in ('Post', 'Comment')
    if username == '[deleted]':
        return

    # Do not rely on get_user_infodict having been called for this user first:
    # an unseen user starts from an all-zero record instead of raising KeyError.
    if username not in user_status:
        user_status[username] = deepcopy(user_status_template)

    status = user_status[username]
    status['Cum_{}s'.format(content_type)] += 1
    status['Cum_{}_Score'.format(content_type)] += score
    status['Cum_{}_Gold'.format(content_type)] += gold

    
# Convert a timestamp to its US Eastern weekday (0-6, 0 is Monday) and hour (0-23)
eastern = timezone('US/Eastern')
def us_eastern_day_time(tstamp):
    """Return (weekday, hour) of |tstamp| as seen in the US/Eastern timezone.

    If the conversion raises ValueError, the offending timestamp is printed
    and a ValueError with a fixed message is raised in its place.
    """
    try:
        local_dt = datetime.fromtimestamp(tstamp, eastern)
    except ValueError:
        # Show which timestamp could not be converted before failing
        print(tstamp)
        raise ValueError('timestamp out of range for platform time_t')
    return (local_dt.weekday(), local_dt.hour)

In [6]:
# Build the '<prefix><status feature>' columns for one user
def _prefixed_user_status(prefix, username):
    """Snapshot |username|'s current status with |prefix| prepended to every key."""
    # .items() runs on both Python 2 and 3 (.iteritems() exists only on Python 2).
    # The comprehension copies the values into a new dict, so the snapshot is
    # not affected when the user's totals are updated afterwards.
    return {(prefix + k): v for (k, v) in get_user_infodict(username).items()}


# Iterate over both comments and posts by timestamp order
post = next(post_lines, None)
comment = next(comment_lines, None)

nparses = 0
while post is not None or comment is not None:
    # Fetch time for post and comment to decide which came earlier. If same, defer to post.
    # An exhausted input counts as infinitely late so the other input is drained.
    post_timestamp = float('inf')
    if post is not None:
        post_timestamp = int(post['Created_UTC'])
    comment_timestamp = float('inf')
    if comment is not None:
        comment_timestamp = int(comment['Created_UTC'])

    fields = {} # Information that will be written to output
    fields.update({feat: 0 for feat in day_features})
    fields.update({feat: 0 for feat in hour_features})
    if post_timestamp <= comment_timestamp:
        fields['Post_ID'] = post['#Post_ID']

        # Set the appropriate post hour and post day fields to 1
        (day, hour) = us_eastern_day_time(post_timestamp)
        fields[day_features[day]] = fields[hour_features[hour]] = 1

        # Fetch the author's status before this post was made and set features appropriately
        author = post['Author']
        fields['Author'] = author
        fields.update(_prefixed_user_status('Author_', author))

        # Fetch post score and gold; update user info
        post_score = int(post['Score'])
        post_gold = int(post['Gilded'])
        update_user_infodict(author, post_score, post_gold, 'Post')
        fields['Score'] = post_score
        fields['Gilded'] = post_gold

        # Write to output file and advance to the next post
        output_posts.writerow(fields)
        post = next(post_lines, None)
    else:
        fields['Comment_ID'] = comment['Comment_ID']

        # Set appropriate comment hour and comment day fields to 1
        (day, hour) = us_eastern_day_time(comment_timestamp)
        fields[day_features[day]] = fields[hour_features[hour]] = 1

        # Fetch commenter and commentee statuses before comment made; update features
        commenter = comment['#Commenter']
        commentee = comment['Commentee']
        fields['Commenter'] = commenter
        fields['Commentee'] = commentee
        fields.update(_prefixed_user_status('Commenter_', commenter))
        fields.update(_prefixed_user_status('Commentee_', commentee))

        # Fetch comment score and gold; update commenter info (commentee remains unchanged)
        comment_score = int(comment['Score'])
        comment_gold = int(comment['Gilded'])
        update_user_infodict(commenter, comment_score, comment_gold, 'Comment')
        fields['Score'] = comment_score
        fields['Gilded'] = comment_gold

        # Write to output file and advance to the next comment
        output_comments.writerow(fields)
        comment = next(comment_lines, None)

    # Progress report every million rows (nparses is at least 1 here)
    nparses += 1
    if nparses % 1000000 == 0:
        print('{} entries parsed'.format(nparses))

print('Done')

Done


In [7]:
# Close the output files first (flushing buffered rows to disk), then the inputs
for handle in (outfile_posts, outfile_comments, post_file_sorted, comment_file_sorted):
    handle.close()