In [None]:
import snap
import re

In [None]:
# Input dumps, read below as tab-separated text whose first line is a header
# row naming the columns. Paths are relative to the kernel's working directory.
# Submissions (one post per line):
post_file = 'reddit_submissions_jan2012.txt'
# Comments (one comment per line):
comment_file = 'reddit_comments_jan2012.txt'

In [None]:
# Node attribute names, grouped by the value type stored on each node.
int_attrs = [
    'score',
    'gilded',
    'created_utc',
]
str_attrs = [
    'author',
    'text',
    'id',
]


def make_net():
    """Create an empty ``snap.TNEANet`` with the node attributes pre-declared.

    Every name in ``int_attrs`` is registered as an integer node attribute and
    every name in ``str_attrs`` as a string node attribute, in that order.

    Returns:
        The new network, containing no nodes or edges yet.
    """
    graph = snap.TNEANet.New()

    # Pair each declaration method with the attribute names it should register.
    declarations = (
        (graph.AddIntAttrN, int_attrs),
        (graph.AddStrAttrN, str_attrs),
    )
    for declare, attr_names in declarations:
        for attr_name in attr_names:
            declare(attr_name)

    return graph

In [None]:
# Build one TNEANet per subreddit and add every submission to it as a node.
#
# postids_to_nids maps a post id to the node id it received. Node ids are
# allocated per subreddit net (see srnet.AddNode() below), so an id is only
# meaningful inside the net of the subreddit that the post belongs to.
postids_to_nids = {}
nets = {} # subreddit name to a TNEANet (e.g. 'politics' -> tneanet)

# `with` guarantees the input file is closed, even if a row fails to parse.
with open(post_file) as posts:
    # [1:] drops the first character of the header line -- presumably a leading
    # marker such as '#'; confirm against the data file.
    header = posts.readline()[1:].rstrip('\n').split('\t')
    # Column name -> column index, so rows can be addressed by field name.
    fields = {name: i for (i, name) in enumerate(header)}

    for (i, post) in enumerate(posts):
        entries = post.rstrip('\n').split('\t')
        subreddit = entries[fields['Subreddit']]
        # Lazily create the graph the first time a subreddit is seen.
        if subreddit not in nets:
            nets[subreddit] = make_net()
        srnet = nets[subreddit]

        post_id = entries[fields['Post_ID']]
        author = entries[fields['Author']]
        title = entries[fields['Title']]
        score = int(entries[fields['Score']])
        gilded = int(entries[fields['Gilded']])
        created_utc = int(entries[fields['Created_UTC']])

        nid = srnet.AddNode()
        postids_to_nids[post_id] = nid

        srnet.AddIntAttrDatN(nid, score, 'score')
        srnet.AddIntAttrDatN(nid, gilded, 'gilded')
        srnet.AddIntAttrDatN(nid, created_utc, 'created_utc')
        srnet.AddStrAttrDatN(nid, author, 'author')
        # Posts store their title under the shared 'text' attribute.
        srnet.AddStrAttrDatN(nid, title, 'text')
        srnet.AddStrAttrDatN(nid, post_id, 'id')

        # Progress indicator: one line per 100,000 rows read.
        if i % 100000 == 0:
            print(i)

In [None]:
# Attach every non-orphaned comment to its subreddit graph as a node, with an
# edge from the comment to its parent (a post, or another comment when
# Is_Reply is set).
#
# comids_to_nids maps a comment id to the node id it received. A comment can
# only be linked if its parent has already been added, so a reply that appears
# in the file before its parent is treated as orphaned and skipped.
comids_to_nids = {}

# `with` guarantees the input file is closed, even if a row fails to parse.
with open(comment_file) as comments:
    # [1:] drops the first character of the header line -- presumably a leading
    # marker such as '#'; confirm against the data file.
    header = comments.readline()[1:].rstrip('\n').split('\t')
    # Column name -> column index, so rows can be addressed by field name.
    fields = {name: i for (i, name) in enumerate(header)}

    for (i, com) in enumerate(comments):
        entries = com.rstrip('\n').split('\t')
        subreddit = entries[fields['Subreddit']]
        is_reply = bool(int(entries[fields['Is_Reply']]))
        parent_id = entries[fields['Parent_ID']]
        comment_id = entries[fields['Comment_ID']]

        # Pick the id map that matches the parent's kind *before* testing
        # membership. The previous condition
        #     subreddit in nets and parent in posts or parent in comments
        # parsed as `(A and B) or C`, so it could reach nets[subreddit] for an
        # unknown subreddit, or look the parent up in the wrong map; both
        # raised KeyError.
        parent_nids = comids_to_nids if is_reply else postids_to_nids

        if subreddit in nets and parent_id in parent_nids:
            # Not orphaned; add to graph
            srnet = nets[subreddit]

            author = entries[fields['Commenter']]
            text = entries[fields['Comment_Text']]
            score = int(entries[fields['Score']])
            gilded = int(entries[fields['Gilded']])
            created_utc = int(entries[fields['Created_UTC']])

            nid = srnet.AddNode()
            comids_to_nids[comment_id] = nid
            # NOTE(review): node ids are per-net, so this assumes the parent
            # lives in the same subreddit net as the comment -- confirm the
            # data never lists a comment under a different subreddit than its
            # parent.
            parent_nid = parent_nids[parent_id]
            srnet.AddEdge(nid, parent_nid)

            srnet.AddIntAttrDatN(nid, score, 'score')
            srnet.AddIntAttrDatN(nid, gilded, 'gilded')
            srnet.AddIntAttrDatN(nid, created_utc, 'created_utc')
            srnet.AddStrAttrDatN(nid, author, 'author')
            srnet.AddStrAttrDatN(nid, text, 'text')
            srnet.AddStrAttrDatN(nid, comment_id, 'id')

        # Progress indicator: one line per 100,000 rows read (orphans included).
        if i % 100000 == 0:
            print(i)


In [None]:
# Serialize each subreddit graph to '<output_directory>/<subreddit>.graph'.
# NOTE(review): nothing here creates output_directory; presumably it must
# already exist before this cell runs -- confirm.
output_directory = 'subreddit_nets'
# dict.items() works on both Python 2 and 3; iteritems() exists only on
# Python 2 and raises AttributeError on Python 3.
for (netname, net) in nets.items():
    out = snap.TFOut(output_directory + '/' + netname + '.graph')
    net.Save(out)
    # Push buffered bytes to disk before moving on to the next graph.
    out.Flush()