In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook
import sys
sys.path.append("../")

In [2]:
import os
import zstandard as zstd
import json
from collections import defaultdict
from tqdm.auto import tqdm
import pickle
import matplotlib.pyplot as plt
import numpy as np
from config import known_bots

In [4]:
DATA_BASE_DIR = '../data/reddit/'
DATA_DUMPS_DIR = '../data/reddit/dumps/'
TEMP_DATA = '../temp_data/reddit/'

In [7]:
def extract_user_comment_counts(file_path, known_bots, previous_selected_users):
    user_comments = defaultdict(int)
    file_sz = os.path.getsize(file_path)
    batch_sz = 6553600
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(fh) as reader, tqdm(total=file_sz) as pbar:
            previous_line = ""
            while True:
                chunk = reader.read(batch_sz)
                pbar.update(batch_sz)
                file_sz += batch_sz
                if not chunk:
                    break

                string_data = chunk.decode('utf-8')
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = previous_line + line
                    d = json.loads(line)
                    if d['author'].lower() not in known_bots and d['author'] not in previous_selected_users:
                        user_comments[d['author']] += 1
                previous_line = lines[-1]
    return user_comments

def extract_user_subreddit_comment_counts(file_path, selected_users):
    user_subreddit_comment_counts = defaultdict(lambda: defaultdict(int))
    file_sz = os.path.getsize(file_path)
    batch_sz = 6553600
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(fh) as reader, tqdm(total=file_sz) as pbar:
            previous_line = ""
            while True:
                chunk = reader.read(batch_sz)
                pbar.update(batch_sz)
                if not chunk:
                    break

                string_data = chunk.decode('utf-8')
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = previous_line + line
                    d = json.loads(line)
                    if d['author'] in selected_users:
                        user_subreddit_comment_counts[d['author']][d['subreddit']] += 1
                previous_line = lines[-1]
    return user_subreddit_comment_counts

def write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts):

    
    file_pointers = {u: open(DATA_BASE_DIR + u + '.jsonl', 'w') for u in selected_user_subreddit_comment_counts.keys()}
    file_sz = os.path.getsize(file_path)
    batch_sz = 6553600
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(fh) as reader, tqdm(total=file_sz) as pbar:
            previous_line = ""
            while True:
                chunk = reader.read(batch_sz)
                pbar.update(batch_sz)
                if not chunk:
                    break

                string_data = chunk.decode('utf-8')
                lines = string_data.split("\n")
                for i, line in enumerate(lines[:-1]):
                    if i == 0:
                        line = previous_line + line
                    d = json.loads(line)
                    if d['author'] in selected_user_subreddit_comment_counts \
                        and d['subreddit'] in selected_user_subreddit_comment_counts[d['author']]:
                        fp = file_pointers[d['author']]
                        fp.write(line)
                        fp.write('\n')
                previous_line = lines[-1]
    for fp in file_pointers.values():
        fp.close()

In [8]:
# data_collection_metadata = {}


# file_path = DATA_DUMPS_DIR + 'RC_2019-06.zst'
# user_comments = extract_user_comment_counts(file_path, known_bots, [])
# user_comment_sizes = np.array(list(user_comments.values()))
# usernames = np.array(list(user_comments.keys()))
# selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)










In [9]:
with open(TEMP_DATA + 'reddit_data_collection_metadata.p', 'wb') as f:
    pickle.dump(data_collection_metadata, f)

In [10]:
previous_selected_users = set(list(selected_user_subreddit_comment_counts.keys()))

In [None]:

file_path = DATA_DUMPS_DIR + 'RC_2019-07.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)













In [14]:
with open(TEMP_DATA + 'reddit_data_collection_metadata.p', 'wb') as f:
    pickle.dump(data_collection_metadata, f)

In [None]:

file_path = DATA_DUMPS_DIR + 'RC_2019-08.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)










In [17]:
previous_selected_users = set(list(previous_selected_users) + list(selected_user_subreddit_comment_counts.keys()))

In [18]:

file_path = DATA_DUMPS_DIR + 'RC_2019-09.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)













In [19]:
with open(TEMP_DATA + 'reddit_data_collection_metadata.p', 'wb') as f:
    pickle.dump(data_collection_metadata, f)

In [20]:
previous_selected_users = set(list(previous_selected_users) + list(selected_user_subreddit_comment_counts.keys()))

In [21]:

file_path = DATA_DUMPS_DIR + 'RC_2019-10.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)













In [22]:
previous_selected_users = set(list(previous_selected_users) + list(selected_user_subreddit_comment_counts.keys()))

In [23]:

file_path = DATA_DUMPS_DIR + 'RC_2019-11.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)













In [26]:
previous_selected_users = set(list(previous_selected_users) + list(selected_user_subreddit_comment_counts.keys()))

In [28]:

file_path = DATA_DUMPS_DIR + 'RC_2019-12.zst'
user_comments = extract_user_comment_counts(file_path, known_bots, previous_selected_users)
user_comment_sizes = np.array(list(user_comments.values()))
usernames = np.array(list(user_comments.keys()))
selected_users = set(usernames[np.logical_and(user_comment_sizes > 200, user_comment_sizes < 5000)])

user_subreddit_comment_counts = extract_user_subreddit_comment_counts(file_path, selected_users)

selected_user_subreddit_comment_counts = {}
min_sr_comment_count = 20
for user, user_subreddits in tqdm(user_subreddit_comment_counts.items()):
    d = {sr:c for sr, c in user_subreddits.items() if c > min_sr_comment_count}
    if len(d) > 2:
        selected_user_subreddit_comment_counts[user] = d
        
write_user_comments_to_files(file_path, selected_user_subreddit_comment_counts)

data_collection_metadata[file_path] = (user_comments, user_comment_sizes, selected_user_subreddit_comment_counts)













In [29]:
with open(TEMP_DATA + 'reddit_data_collection_metadata.p', 'wb') as f:
    pickle.dump(data_collection_metadata, f)

In [30]:
len(selected_user_subreddit_comment_counts), len(previous_selected_users)

(12939, 127964)