# Reddit Exploratory Data Collection Notebook

This notebook contains code to collect time-sampled data from subreddits of your choosing.

The end result is that csv files are written to `data/`, and the idea is that you will load these files in
another notebook for analysis.

In [None]:
# imports
import pandas as pd
import requests
import time
import random
from datetime import datetime, timedelta
import asyncpraw
import os
import numpy as np

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    lst - any sliceable object that supports len()
    n - the slice length; the final slice may be shorter

    Yields each slice in order, starting from index 0.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

In [None]:
# init PRAW reddit object. Used to grab most up-to-date post scores.
# Credentials are read from the CLIENT_ID and CLIENT_SECRET environment
# variables; a missing variable raises KeyError on this cell.
# NOTE: this is the asyncpraw client, so results fetched through it are
# consumed with `async for` later in the notebook.
reddit = asyncpraw.Reddit(
     client_id=os.environ['CLIENT_ID'],
     client_secret=os.environ['CLIENT_SECRET'],
     user_agent="dataleverage_research scripts (by u/nickmvincent@gmail.com)"
 )

In [None]:
# generate time windows to sample from

def generate_windows(start, end, window_size):
    """
    Build back-to-back time windows of equal length, beginning at `start`.

    start - datetime object, where the first window begins
    end - datetime object, the point at which generation stops
    window_size - timedelta object, the length of every window (i.e. "cluster")

    Returns a list of 2-tuples (window_start, window_end), where
    window_end == window_start + window_size.

    Note: windows keep being added until one begins at or after `end`, and
    that window is included. The result therefore always has at least one
    window, and the last one extends past `end`.
    """
    windows = []
    cursor = start
    while True:
        windows.append((cursor, cursor + window_size))
        if not (cursor < end):
            break
        cursor = cursor + window_size
    return windows

# Sampling configuration. These module-level names are read by later cells.
start_dt = datetime(2021, 1, 1)
end_dt = datetime(2021, 2, 1)
window_size = timedelta(minutes=1)
# n < 1 is treated as a fraction of all windows; n >= 1 is an absolute number
# of windows to sample; None means "use every window".
n = 0.01

windows = generate_windows(start_dt, end_dt, window_size)

seed = 0
if n is not None:
    # seed the RNG so the same windows are chosen on every run
    random.seed(seed)
    if n < 1:
        print('treating n as frac')
        # n is overwritten with the integer sample size; the output folder
        # name built in scrape() uses this integer value
        n = int(len(windows) * n)
        print('len windows, n', len(windows), n)
    chosen_windows = random.sample(windows, n)
else:
    chosen_windows = windows

# Convert each (start, end) datetime pair to integer epoch seconds, the form
# that is passed to scrape() below.
# NOTE(review): the datetimes are naive, so .timestamp() interprets them in
# the machine's local timezone - confirm this is the intended behaviour.
chosen_windows_as_timestamps = [
    (int(x[0].timestamp()), int(x[1].timestamp())) 
    for x in chosen_windows
]

treating n as frac
len windows, n 44641 446


In [None]:
# compare the total number of generated windows with the number sampled
len(windows), len(chosen_windows)

(44641, 446)

In [None]:
# review the time windows we "chose", as sorted (start, end) string pairs.
# One sort is enough: the outer sort alone determines the final order, so
# pre-sorting chosen_windows before converting to strings was redundant.
sorted(
    (str(window_start), str(window_end))
    for window_start, window_end in chosen_windows
)

In [None]:
# subs = pd.read_csv('sub_5000.csv')
# subreddit_chunks = list(
#     chunks(subs.sort_values('subscribers', ascending=False).display_name, 50))
# # for chunk in subreddit_chunks:
# #     print(','.join(chunk))

In [None]:
def helper(x):
    """Return str(x) with every space character swapped for an underscore."""
    text = str(x)
    return '_'.join(text.split(' '))

# space-free string labels for the sampling range; scrape() embeds them in
# the name of the output folder
start_ts = helper(start_dt)
end_ts = helper(end_dt)

# show the range both as epoch seconds and as the labels
print(start_dt.timestamp(), end_dt.timestamp())
print(start_ts, end_ts)

1609459200.0 1612137600.0
2021-01-01_00:00:00 2021-02-01_00:00:00


In [None]:
# n was reassigned earlier from the 0.01 fraction to the integer sample size
n

446

In [None]:
def get_posts(endpoint, **kwargs):
    """Query a Pushshift search endpoint and return the decoded JSON body.

    endpoint - path segment placed in the URL, e.g. 'submission' or 'comment'
    **kwargs - query parameters for the search; entries whose value is None
               are omitted from the request

    Returns the parsed JSON response, or None when the request fails or the
    body is not valid JSON. Callers treat None as "retry".
    """
    # Bounded wait so a stalled connection cannot hang the scrape forever.
    request_timeout_seconds = 30

    # Send only the filters that were actually set.
    params = {k: v for k, v in kwargs.items() if v is not None}
    url = f'https://api.pushshift.io/reddit/{endpoint}/search/'
    try:
        r = requests.get(url, params=params, timeout=request_timeout_seconds)
    except requests.RequestException as e:
        # Connection errors and timeouts are reported the same way as an
        # undecodable body, so the caller's retry logic handles both.
        print(e, url, params)
        return None
    try:
        return r.json()
    except (ValueError, requests.RequestException) as e:
        # Print the final URL: a GET has no request body, so the URL is what
        # identifies the failing query.
        print(e, r, r.url)
        return None


def scrape(endpoint, subreddit, chosen_windows_as_timestamps, score_criteria):
    """Download every post in each sampled time window and write one csv per window.

    endpoint - Pushshift endpoint name passed to get_posts, e.g. 'submission'
    subreddit - subreddit name to restrict the search to, or None for all
    chosen_windows_as_timestamps - iterable of (start, end) pairs in integer
        epoch seconds
    score_criteria - value sent as the `score` query parameter, or None

    Output files go to a folder under data/ whose name is built from the
    module-level start_ts, end_ts, n and seed values. A window that returns no
    posts produces no file.

    Returns a dict with:
        'api_call_count'  - number of calls made to get_posts
        'total_retries'   - number of failed calls across all windows
        'fails'           - windows (as datetime pairs) abandoned after
                            repeated consecutive failures
        'times'           - wall-clock seconds spent on each window
        'times_per_posts' - seconds per post collected in each window, or
                            None for windows with no posts
    """
    PUSHSHIFT_MAX = 100  # max # of posts pushshift returns per call
    MAX_RETRIES = 3  # consecutive failed calls tolerated before giving up on a window
    count = 0
    total_retries = 0
    fails = []
    times = []
    times_per_post = []

    subd = f'data/start={start_ts}_end={end_ts}_n={n}_seed={seed}'
    # Creates data/ too when it is missing and tolerates an existing folder;
    # any other failure surfaces here instead of at the first csv write.
    os.makedirs(subd, exist_ok=True)

    name = subreddit[:10] if subreddit is not None else 'all'
    num_windows = len(chosen_windows_as_timestamps)

    for window_index, (start, end) in enumerate(chosen_windows_as_timestamps):
        t1 = time.time()

        cur = start
        filename = f'{subd}/{endpoint}_{name}_{helper(start)}_{helper(end)}_score={score_criteria}.csv'
        posts_within_window = []
        window_as_dt = (datetime.fromtimestamp(start), datetime.fromtimestamp(end))
        print('start: {}, end: {}, window: {} / {}'.format(
            window_as_dt[0], window_as_dt[1], window_index, num_windows
        ))

        # Counts consecutive failures. It must live outside the paging loop:
        # resetting it on every iteration would make the give-up check
        # unreachable and spin forever on a persistent outage.
        retries = 0
        while True:
            # be polite to the API between calls
            time.sleep(1)
            payload = get_posts(
                endpoint=endpoint,
                subreddit=subreddit,
                size=PUSHSHIFT_MAX,
                before=end,
                after=cur,
                sort='asc',
                sort_type='created_utc',
                score=score_criteria
            )
            # increment API call count
            count += 1

            # A missing response, or a JSON body without 'data', is a failed call.
            if payload is None or 'data' not in payload:
                time.sleep(2)
                retries += 1
                total_retries += 1
                if retries > MAX_RETRIES:
                    fails.append(window_as_dt)
                    break
                continue
            retries = 0

            posts = payload['data']
            # if we get zero posts, time to move to the next time window
            if not posts:
                break

            for post in posts:
                post['window'] = window_as_dt
            posts_within_window += posts

            # print every 10 calls, just to keep track
            if count % 10 == 0:
                print('count', count)

            # increment our "after" param to the last post
            # NOTE(review): if Pushshift treats `after` as exclusive, other
            # posts sharing this created_utc second may be skipped - confirm.
            cur = posts[-1]['created_utc']

            # a short page means the window is exhausted
            if len(posts) < PUSHSHIFT_MAX:
                break

        if posts_within_window:
            # TODO: drop unneeded columns to save space?
            # Built once per window rather than once per page.
            pd.DataFrame(posts_within_window).to_csv(filename, index=False)

        seconds_elapsed = time.time() - t1
        times.append(seconds_elapsed)
        # Divide by every post gathered in the window, not just the last page.
        if posts_within_window:
            times_per_post.append(seconds_elapsed / len(posts_within_window))
        else:
            times_per_post.append(None)

    deets = {
        'api_call_count': count,
        'total_retries': total_retries,
        'fails': fails,
        'times': times,
        'times_per_posts': times_per_post,
    }
    return deets

In [None]:
# scrape submissions from all subreddits (subreddit=None) with no score
# filter, and time the whole run
t1 = time.time()
submission_stats = scrape('submission', None, chosen_windows_as_timestamps, score_criteria = None)
tic = time.time() - t1
# total wall-clock seconds for the scrape
print('tic', tic)

In [None]:
# mean wall-clock seconds spent per time window
np.mean(submission_stats['times'])

In [None]:
# mean seconds per post, skipping windows whose entry is None
np.mean([x for x in submission_stats['times_per_posts'] if x is not None])

In [None]:
t1 = time.time()

# comment scraping is switched off; set do_comment to True to run the same
# windows against the 'comment' endpoint
do_comment = False
if do_comment:
    scrape('comment', None, chosen_windows_as_timestamps, score_criteria = None)
    tic = time.time() - t1
    print('tic', tic)

In [None]:
import glob
submission_dfs = []
# Output folder of the scrape above. The name is hard-coded rather than
# rebuilt from start_ts/end_ts/n/seed, so it must be edited by hand if the
# sampling parameters change.
folder = 'start=2021-01-01_00:00:00_end=2021-02-01_00:00:00_n=446_seed=0'
# load every per-window submission csv ...
for name in glob.glob(f'data/{folder}/submission_*'):
    submission_dfs.append(pd.read_csv(name))
print(len(submission_dfs))
# ... and stack them into one dataframe (each file's own row index is kept)
submissions = pd.concat(submission_dfs)

In [None]:
# Derive calendar columns from created_utc.
# NOTE(review): datetime.fromtimestamp converts to the machine's local
# timezone - confirm that is intended for a column named *_utc.
submissions['dt'] = submissions.created_utc.apply(datetime.fromtimestamp)
submissions['date'] = submissions['dt'].apply(datetime.date)
# `submissions.dt` is attribute access to the 'dt' column created above;
# weekday() numbers Monday as 0 through Sunday as 6
submissions['weekday'] = submissions.dt.apply(datetime.weekday)

# Use PRAW to get the most up-to-date scores and number of comments
It seems the collected scores tend to be way off for very recent posts, whereas num_comments is very close.

In [None]:
# this example may be useful: https://old.reddit.com/r/redditdev/comments/akv79c/getting_latest_submission_scores_for_lots_of/

# Look up every collected submission through the reddit client, 100 ids per
# request, and record its current score and comment count.
# NOTE: the top-level `async for` below is only valid where top-level async
# is supported (e.g. a notebook cell); it is a SyntaxError in a plain script.
times = []  # seconds spent on each chunk
praw_scores = {}  # submission id -> current score
praw_num_comments = {}  # submission id -> current number of comments
num_chunks = 0
for chunk in chunks(submissions.id, 100):
    t1 = time.time()
    num_chunks += 1
    # progress report every 10 chunks
    if num_chunks % 10 == 0:
        print('starting a chunk', len(chunk), len(praw_scores))
    list_of_ids = []
    for submission_id in chunk:
        # prefix each id with "t3_" to form the fullname passed to reddit.info
        list_of_ids.append("t3_{}".format(submission_id))
    
    if list_of_ids:
        reddit_submissions = reddit.info(fullnames=list_of_ids)
        # pause between requests
        time.sleep(1)
        async for submission in reddit_submissions:
            praw_scores[submission.id] = submission.score
            praw_num_comments[submission.id] = submission.num_comments
    tic = time.time() - t1
    times.append(tic)
    #print(praw_scores)
# ids with no entry in the dicts end up as NaN in the new columns
submissions['praw_score'] = submissions.id.map(praw_scores)
submissions['praw_num_comments'] = submissions.id.map(praw_num_comments)
# written into the same folder as the per-window csv files
submissions.to_csv(f'data/{folder}/submissions_with_praw.csv')

In [None]:
import numpy as np
# Sum of the recorded per-chunk times, converted from seconds to hours.
# Assumes `times` is the list filled by the PRAW loop (run that cell first).
np.sum(times) /3600

In [None]:
# share of the total praw_score held by each of the 20 subreddits with the
# highest summed praw_score
submissions.groupby('subreddit').praw_score.sum().sort_values(ascending=False)[:20] / submissions.praw_score.sum()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9d8dd8e7-abf2-4721-84cd-03779d208ef5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>