# Import Libraries & Tools

In [1]:
import datetime
import os

import pandas as pd
import requests
from ipywidgets import widgets, interact

# Set Up Constants

In [2]:
# Account used for the password-grant login. The password is read from the
# environment so it never appears in the notebook itself.
REDDIT_USERNAME = 'n0ahhhhh'
REDDIT_PASSWORD = os.getenv('REDDIT_PASSWORD')

# Application id / secret, sent as HTTP basic-auth credentials with the token
# request. Each is None when its environment variable is not set.
APP_ID = os.getenv('REDDIT_APP_ID')
APP_SECRET = os.getenv('REDDIT_SECRET')

# Host that serves the access-token endpoint.
BASE_URL = 'https://www.reddit.com/'

# Authentication & Request

In [3]:
# Exchange the account credentials for an OAuth access token (password grant).
data = {'grant_type': 'password', 'username': REDDIT_USERNAME, 'password': REDDIT_PASSWORD}
auth = requests.auth.HTTPBasicAuth(APP_ID, APP_SECRET)
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.post(BASE_URL + 'api/v1/access_token',
                  data=data,
                  headers={'user-agent': f'Sub-Parser by {REDDIT_USERNAME}'},
                  auth=auth,
                  timeout=30)

if r.status_code == 200:
    # Only parse the body once we know the request succeeded.
    values = r.json()
    for k, v in values.items():
        # Never echo the bearer token: notebook outputs are saved and shared.
        shown = '<hidden>' if k == 'access_token' else v
        print(f'{k:>13}: {shown}')
else:
    # Error bodies are not guaranteed to be JSON, so they are not parsed.
    # `values` is still defined so later cells fail with a clear KeyError.
    values = {}
    print(f'Response: {r.status_code}')

 access_token: <redacted>
   token_type: bearer
   expires_in: 86400
        scope: *


# Get Self Information

In [4]:
# Authenticated calls go to the OAuth host and carry the bearer token taken
# from the token response (`values`).
API_URL = 'https://oauth.reddit.com'
token = f'bearer {values["access_token"]}'
headers = {'Authorization': token, 'User-Agent': 'sub-parser by n0ahhhhh'}
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(API_URL + '/api/v1/me', headers=headers, timeout=30)

def convert_epoch(epoch):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC.

    Args:
        epoch: Seconds since 1970-01-01 00:00:00 UTC (int or float).

    Returns:
        A ``datetime.datetime`` with ``tzinfo`` set to ``None`` whose fields
        are the UTC wall-clock time of ``epoch``.
    """
    # utcfromtimestamp() is deprecated since Python 3.12. Build an aware UTC
    # value and strip the tzinfo so the result stays a naive datetime.
    aware = datetime.datetime.fromtimestamp(epoch, tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

# Summarise the authenticated account returned by /api/v1/me.
if response.status_code == 200:
    # Decode the body once instead of re-parsing it for every field.
    me = response.json()
    name = me['name']
    print(f"Welcome {name}!")
    # Underline sized to the greeting: 9 covers "Welcome " plus "!".
    print('-' * (len(name)+9))
    print(f"{'Comment Karma:':>16} {me['comment_karma']}")
    print(f"{'Link Karma:':>16} {me['link_karma']}")
    print(f"Account Created: {convert_epoch(me['created_utc'])}")
else:
    print(f'Response: {response.status_code}')

Welcome n0ahhhhh!
-----------------
  Comment Karma: 22229
     Link Karma: 19917
Account Created: 2013-10-23 21:29:15


# Search for Subreddit

In [5]:
def get_sub():
    """Prompt for a subreddit name and return the best search match.

    Reads a query from standard input, sends it to ``/subreddits/search`` and
    returns the name of the first result with its two-character prefix
    (``r/``) removed.

    Returns:
        str: Name of the first matching subreddit.

    Raises:
        RuntimeError: If the search request does not return HTTP 200.
        LookupError: If the search returns no results.
    """
    sub_input = input('Subreddit to search: r/')
    query = sub_input.strip()
    payload = {'q': query, 'limit': 5, 'sort': 'relevance'}
    response = requests.get(API_URL + '/subreddits/search', headers=headers,
                            params=payload, timeout=30)

    # Check the status before touching the body; an error response has no
    # listing to index into.
    if response.status_code != 200:
        print(f'Response: {response.status_code}')
        raise RuntimeError(f'Subreddit search failed with HTTP {response.status_code}')

    children = response.json()['data']['children']
    if not children:
        raise LookupError(f'No subreddit found matching {query!r}')

    # display_name_prefixed looks like "r/AskReddit"; drop the "r/" prefix.
    subreddit = children[0]['data']['display_name_prefixed'][2:]
    print(f"Found r/{subreddit}!")

    return subreddit

# Get Top Post Data

In [6]:
#create simple dataframe with basic post information
def df_from_res(res):
    """Flatten a listing response into one DataFrame row per post.

    Args:
        res: Response object whose ``json()`` returns a listing of the form
            ``{'data': {'children': [{'kind': ..., 'data': {...}}, ...]}}``.

    Returns:
        pandas.DataFrame with columns ``id``, ``kind``, ``title``,
        ``selftext``, ``ups``, ``downs``, ``upvote_ratio``, ``score``,
        ``comments``, ``awards`` and ``created_utc`` (converted with
        ``convert_epoch``), in listing order. A listing with no children
        yields an empty frame without columns.
    """
    # Collect plain dicts and build the frame once; concatenating a one-row
    # frame per post copies the accumulated data on every iteration.
    records = []

    for post in res.json()['data']['children']:
        post_data = post['data']
        new_post = {
            'id': post_data['id'],
            'kind': post['kind'],
            'title': post_data['title'],
            'selftext': post_data['selftext'],
            'ups': post_data['ups'],
            'downs': post_data['downs'],
            'upvote_ratio': post_data['upvote_ratio'],
            'score': post_data['score'],
            'comments': post_data['num_comments'],
            'awards': post_data['total_awards_received'],
            'created_utc': convert_epoch(post_data['created_utc'])
        }

        # new_post['date'] = new_post['created_utc'].date()
        # new_post['time'] = new_post['created_utc'].time()

        records.append(new_post)

    return pd.DataFrame(records)

#gets top 500 posts from given subreddit over given timeframe
def get_top_500(subreddit, time):
    """Download up to 500 top posts of a subreddit, newest first.

    Requests ``/r/<subreddit>/top`` in pages of 100, at most five times, using
    the fullname of the last post of each page as the ``after`` cursor.
    Paging stops early when a request does not return HTTP 200 or a page
    comes back empty.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        time: Value for the ``t`` query parameter
            (hour, day, week, month, year, all).

    Returns:
        pandas.DataFrame of the collected posts sorted by ``created_utc``
        descending. Index labels keep each post's position in the download
        order. The frame is empty when nothing was collected.
    """
    payload = {'t': time, 'limit':100} #(hour, day, week, month, year, all)
    print(f'Collecting top posts from r/{subreddit}...')

    pages = []

    #each loop gets 100 posts
    for _ in range(5):
        #make request
        res = requests.get(API_URL + f'/r/{subreddit}/top', headers=headers,
                           params=payload, timeout=30)
        if res.status_code != 200:
            print(f'Response: {res.status_code}')
            break

        #get dataframe from response
        new_df = df_from_res(res)

        # An empty page means the listing is exhausted; there is no last row
        # to build the next cursor from.
        if new_df.empty:
            break
        pages.append(new_df)

        #last entry of this page is the cursor for the next one
        last = new_df.iloc[-1]

        #add/update fullname ("<kind>_<id>") for params
        payload['after'] = last['kind'] + '_' + last['id']

    if pages:
        top_posts = pd.concat(pages, ignore_index=True)
        # sort_values returns a new frame; the result has to be kept.
        top_posts = top_posts.sort_values('created_utc', ascending=False)
    else:
        top_posts = pd.DataFrame()

    print('...Done!\n')

    return top_posts

In [7]:
# Ask for a subreddit interactively, then download its top posts for the month.
chosen_subreddit = get_sub()
askreddit_500 = get_top_500(chosen_subreddit, 'month')

Subreddit to search: r/ askreddit


Found r/AskReddit!
Collecting top posts from r/AskReddit...
...Done!



In [8]:
def get_hour(df):
    """Return the ``hour`` attribute of ``df``.

    Despite the parameter name, the notebook applies this to the individual
    values of the ``created_utc`` column rather than to a DataFrame.
    """
    hour = df.hour
    return hour

def get_data_sum(df, target):
    """Sum one column of ``df`` per hour of day of ``created_utc``.

    Args:
        df: DataFrame with a ``created_utc`` column whose values expose an
            ``hour`` attribute, plus the ``target`` column.
        target: Name of the column to total.

    Returns:
        pandas.Series indexed 0..23 holding the sum of ``target`` over the
        rows created in that hour; hours without rows get 0.
    """
    # The hour of each row does not depend on the loop variable, so compute it
    # once instead of re-applying get_hour for each of the 24 hours.
    hours = df['created_utc'].apply(get_hour)
    column = df[target]
    return pd.Series({hour: column[hours == hour].sum() for hour in range(24)})

def get_hourly_stats(df):
    """Aggregate post statistics by hour of day of ``created_utc``.

    Args:
        df: DataFrame with ``created_utc``, ``comments``, ``awards``,
            ``score`` and ``upvote_ratio`` columns.

    Returns:
        pandas.DataFrame with one row per hour that has at least one post,
        sorted by ``hour``, with columns ``hour``, ``posts_per_hour``,
        ``comments_per_hour``, ``awards_per_hour``, ``avg_comments/post``,
        ``avg_score/post``, ``avg_awards/post`` and ``avg_up_ratio``
        (averages rounded to 2 decimals).
    """
    stats_df = pd.DataFrame()
    # value_counts() only lists hours that occur, and its index becomes the
    # frame's index; the 24-hour sums assigned below are aligned to it.
    stats_df['posts_per_hour'] = df['created_utc'].apply(get_hour).value_counts()
    stats_df['comments_per_hour'] = get_data_sum(df, 'comments')
    stats_df['awards_per_hour'] = get_data_sum(df, 'awards')

    posts = stats_df['posts_per_hour']
    stats_df['avg_comments/post'] = (stats_df['comments_per_hour'] / posts).round(2)
    stats_df['avg_score/post'] = (get_data_sum(df, 'score') / posts).round(2)
    stats_df['avg_awards/post'] = (stats_df['awards_per_hour'] / posts).round(2)
    stats_df['avg_up_ratio'] = (get_data_sum(df, 'upvote_ratio') / posts).round(2)

    # Name the index explicitly. reset_index() only yields a column called
    # 'index' when the index is unnamed, and pandas >= 2.0 names the
    # value_counts() index after the source column.
    stats_df = (
        stats_df.rename_axis('hour')
        .reset_index()
        .sort_values('hour')
        .reset_index(drop=True)
    )

    return stats_df

def display_stats(df):
    """Print the date range of ``df`` and show its highest-scoring posts.

    Args:
        df: DataFrame with ``created_utc``, ``score``, ``id`` and ``kind``
            columns.

    Returns:
        pandas.DataFrame of the rows whose ``score`` is strictly greater than
        the 90th percentile of ``score``, sorted by ``score`` descending,
        without the ``id`` and ``kind`` columns. The same frame is also
        passed to ``display``.
    """
    # Take min/max directly; describe(datetime_is_numeric=True) was removed
    # in pandas 2.0, and this avoids positional lookups on a labelled Series.
    oldest = df['created_utc'].min()
    newest = df['created_utc'].max()

    # Same value describe() reports under the '90%' label.
    thresh_gt90 = df['score'].quantile(0.9)
    top10percent = (
        df[df['score'] > thresh_gt90]
        .sort_values('score', ascending=False)
        .drop(columns=['id', 'kind'])
    )

    print(f'Oldest Post: {oldest}')
    print(f'Newest Post: {newest}')
    print("Top 10% of posts by 'Score'...\n")
    display(top10percent)

    return top10percent

In [9]:
# Print the date range of the downloaded posts and keep the top-scoring subset.
askreddit_top10p = display_stats(askreddit_500)

Oldest Post: 2022-03-21 05:36:45
Newest Post: 2022-04-19 22:56:00
Top 10% of posts by 'Score'...



Unnamed: 0,title,selftext,ups,downs,upvote_ratio,score,comments,awards,created_utc
0,Chris Rock is performing a standup gig tonight...,,91187,0,0.75,91187,10573,57,2022-03-30 14:46:44
4,"What is your best insult, WITHOUT using curse ...",,65133,0,0.83,65133,23978,91,2022-04-19 13:11:21
1,"People with ‘street smarts’, what is your most...",,61731,0,0.9,61731,29480,97,2022-04-01 11:09:26
2,"Old Redditors, what's a must-know story of Red...",,54944,0,0.86,54944,12961,91,2022-04-08 08:09:10
3,Your toilet is now sentient. Would you prefer ...,,54903,0,0.77,54903,5234,89,2022-04-03 18:11:59
5,“Go to work naked day” is now a mandatory nati...,,53458,0,0.78,53458,17701,28,2022-03-28 16:37:27
6,You have a gun to your head and was told to re...,,52466,0,0.73,52466,46010,46,2022-04-18 13:27:28
7,What survival myth is completely wrong and can...,,48049,0,0.93,48049,18539,58,2022-04-14 03:59:53
8,"What is a unspoken, universal rule all males k...",,47085,0,0.82,47085,22543,87,2022-03-22 16:18:57
9,"Without revealing your age, what video game di...",,47043,0,0.79,47043,81450,86,2022-03-29 14:31:17


In [10]:
# Aggregate the downloaded posts by hour of creation; the bare name on the last
# line makes the notebook render the resulting table.
askreddit_stats = get_hourly_stats(askreddit_500)
askreddit_stats

Unnamed: 0,hour,posts_per_hour,comments_per_hour,awards_per_hour,avg_comments/post,avg_score/post,avg_awards/post,avg_up_ratio
0,0,27,115819,136,4289.59,5735.11,5.04,0.91
1,1,25,103768,186,4150.72,6305.16,7.44,0.9
2,2,20,129271,174,6463.55,10846.9,8.7,0.91
3,3,13,51091,134,3930.08,8674.92,10.31,0.91
4,4,15,68922,169,4594.8,6576.07,11.27,0.91
5,5,18,86443,143,4802.39,5997.28,7.94,0.91
6,6,14,72001,169,5142.93,8232.57,12.07,0.9
7,7,7,18541,21,2648.71,3318.57,3.0,0.93
8,8,16,139557,375,8722.31,18713.12,23.44,0.87
9,9,16,51883,65,3242.69,3642.5,4.06,0.92


# To Find Images

In [11]:
# payload = {'t': 'all', 'limit': 5}
# imghtml = ''

# imghtml += '<h3 style="clear:both">MechanicalKeyboards</h3><div>'
# r = requests.get(API_URL + '/r/MechanicalKeyboards/top', headers=headers, params=payload)
# js = r.json()
# for i in range(js['data']['dist']):
#     if js['data']['children'][i]['data']['thumbnail'] == '':
#         continue
#     imghtml += '<span style="float:left"><a href="{}"><img src="{}" title="{}" target="_blank" \></a></span>'.format(
#         js['data']['children'][i]['data']['url'],
#         js['data']['children'][i]['data']['thumbnail'],
#         js['data']['children'][i]['data']['title'],
#     )
# imghtml += '</div>'

# from IPython.display import display, HTML
# display(HTML(imghtml))

In [12]:
# with open("keyboards.html", "w", encoding='utf-8') as html_page:
    # html_page.write(imghtml)