# Import Libraries & Tools

In [1]:
import datetime
import os

import pandas as pd
import requests

# Set Up Constants

In [2]:
# Account name plus the credentials read from the environment, so that no
# secret is written into the notebook itself. Each lookup yields None when
# the corresponding environment variable is not set.
REDDIT_USERNAME = 'n0ahhhhh'
REDDIT_PASSWORD = os.getenv('REDDIT_PASSWORD')
APP_ID = os.getenv('REDDIT_APP_ID')
APP_SECRET = os.getenv('REDDIT_SECRET')

# Host that the access-token request is sent to.
BASE_URL = 'https://www.reddit.com/'

# Authentication & Request

In [3]:
# Exchange the account credentials for an access token using the password
# grant; the app id/secret are sent as HTTP basic auth.
data = {'grant_type': 'password', 'username': REDDIT_USERNAME, 'password': REDDIT_PASSWORD}
auth = requests.auth.HTTPBasicAuth(APP_ID, APP_SECRET)
r = requests.post(BASE_URL + 'api/v1/access_token',
                  data=data,
                  headers={'user-agent': f'Sub-Parser by {REDDIT_USERNAME}'},
                  auth=auth)

if r.status_code == 200:
    # Decode the body only on success; an error response is not guaranteed
    # to contain JSON.
    values = r.json()
    for k, v in values.items():
        # The token is a credential: never echo it into the saved cell output.
        shown = '<redacted>' if k == 'access_token' else v
        print(f'{k:>13}: {shown}')
else:
    # Keep `values` defined so later cells fail on the missing key rather
    # than on an undefined name.
    values = {}
    print(f'Response: {r.status_code}')

 access_token: <redacted>
   token_type: bearer
   expires_in: 86400
        scope: *


# Get Self Information

In [4]:
# Authenticated requests go to the OAuth host and carry the token obtained
# by the token request as a bearer Authorization header.
API_URL = 'https://oauth.reddit.com'
token = 'bearer {}'.format(values['access_token'])
headers = {
    'Authorization': token,
    'User-Agent': 'sub-parser by n0ahhhhh',
}

# Fetch the profile of the account the token belongs to.
response = requests.get(f'{API_URL}/api/v1/me', headers=headers)

def convert_epoch(epoch):
    """Convert a Unix timestamp (seconds) to a naive ``datetime`` in UTC.

    Args:
        epoch: Seconds since 1970-01-01 00:00:00 UTC (int or float).

    Returns:
        datetime.datetime: The UTC wall-clock time with ``tzinfo`` set to
        None, matching what ``datetime.utcfromtimestamp`` returned.
    """
    # utcfromtimestamp() is deprecated since Python 3.12; build an aware UTC
    # value and drop the tzinfo so callers keep receiving naive datetimes.
    aware = datetime.datetime.fromtimestamp(epoch, tz=datetime.timezone.utc)
    return aware.replace(tzinfo=None)

# Print a short summary of the authenticated account, or the HTTP status
# when the profile request did not succeed.
if response.status_code == 200:
    # Decode the body once instead of re-parsing it for every printed field.
    me = response.json()
    print(f"Welcome {me['name']}!")
    # Underline sized to the greeting: 'Welcome ' (8) + name + '!' (1).
    print('-' * (len(me['name'])+9))
    print(f"{'Comment Karma:':>16} {me['comment_karma']}")
    print(f"{'Link Karma:':>16} {me['link_karma']}")
    print(f"Account Created: {convert_epoch(me['created_utc'])}")
else:
    print(f'Response: {response.status_code}')

Welcome n0ahhhhh!
-----------------
  Comment Karma: 22229
     Link Karma: 19917
Account Created: 2013-10-23 21:29:15


# Search for Subreddit

In [5]:
def get_sub():
    """Prompt for a subreddit name, report the top search hit, and return the name.

    The typed name is sent to the ``/subreddits/search`` endpoint (5 results,
    sorted by relevance) and the first hit is printed as confirmation. The
    value returned is the name the user typed, not the matched subreddit.

    Returns:
        str: The entered subreddit name with surrounding whitespace removed.
    """
    # Strip stray spaces so the name can be used directly in a URL path.
    subreddit = input('Subreddit to search: r/').strip()
    payload = {'q': subreddit, 'limit': 5, 'sort': 'relevance'}
    response = requests.get(API_URL + '/subreddits/search', headers=headers, params=payload)

    if response.status_code == 200:
        # Decode only on success; an error body may not be JSON.
        matches = response.json()['data']['children']
        if matches:
            print(f"Found {matches[0]['data']['display_name_prefixed']}!")
        else:
            # An empty result list used to raise IndexError on matches[0].
            print(f'No subreddit found matching r/{subreddit}')
    else:
        print(f'Response: {response.status_code}')

    return subreddit

# Get Top Post Data

In [6]:
#create simple dataframe with basic post information
def df_from_res(res):
    """Build a DataFrame with one row per post in a listing response.

    Args:
        res: Response object whose ``.json()`` payload has the posts under
            ``['data']['children']``, each with ``'kind'`` and ``'data'`` keys.

    Returns:
        pandas.DataFrame: Columns id, kind, title, selftext, ups, downs,
        upvote_ratio, score, comments, awards and created_utc (converted with
        ``convert_epoch``). An empty frame when the listing has no posts.
    """
    # Collect plain records and build the frame once; concatenating a new
    # one-row frame per post copies the whole frame on every iteration.
    records = []
    for post in res.json()['data']['children']:
        details = post['data']
        records.append({
            'id': details['id'],
            'kind': post['kind'],
            'title': details['title'],
            'selftext': details['selftext'],
            'ups': details['ups'],
            'downs': details['downs'],
            'upvote_ratio': details['upvote_ratio'],
            'score': details['score'],
            'comments': details['num_comments'],
            'awards': details['total_awards_received'],
            'created_utc': convert_epoch(details['created_utc'])
        })

    return pd.DataFrame(records)

#gets top 500 posts from given subreddit over given timeframe
def get_top_500(subreddit, time):
    """Collect up to 500 top posts of a subreddit, newest first.

    Requests up to five pages of 100 posts from ``/r/<subreddit>/top``. Each
    following page is requested with ``after`` set to the fullname
    (``kind`` + '_' + ``id``) of the last post already received.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        time: Value for the ``t`` query parameter
            (hour, day, week, month, year, all).

    Returns:
        pandas.DataFrame: The collected posts sorted by ``created_utc`` in
        descending order with a fresh 0..n-1 index; an empty frame when
        nothing could be collected.
    """
    payload = {'t': time, 'limit': 100}
    pages = []
    print(f'Collecting top posts from r/{subreddit}...')

    # loop through 5 times (returning up to 500 posts)
    for _ in range(5):
        res = requests.get(API_URL + f'/r/{subreddit}/top', headers=headers, params=payload)
        if res.status_code != 200:
            # Report and keep whatever pages were already collected.
            print(f'Response: {res.status_code}')
            break

        new_df = df_from_res(res)
        if new_df.empty:
            # Listing exhausted; indexing the last row of an empty page
            # would raise IndexError.
            break
        pages.append(new_df)

        # Fullname of the last post on this page marks where the next page starts.
        last = new_df.iloc[-1]
        payload['after'] = last['kind'] + '_' + last['id']

    if pages:
        # sort_values returns a new frame, so its result must be kept;
        # previously it was discarded and the frame stayed unsorted.
        top_posts = (
            pd.concat(pages, ignore_index=True)
            .sort_values('created_utc', ascending=False)
            .reset_index(drop=True)
        )
    else:
        top_posts = pd.DataFrame()
    print('...Done!\n')

    return top_posts

In [7]:
# Each line prompts for a subreddit via get_sub() and then collects that
# subreddit's top posts for the 'month' timeframe.
# NOTE(review): the variable names are fixed but the data depends on what is
# typed at the prompt. In the recorded run the first prompt was answered with
# "mechanicalkeyboards", so blender_500 presumably does not hold r/blender
# posts - confirm or rename.
blender_500 = get_top_500(get_sub(), 'month')
askreddit_500 = get_top_500(get_sub(), 'month')

Subreddit to search: r/ mechanicalkeyboards


Found r/MechanicalKeyboards!
Collecting top posts from r/mechanicalkeyboards...
...Done!



Subreddit to search: r/ askreddit


Found r/AskReddit!
Collecting top posts from r/askreddit...
...Done!



In [8]:
# top_posts['created_utc'].describe(datetime_is_numeric=True)[['min', 'max']]

In [9]:
# percentiles = top_posts['score'].describe([.25, .5, .75, .9])
# percentiles

In [10]:
# #top 25%
# thresh_gt75 = percentiles['75%']
# top_posts[top_posts['score'] > thresh_gt75].sort_values('score', ascending=False)

In [11]:
# #top 10%
# thresh_gt90 = percentiles['90%']
# top_posts[top_posts['score'] > thresh_gt90].sort_values('score', ascending=False)

In [12]:
def get_hour(df):
    """Return the ``hour`` attribute of ``df``.

    Despite the parameter name, this file applies it element-wise to the
    values of the 'created_utc' column rather than to a whole DataFrame.
    """
    hour = df.hour
    return hour

def get_data_sum(df, target):
    """Sum column ``target`` per hour of day of ``df['created_utc']``.

    Args:
        df: DataFrame with a datetime-like 'created_utc' column and a
            numeric ``target`` column.
        target: Name of the column to total.

    Returns:
        pandas.Series: 24 entries indexed 0..23; hours without any row are 0.
    """
    # Extract the hour once and aggregate in a single pass, instead of
    # re-deriving the hour of every row for each of the 24 buckets.
    hours = pd.to_datetime(df['created_utc']).dt.hour
    # Group by a plain array so grouping is positional and the result index
    # carries no name.
    totals = df[target].groupby(hours.to_numpy()).sum()
    # Guarantee one entry per hour, including hours that never occur.
    totals = totals.reindex(range(24), fill_value=0)
    totals.name = None
    return totals

def get_hourly_stats(df):
    """Aggregate post statistics per hour of day.

    Args:
        df: DataFrame with 'created_utc', 'comments', 'awards', 'score' and
            'upvote_ratio' columns.

    Returns:
        pandas.DataFrame: One row per hour that has at least one post, sorted
        by hour, with columns hour, posts_per_hour, comments_per_hour,
        awards_per_hour, avg_comments/post, avg_score/post, avg_awards/post
        and avg_up_ratio (averages rounded to 2 decimals).
    """
    # value_counts only lists hours that occur; the columns assigned below
    # align on that index, so hours without posts are left out.
    posts_per_hour = df['created_utc'].apply(get_hour).value_counts()
    stats_df = pd.DataFrame({'posts_per_hour': posts_per_hour})
    stats_df['comments_per_hour'] = get_data_sum(df, 'comments')
    stats_df['awards_per_hour'] = get_data_sum(df, 'awards')
    stats_df['avg_comments/post'] = (stats_df['comments_per_hour'] / stats_df['posts_per_hour']).round(2)
    stats_df['avg_score/post'] = (get_data_sum(df, 'score') / stats_df['posts_per_hour']).round(2)
    stats_df['avg_awards/post'] = (stats_df['awards_per_hour'] / stats_df['posts_per_hour']).round(2)
    stats_df['avg_up_ratio'] = (get_data_sum(df, 'upvote_ratio') / stats_df['posts_per_hour']).round(2)

    # Name the index explicitly instead of relying on reset_index() to emit a
    # column called 'index'; whether it does depends on the name that
    # value_counts gives its result index, which varies between pandas versions.
    return (
        stats_df.rename_axis('hour')
        .reset_index()
        .sort_values('hour')
        .reset_index(drop=True)
    )

In [13]:
# Per-hour statistics for the posts held in askreddit_500; the bare name on
# the last line makes the frame the cell's displayed output.
top_post_stats = get_hourly_stats(askreddit_500)
top_post_stats

Unnamed: 0,hour,posts_per_hour,comments_per_hour,awards_per_hour,avg_comments/post,avg_score/post,avg_awards/post,avg_up_ratio
0,0,27,115819,136,4289.59,5735.11,5.04,0.91
1,1,25,103768,186,4150.72,6303.88,7.44,0.9
2,2,20,129271,174,6463.55,10844.85,8.7,0.91
3,3,13,51087,134,3929.77,8675.31,10.31,0.91
4,4,15,68920,169,4594.67,6575.33,11.27,0.91
5,5,18,86443,143,4802.39,5998.28,7.94,0.91
6,6,14,71975,169,5141.07,8231.79,12.07,0.9
7,7,7,18541,21,2648.71,3320.43,3.0,0.93
8,8,16,139555,375,8722.19,18711.25,23.44,0.87
9,9,16,51883,65,3242.69,3639.69,4.06,0.92


In [14]:
# Same aggregation for the posts held in blender_500. This rebinds
# top_post_stats, so the result computed from askreddit_500 is no longer
# reachable under that name after this cell runs.
top_post_stats = get_hourly_stats(blender_500)
top_post_stats

Unnamed: 0,hour,posts_per_hour,comments_per_hour,awards_per_hour,avg_comments/post,avg_score/post,avg_awards/post,avg_up_ratio
0,0,10,331,9,33.1,898.5,0.9,0.97
1,1,20,977,33,48.85,958.5,1.65,0.96
2,2,25,1241,37,49.64,500.36,1.48,0.95
3,3,25,656,12,26.24,395.56,0.48,0.96
4,4,8,134,8,16.75,310.75,1.0,0.97
5,5,24,609,21,25.38,504.92,0.88,0.96
6,6,11,775,13,70.45,1056.36,1.18,0.96
7,7,16,884,11,55.25,524.5,0.69,0.95
8,8,13,742,18,57.08,1016.69,1.38,0.97
9,9,19,1177,42,61.95,1291.0,2.21,0.96


### To find images

In [15]:
# payload = {'t': 'all', 'limit': 5}
# imghtml = ''

# imghtml += '<h3 style="clear:both">MechanicalKeyboards</h3><div>'
# r = requests.get(API_URL + '/r/MechanicalKeyboards/top', headers=headers, params=payload)
# js = r.json()
# for i in range(js['data']['dist']):
#     if js['data']['children'][i]['data']['thumbnail'] == '':
#         continue
#     imghtml += '<span style="float:left"><a href="{}"><img src="{}" title="{}" target="_blank" \></a></span>'.format(
#         js['data']['children'][i]['data']['url'],
#         js['data']['children'][i]['data']['thumbnail'],
#         js['data']['children'][i]['data']['title'],
#     )
# imghtml += '</div>'

# from IPython.display import display, HTML
# display(HTML(imghtml))

In [16]:
# with open("keyboards.html", "w", encoding='utf-8') as html_page:
    # html_page.write(imghtml)