In [1]:
import requests
import pandas as pd
from datetime import datetime
import numpy as np
from functools import lru_cache

url = "https://reddit.com/r/programming/.json?limit=100"

headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"

def get_with_headers(url):
    return requests.get(url, headers=headers)

@lru_cache(maxsize=32)
def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {url}...")
    response = get_with_headers(url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['name'] = raw_post['name']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']
        
        posts.append(post)

    return posts

# Getting all posts.
posts = get_subreddit_posts(url)
all_posts= []
    
for post in posts:
    try:
        posts = get_subreddit_posts(url)
        last_value = [ sub['name'] for sub in posts][-1]
        url = "{}&after={}".format(url,last_value)
        all_posts += get_subreddit_posts(url)
        all_posts
        
    except (IndexError):
        pass

# Creating data frame

all_posts = pd.DataFrame(all_posts)

all_posts["created_utc"] = all_posts["created_utc"].apply(datetime.fromtimestamp)
all_posts.rename(columns={'created_utc':'date_hour'},inplace=True)

# Splitting the column into date and time.

all_posts['date'] = [d.date() for d in all_posts['date_hour']]
all_posts['time'] = [d.time() for d in all_posts['date_hour']]

# Adding weekday.

all_posts['weekday'] = [d.weekday() for d in all_posts['date']]


# Creating a copy with only the important columns.

all_posts_score = all_posts[['date', 'score','weekday']].copy()

# Some weekdays are repeated several times.

unique = all_posts_score.date.unique().tolist()
unique_df = pd.DataFrame(unique,columns =['date'])
unique_df['weekday'] = [d.weekday() for d in unique_df['date']]
repeated = unique_df.weekday.value_counts()
repeated = repeated.to_dict()

# Creating a data frame with the number of times a weekday is repeated.

repeated_df = pd.DataFrame.from_dict(repeated, orient='index')

# Creating a data frame with the sum of total score per weekday.

score_per_day = all_posts_score.groupby(by='weekday').agg({'score':['sum']})
posts_per_day = all_posts_score.weekday.value_counts().to_frame()

# Concatenate both data frames into one.

result = pd.concat([posts_per_day, repeated_df,score_per_day], axis=1, sort=False)
result.rename(columns ={0:'repeated_weekday'},inplace=True)


# Divide the columns "weekday" and "score" by "repeated_weekday" to have the number of posts and the score per day.

result['posts_per_day']= result.weekday/result.repeated_weekday
result['score_per_day']= result['score', 'sum']/result.repeated_weekday
result = result.sort_values(['posts_per_day'], ascending = False)
result.reset_index(inplace=True)

# Replacing numbers per days.

result['index'] = result['index'].replace(0,'Monday').replace(1,'Tuesday').replace(2,'Wednesday').replace(3,'Thursday').replace(4,'Friday').replace(5,'Saturday').replace(6,'Sunday')
result.rename = result.rename(columns = {'index':'day'}, inplace = True)


# Dropping useless columns.

result.drop(['weekday', 'repeated_weekday', ('score', 'sum')], axis=1,inplace=True)


result_score = result.sort_values(['score_per_day'], ascending = False)
result_score.reset_index(inplace=True)
print("===========================================================")
print(result)
print("===========================================================")
print(result.day[0] + " is the day that most posts are published.")
highest_score = result_score.day[0]
print("Posts publised on {} are those with highest scores.".format(highest_score))

Getting posts from https://reddit.com/r/programming/.json?limit=100...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8&after=t3_fyguzd...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8&after=t3_fyguzd&after=t3_fxbmwl...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8&after=t3_fyguzd&after=t3_fxbmwl&after=t3_fwiug2...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8&after=t3_fyguzd&after=t3_fxbmwl&after=t3_fwiug2&after=t3_fvicqq...
Getting posts from https://reddit.com/r/programming/.json?limit=100&after=t3_g0erba&after=t3_fzw6g8&after=t3_fyguzd&after=t3_fxbmwl&after=t3_fwiug2&after=t3_fvicqq&aft

In [2]:
all_posts

Unnamed: 0,name,title,score,url,date_hour,num_comments,date,time,weekday
0,t3_g0mde9,Introduction to XGBoost with an Implementation...,0,https://reddit.com/r/programming/comments/g0md...,2020-04-13 18:32:23,0,2020-04-13,18:32:23,0
1,t3_fzjg0q,R.I.P John Conway,2574,https://reddit.com/r/programming/comments/fzjg...,2020-04-11 23:23:34,127,2020-04-11,23:23:34,5
2,t3_g0pmdg,We visualized different actions to stop virus ...,0,https://reddit.com/r/programming/comments/g0pm...,2020-04-13 21:16:57,2,2020-04-13,21:16:57,0
3,t3_g0kuz7,Art of code - Dylan Beattie,0,https://reddit.com/r/programming/comments/g0ku...,2020-04-13 17:15:25,2,2020-04-13,17:15:25,0
4,t3_g0kcfc,A no-nonsense guide to environment variables i...,1,https://reddit.com/r/programming/comments/g0kc...,2020-04-13 16:48:27,0,2020-04-13,16:48:27,0
...,...,...,...,...,...,...,...,...,...
767,t3_fsl6da,Here's a little hobby project I spent a month ...,0,https://reddit.com/r/programming/comments/fsl6...,2020-03-31 22:58:14,6,2020-03-31,22:58:14,1
768,t3_fsfs33,COVID-19 Research Site,2,https://reddit.com/r/programming/comments/fsfs...,2020-03-31 18:15:30,2,2020-03-31,18:15:30,1
769,t3_fsfl9b,Building dark mode on Stack Overflow,1,https://reddit.com/r/programming/comments/fsfl...,2020-03-31 18:05:30,2,2020-03-31,18:05:30,1
770,t3_fsjbp6,Are You a Slave to your Software Knowledge?,0,https://reddit.com/r/programming/comments/fsjb...,2020-03-31 21:21:09,5,2020-03-31,21:21:09,1


In [6]:
all_posts['hour'] = all_posts['date_hour'].dt.hour
all_posts['minute'] = all_posts['date_hour'].dt.minute
all_posts['second'] = all_posts['date_hour'].dt.second

all_posts.hour.value_counts()

15    64
18    51
16    48
14    45
21    44
17    44
19    43
20    42
22    39
12    38
13    34
11    33
8     30
9     25
23    25
0     25
2     24
10    22
1     22
7     17
6     16
4     14
3     14
5     13
Name: hour, dtype: int64

In [72]:
hours_weekday = all_posts.groupby(by=['weekday','hour']).agg({'score':['sum']})#.sort_values(['weekday'],ascending = False)

hours_weekday

Unnamed: 0_level_0,Unnamed: 1_level_0,score
Unnamed: 0_level_1,Unnamed: 1_level_1,sum
weekday,hour,Unnamed: 2_level_2
0,0,17
0,1,3344
0,2,2662
0,3,0
0,4,0
...,...,...
6,19,17
6,20,0
6,21,3
6,22,23


In [71]:
#hours_weekday.columns

#hours_weekday.rename = hours_weekday.rename(columns = {('score', 'sum'):'total_sum'}, inplace = True)
#hours_weeday