In [191]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [216]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']
        
        posts.append(post)

    return posts

In [217]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
posts = get_subreddit_posts(subreddit_url)

Getting posts from /r/programming...


In [218]:
df_posts = pd.DataFrame(posts)
df_posts.head()

Unnamed: 0,title,score,url,created_utc,num_comments
0,Natural Language Processing for Web Developers,176,https://reddit.com/r/programming/comments/ery1...,1579628000.0,10
1,.NET everywhere apparently also means Windows ...,718,https://reddit.com/r/programming/comments/erqq...,1579588000.0,89
2,The Edge of Emulation,277,https://reddit.com/r/programming/comments/ersx...,1579604000.0,29
3,What is Rust and why is it so popular?,98,https://reddit.com/r/programming/comments/erwa...,1579621000.0,127
4,(from /r/webdev) awesome article on good progr...,26,https://reddit.com/r/programming/comments/erzc...,1579634000.0,0


In [219]:
df_posts.shape

(100, 5)

In [220]:
df_posts["created_utc"] = df_posts["created_utc"].apply(datetime.fromtimestamp)

In [221]:
df_posts.rename(columns={'created_utc':'date_hour'},inplace=True)

In [222]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments
0,Natural Language Processing for Web Developers,176,https://reddit.com/r/programming/comments/ery1...,2020-01-21 18:36:08,10
1,.NET everywhere apparently also means Windows ...,718,https://reddit.com/r/programming/comments/erqq...,2020-01-21 07:31:19,89
2,The Edge of Emulation,277,https://reddit.com/r/programming/comments/ersx...,2020-01-21 11:48:57,29
3,What is Rust and why is it so popular?,98,https://reddit.com/r/programming/comments/erwa...,2020-01-21 16:40:59,127
4,(from /r/webdev) awesome article on good progr...,26,https://reddit.com/r/programming/comments/erzc...,2020-01-21 20:08:46,0
...,...,...,...,...,...
95,I am starting a series of live streams setting...,0,https://reddit.com/r/programming/comments/erml...,2020-01-21 01:33:29,0
96,Clang Hacking,1,https://reddit.com/r/programming/comments/erdz...,2020-01-20 15:36:42,0
97,Hidden Computational Power Found in the Arms o...,0,https://reddit.com/r/programming/comments/erdn...,2020-01-20 15:09:52,2
98,What We Learned Hosting Our First-Ever Devops ...,0,https://reddit.com/r/programming/comments/erdi...,2020-01-20 14:58:30,0


In [224]:
df_posts['date'] = [d.date() for d in df_posts['date_hour']]
df_posts['time'] = [d.time() for d in df_posts['date_hour']]

In [225]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments,date,time
0,Natural Language Processing for Web Developers,176,https://reddit.com/r/programming/comments/ery1...,2020-01-21 18:36:08,10,2020-01-21,18:36:08
1,.NET everywhere apparently also means Windows ...,718,https://reddit.com/r/programming/comments/erqq...,2020-01-21 07:31:19,89,2020-01-21,07:31:19
2,The Edge of Emulation,277,https://reddit.com/r/programming/comments/ersx...,2020-01-21 11:48:57,29,2020-01-21,11:48:57
3,What is Rust and why is it so popular?,98,https://reddit.com/r/programming/comments/erwa...,2020-01-21 16:40:59,127,2020-01-21,16:40:59
4,(from /r/webdev) awesome article on good progr...,26,https://reddit.com/r/programming/comments/erzc...,2020-01-21 20:08:46,0,2020-01-21,20:08:46
...,...,...,...,...,...,...,...
95,I am starting a series of live streams setting...,0,https://reddit.com/r/programming/comments/erml...,2020-01-21 01:33:29,0,2020-01-21,01:33:29
96,Clang Hacking,1,https://reddit.com/r/programming/comments/erdz...,2020-01-20 15:36:42,0,2020-01-20,15:36:42
97,Hidden Computational Power Found in the Arms o...,0,https://reddit.com/r/programming/comments/erdn...,2020-01-20 15:09:52,2,2020-01-20,15:09:52
98,What We Learned Hosting Our First-Ever Devops ...,0,https://reddit.com/r/programming/comments/erdi...,2020-01-20 14:58:30,0,2020-01-20,14:58:30


In [226]:
df_posts.date.value_counts()

2020-01-21    61
2020-01-20    36
2020-01-19     3
Name: date, dtype: int64