In [191]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [192]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json') else url + '.json'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']
        
        posts.append(post)

    return posts

In [193]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
posts = get_subreddit_posts(subreddit_url)

Getting posts from /r/programming...


In [194]:
df_posts = pd.DataFrame(posts)
df_posts.head()

Unnamed: 0,title,score,url,created_utc,num_comments
0,Natural Language Processing for Web Developers,165,https://reddit.com/r/programming/comments/ery1...,1579628000.0,6
1,.NET everywhere apparently also means Windows ...,704,https://reddit.com/r/programming/comments/erqq...,1579588000.0,89
2,The Edge of Emulation,270,https://reddit.com/r/programming/comments/ersx...,1579604000.0,29
3,What is Rust and why is it so popular?,82,https://reddit.com/r/programming/comments/erwa...,1579621000.0,114
4,(from /r/webdev) awesome article on good progr...,18,https://reddit.com/r/programming/comments/erzc...,1579634000.0,0


In [195]:
df_posts.shape

(25, 5)

In [196]:
df_posts["created_utc"] = df_posts["created_utc"].apply(datetime.fromtimestamp)

In [197]:
df_posts.rename(columns={'created_utc':'date_hour'},inplace=True)

In [198]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments
0,Natural Language Processing for Web Developers,165,https://reddit.com/r/programming/comments/ery1...,2020-01-21 18:36:08,6
1,.NET everywhere apparently also means Windows ...,704,https://reddit.com/r/programming/comments/erqq...,2020-01-21 07:31:19,89
2,The Edge of Emulation,270,https://reddit.com/r/programming/comments/ersx...,2020-01-21 11:48:57,29
3,What is Rust and why is it so popular?,82,https://reddit.com/r/programming/comments/erwa...,2020-01-21 16:40:59,114
4,(from /r/webdev) awesome article on good progr...,18,https://reddit.com/r/programming/comments/erzc...,2020-01-21 20:08:46,0
5,The 2038 problem is already affecting some sys...,1880,https://reddit.com/r/programming/comments/erfd...,2020-01-20 17:21:16,530
6,Seq — a language for bioinformatics,14,https://reddit.com/r/programming/comments/erxo...,2020-01-21 18:12:25,2
7,The Hunt for the Fastest Zero,71,https://reddit.com/r/programming/comments/err5...,2020-01-21 08:15:25,12
8,Database architecture — using one database sch...,7,https://reddit.com/r/programming/comments/erwy...,2020-01-21 17:25:45,0
9,PHP in 2020,26,https://reddit.com/r/programming/comments/ersa...,2020-01-21 10:27:40,32
