In [138]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [145]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json') else url + '.json'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['nsfw'] = raw_post['over_18']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']

        posts.append(post)

    return posts

In [146]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
posts = get_subreddit_posts(subreddit_url)

Getting posts from /r/programming...


In [147]:
df_posts = pd.DataFrame(posts)
df_posts.head()

Unnamed: 0,title,score,url,nsfw,created_utc,num_comments
0,"Pharo 8.0 (the immersive, pure object oriented...",447,https://reddit.com/r/programming/comments/erd6...,False,1579527000.0,145
1,The 2038 problem is already affecting some sys...,64,https://reddit.com/r/programming/comments/erfd...,False,1579537000.0,29
2,The polygons of Another World on the Super Nin...,113,https://reddit.com/r/programming/comments/erbk...,False,1579516000.0,11
3,"New developers, a piece of advice. Learn a tex...",380,https://reddit.com/r/programming/comments/er7l...,False,1579492000.0,737
4,The Wave/Particle Duality of Git Commits,55,https://reddit.com/r/programming/comments/erdk...,False,1579529000.0,14


In [148]:
df_posts.shape

(25, 6)

In [149]:
df_posts["created_utc"] = df_posts["created_utc"].apply(datetime.fromtimestamp)

In [150]:
df_posts

Unnamed: 0,title,score,url,nsfw,created_utc,num_comments
0,"Pharo 8.0 (the immersive, pure object oriented...",447,https://reddit.com/r/programming/comments/erd6...,False,2020-01-20 14:25:54,145
1,The 2038 problem is already affecting some sys...,64,https://reddit.com/r/programming/comments/erfd...,False,2020-01-20 17:21:16,29
2,The polygons of Another World on the Super Nin...,113,https://reddit.com/r/programming/comments/erbk...,False,2020-01-20 11:30:42,11
3,"New developers, a piece of advice. Learn a tex...",380,https://reddit.com/r/programming/comments/er7l...,False,2020-01-20 04:53:02,737
4,The Wave/Particle Duality of Git Commits,55,https://reddit.com/r/programming/comments/erdk...,False,2020-01-20 15:03:25,14
5,Is a round Minecraft world possible?,23,https://reddit.com/r/programming/comments/ergg...,False,2020-01-20 18:34:45,11
6,Technical Debt Is like a Tetris Game,36,https://reddit.com/r/programming/comments/erdn...,False,2020-01-20 15:09:08,8
7,Why is quicksort better than other sorting alg...,32,https://reddit.com/r/programming/comments/ercg...,False,2020-01-20 13:12:53,37
8,"Analysis of compensation, level, and experienc...",9,https://reddit.com/r/programming/comments/erh9...,False,2020-01-20 19:27:42,4
9,How (not) to start a revolution in decentraliz...,19,https://reddit.com/r/programming/comments/erhq...,False,2020-01-20 19:58:55,2
