In [1]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [2]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']
        
        posts.append(post)

    return posts

In [3]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
posts = get_subreddit_posts(subreddit_url)

Getting posts from /r/programming...


In [4]:
df_posts = pd.DataFrame(posts)
df_posts.head()

Unnamed: 0,title,score,url,created_utc,num_comments
0,I Made an Extension for Visual Debugging in VS...,494,https://reddit.com/r/programming/comments/f88z...,1582462000.0,22
1,Let's Build a Simple Database,287,https://reddit.com/r/programming/comments/f83w...,1582429000.0,40
2,Taking care of code … more and more code,35,https://reddit.com/r/programming/comments/f879...,1582450000.0,11
3,In depth guide to running Elasticsearch in pro...,6,https://reddit.com/r/programming/comments/f8cc...,1582478000.0,1
4,Rust/WinRT coming soon,8,https://reddit.com/r/programming/comments/f891...,1582462000.0,9


In [5]:
df_posts.shape

(100, 5)

In [6]:
df_posts["created_utc"] = df_posts["created_utc"].apply(datetime.fromtimestamp)

In [7]:
df_posts.rename(columns={'created_utc':'date_hour'},inplace=True)

In [8]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments
0,I Made an Extension for Visual Debugging in VS...,494,https://reddit.com/r/programming/comments/f88z...,2020-02-23 13:43:44,22
1,Let's Build a Simple Database,287,https://reddit.com/r/programming/comments/f83w...,2020-02-23 04:30:06,40
2,Taking care of code … more and more code,35,https://reddit.com/r/programming/comments/f879...,2020-02-23 10:18:50,11
3,In depth guide to running Elasticsearch in pro...,6,https://reddit.com/r/programming/comments/f8cc...,2020-02-23 18:13:41,1
4,Rust/WinRT coming soon,8,https://reddit.com/r/programming/comments/f891...,2020-02-23 13:48:03,9
...,...,...,...,...,...
95,Working with strings in Rust,157,https://reddit.com/r/programming/comments/f6q1...,2020-02-20 09:19:18,52
96,"A competitor to Google Cloud Vision, Amazon Te...",11,https://reddit.com/r/programming/comments/f71o...,2020-02-21 00:13:22,2
97,Writing the database layer: Command/query obje...,0,https://reddit.com/r/programming/comments/f7co...,2020-02-21 16:28:23,0
98,Cloud Vision API will not return gendered labe...,128,https://reddit.com/r/programming/comments/f6py...,2020-02-20 09:10:12,460


In [9]:
df_posts['date'] = [d.date() for d in df_posts['date_hour']]
df_posts['time'] = [d.time() for d in df_posts['date_hour']]

In [10]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments,date,time
0,I Made an Extension for Visual Debugging in VS...,494,https://reddit.com/r/programming/comments/f88z...,2020-02-23 13:43:44,22,2020-02-23,13:43:44
1,Let's Build a Simple Database,287,https://reddit.com/r/programming/comments/f83w...,2020-02-23 04:30:06,40,2020-02-23,04:30:06
2,Taking care of code … more and more code,35,https://reddit.com/r/programming/comments/f879...,2020-02-23 10:18:50,11,2020-02-23,10:18:50
3,In depth guide to running Elasticsearch in pro...,6,https://reddit.com/r/programming/comments/f8cc...,2020-02-23 18:13:41,1,2020-02-23,18:13:41
4,Rust/WinRT coming soon,8,https://reddit.com/r/programming/comments/f891...,2020-02-23 13:48:03,9,2020-02-23,13:48:03
...,...,...,...,...,...,...,...
95,Working with strings in Rust,157,https://reddit.com/r/programming/comments/f6q1...,2020-02-20 09:19:18,52,2020-02-20,09:19:18
96,"A competitor to Google Cloud Vision, Amazon Te...",11,https://reddit.com/r/programming/comments/f71o...,2020-02-21 00:13:22,2,2020-02-21,00:13:22
97,Writing the database layer: Command/query obje...,0,https://reddit.com/r/programming/comments/f7co...,2020-02-21 16:28:23,0,2020-02-21,16:28:23
98,Cloud Vision API will not return gendered labe...,128,https://reddit.com/r/programming/comments/f6py...,2020-02-20 09:10:12,460,2020-02-20,09:10:12


In [11]:
df_posts.date.value_counts()

2020-02-22    42
2020-02-21    27
2020-02-23    21
2020-02-20     9
2020-02-19     1
Name: date, dtype: int64

#### Creating my own ddbb.

In [13]:
df_posts.to_csv('23Feb.csv')