In [1]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [2]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['created_utc'] = raw_post['created_utc']
        post['num_comments'] = raw_post['num_comments']
        
        posts.append(post)

    return posts

In [3]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
posts = get_subreddit_posts(subreddit_url)

Getting posts from /r/programming...


In [4]:
df_posts = pd.DataFrame(posts)
df_posts.head()

Unnamed: 0,title,score,url,created_utc,num_comments
0,Copyright implications of brute forcing all 12...,514,https://reddit.com/r/programming/comments/f1tu...,1581357000.0,88
1,USWDS: US Gov's surprisingly modern web design...,1035,https://reddit.com/r/programming/comments/f1iv...,1581299000.0,163
2,Someone suggested I should host my website on ...,2186,https://reddit.com/r/programming/comments/f18r...,1581256000.0,150
3,Apache Groovy 3.0 Released,14,https://reddit.com/r/programming/comments/f1rv...,1581349000.0,4
4,Kotlin Census 2019: Call for Respondents,14,https://reddit.com/r/programming/comments/f1qm...,1581343000.0,7


In [5]:
df_posts.shape

(100, 5)

In [6]:
df_posts["created_utc"] = df_posts["created_utc"].apply(datetime.fromtimestamp)

In [7]:
df_posts.rename(columns={'created_utc':'date_hour'},inplace=True)

In [8]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments
0,Copyright implications of brute forcing all 12...,514,https://reddit.com/r/programming/comments/f1tu...,2020-02-10 18:46:00,88
1,USWDS: US Gov's surprisingly modern web design...,1035,https://reddit.com/r/programming/comments/f1iv...,2020-02-10 02:36:12,163
2,Someone suggested I should host my website on ...,2186,https://reddit.com/r/programming/comments/f18r...,2020-02-09 14:41:47,150
3,Apache Groovy 3.0 Released,14,https://reddit.com/r/programming/comments/f1rv...,2020-02-10 16:29:40,4
4,Kotlin Census 2019: Call for Respondents,14,https://reddit.com/r/programming/comments/f1qm...,2020-02-10 14:56:42,7
...,...,...,...,...,...
95,My day as a Software Engineer is about meeting...,0,https://reddit.com/r/programming/comments/f1cg...,2020-02-09 19:10:46,4
96,How to create your programming study plan,0,https://reddit.com/r/programming/comments/f16f...,2020-02-09 10:25:04,0
97,Critical Bluetooth vulnerability in Android,202,https://reddit.com/r/programming/comments/f0eq...,2020-02-07 19:38:38,41
98,I made this program to earn coins for me in cl...,0,https://reddit.com/r/programming/comments/f1aj...,2020-02-09 17:01:47,1


In [9]:
df_posts['date'] = [d.date() for d in df_posts['date_hour']]
df_posts['time'] = [d.time() for d in df_posts['date_hour']]

In [10]:
df_posts

Unnamed: 0,title,score,url,date_hour,num_comments,date,time
0,Copyright implications of brute forcing all 12...,514,https://reddit.com/r/programming/comments/f1tu...,2020-02-10 18:46:00,88,2020-02-10,18:46:00
1,USWDS: US Gov's surprisingly modern web design...,1035,https://reddit.com/r/programming/comments/f1iv...,2020-02-10 02:36:12,163,2020-02-10,02:36:12
2,Someone suggested I should host my website on ...,2186,https://reddit.com/r/programming/comments/f18r...,2020-02-09 14:41:47,150,2020-02-09,14:41:47
3,Apache Groovy 3.0 Released,14,https://reddit.com/r/programming/comments/f1rv...,2020-02-10 16:29:40,4,2020-02-10,16:29:40
4,Kotlin Census 2019: Call for Respondents,14,https://reddit.com/r/programming/comments/f1qm...,2020-02-10 14:56:42,7,2020-02-10,14:56:42
...,...,...,...,...,...,...,...
95,My day as a Software Engineer is about meeting...,0,https://reddit.com/r/programming/comments/f1cg...,2020-02-09 19:10:46,4,2020-02-09,19:10:46
96,How to create your programming study plan,0,https://reddit.com/r/programming/comments/f16f...,2020-02-09 10:25:04,0,2020-02-09,10:25:04
97,Critical Bluetooth vulnerability in Android,202,https://reddit.com/r/programming/comments/f0eq...,2020-02-07 19:38:38,41,2020-02-07,19:38:38
98,I made this program to earn coins for me in cl...,0,https://reddit.com/r/programming/comments/f1aj...,2020-02-09 17:01:47,1,2020-02-09,17:01:47


In [11]:
df_posts.date.value_counts()

2020-02-10    60
2020-02-09    25
2020-02-08    13
2020-02-07     2
Name: date, dtype: int64

#### Creating my own ddbb.

In [25]:
df_posts.to_csv('10Feb.csv')