In [12]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [15]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['nsfw'] = raw_post['over_18']

        posts.append(post)

    return posts


def get_comments_from_post(post_url):
    post_url = add_json_sufix(post_url)

    print(f"Getting comments from {post_url}...")
    response = get_with_headers(post_url)
    raw_comments = response.json()[1]['data']['children']

    comments = []

    for raw_comment in raw_comments:
        comment = {}
        try:
            raw_comment = raw_comment['data']
            comment['score'] = raw_comment.get('score', 0)
            comment['content'] = raw_comment['body']
            comment['created_utc'] = raw_comment['created_utc']
            comments.append(comment)
        except:
            pass

    return comments


@lru_cache(maxsize=32)
def get_all_comments_from_subreddit(subreddit_url):
    posts = get_subreddit_posts(subreddit_url)
    all_comments = []
    for post in posts:
        all_comments += get_comments_from_post(post['url'])
    return all_comments

In [16]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
comments = get_all_comments_from_subreddit(subreddit_url)

Getting posts from /r/programming...
Getting comments from https://reddit.com/r/programming/comments/ery1da/natural_language_processing_for_web_developers/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erqqwn/net_everywhere_apparently_also_means_windows_311/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/ersxx5/the_edge_of_emulation/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erwa65/what_is_rust_and_why_is_it_so_popular/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erzcem/from_rwebdev_awesome_article_on_good_programming/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erfd6h/the_2038_problem_is_already_affecting_some_systems/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erxovb/seq_a_language_for_bioinformatics/.json?limit=100...
Getting comments from https://reddit.com/r/

Getting comments from https://reddit.com/r/programming/comments/erunwv/why_do_we_fall_into_the_rewrite_trap/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/ers433/devjourney_podcast_episode_84_molly_struve_an/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/ernusk/solving_problems_properly_is_often_not_viable/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/eru570/http_server_socket_that_responds_to_a_browsers/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erheoo/redpoint_a_webnative_computational_notebook_for/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/ervup9/qa_is_evil/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/erq93l/the_quickselect_algorithm_efficiently_finding_the/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/ers553/sha1_is_a_sham

In [17]:
df = pd.DataFrame(comments)
df.head()

Unnamed: 0,score,content,created_utc
0,67,This doesn't explain how to actually expose th...,1579636000.0
1,5,"[First example, autocomplete](https://transfor...",1579639000.0
2,108,&gt; Michal learned that the object files that...,1579599000.0
3,291,"Oh, now THIS is the kind of dedicated pointles...",1579592000.0
4,117,&gt; He also removes the need for the garbage ...,1579616000.0


In [18]:
df.shape

(288, 3)

In [19]:
df["created_utc"] = df["created_utc"].apply(datetime.fromtimestamp)

In [20]:
df.head()

Unnamed: 0,score,content,created_utc
0,67,This doesn't explain how to actually expose th...,2020-01-21 20:45:42
1,5,"[First example, autocomplete](https://transfor...",2020-01-21 21:36:20
2,108,&gt; Michal learned that the object files that...,2020-01-21 10:36:03
3,291,"Oh, now THIS is the kind of dedicated pointles...",2020-01-21 08:40:10
4,117,&gt; He also removes the need for the garbage ...,2020-01-21 15:15:15


In [24]:
df['date'] = [d.date() for d in df['created_utc']]
df['time'] = [d.time() for d in df['created_utc']]

In [26]:
df.head()

Unnamed: 0,score,content,created_utc,date,time
0,67,This doesn't explain how to actually expose th...,2020-01-21 20:45:42,2020-01-21,20:45:42
1,5,"[First example, autocomplete](https://transfor...",2020-01-21 21:36:20,2020-01-21,21:36:20
2,108,&gt; Michal learned that the object files that...,2020-01-21 10:36:03,2020-01-21,10:36:03
3,291,"Oh, now THIS is the kind of dedicated pointles...",2020-01-21 08:40:10,2020-01-21,08:40:10
4,117,&gt; He also removes the need for the garbage ...,2020-01-21 15:15:15,2020-01-21,15:15:15


In [28]:
df.date.value_counts()

2020-01-21    143
2020-01-20    138
2020-01-19      7
Name: date, dtype: int64