In [1]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [2]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['nsfw'] = raw_post['over_18']

        posts.append(post)

    return posts


def get_comments_from_post(post_url):
    post_url = add_json_sufix(post_url)

    print(f"Getting comments from {post_url}...")
    response = get_with_headers(post_url)
    raw_comments = response.json()[1]['data']['children']

    comments = []

    for raw_comment in raw_comments:
        comment = {}
        try:
            raw_comment = raw_comment['data']
            comment['score'] = raw_comment.get('score', 0)
            comment['content'] = raw_comment['body']
            comment['created_utc'] = raw_comment['created_utc']
            comments.append(comment)
        except:
            pass

    return comments


@lru_cache(maxsize=32)
def get_all_comments_from_subreddit(subreddit_url):
    posts = get_subreddit_posts(subreddit_url)
    all_comments = []
    for post in posts:
        all_comments += get_comments_from_post(post['url'])
    return all_comments

In [3]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
comments = get_all_comments_from_subreddit(subreddit_url)

Getting posts from /r/programming...
Getting comments from https://reddit.com/r/programming/comments/f88zom/i_made_an_extension_for_visual_debugging_in_vs/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f83wvo/lets_build_a_simple_database/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f879c5/taking_care_of_code_more_and_more_code/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f89152/rustwinrt_coming_soon/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7upm2/github_microsoftelectionguard_electionguard_is_a/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7q2q3/i_made_a_commandline_script_to_make_glitched/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f8cci3/in_depth_guide_to_running_elasticsearch_in/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comm

Getting comments from https://reddit.com/r/programming/comments/f6ux05/blurhash_extremely_compact_representations_of/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7vjmn/what_are_the_best_software_engineering_principles/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7en6d/a_3d_rendering_engine_written_completely_from/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7rieb/the_full_stack_team_of_the_20s_must_own_its_data/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7d3af/wheres_that_log_file_debugging_failed_docker/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7spce/the_ultraconservative_developer/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f7oxr8/announcing_the_new_spring_website_from_springio/.json?limit=100...
Getting comments from https://reddit.com/r/programming

In [4]:
df = pd.DataFrame(comments)
df.head()

Unnamed: 0,score,content,created_utc
0,14,Work with C++ and C#,1582466000.0
1,6,"For mobile users, [here is the direct link to ...",1582477000.0
2,16,[Here](https://marketplace.visualstudio.com/it...,1582463000.0
3,6,Looks great!!!,1582465000.0
4,3,That's insanely impressive. I wish there was s...,1582474000.0


In [5]:
df.shape

(359, 3)

In [6]:
df["created_utc"] = df["created_utc"].apply(datetime.fromtimestamp)

In [7]:
df.head()

Unnamed: 0,score,content,created_utc
0,14,Work with C++ and C#,2020-02-23 14:59:06
1,6,"For mobile users, [here is the direct link to ...",2020-02-23 17:50:47
2,16,[Here](https://marketplace.visualstudio.com/it...,2020-02-23 14:03:24
3,6,Looks great!!!,2020-02-23 14:38:29
4,3,That's insanely impressive. I wish there was s...,2020-02-23 17:05:32


In [8]:
df['date'] = [d.date() for d in df['created_utc']]
df['time'] = [d.time() for d in df['created_utc']]

In [9]:
df.head()

Unnamed: 0,score,content,created_utc,date,time
0,14,Work with C++ and C#,2020-02-23 14:59:06,2020-02-23,14:59:06
1,6,"For mobile users, [here is the direct link to ...",2020-02-23 17:50:47,2020-02-23,17:50:47
2,16,[Here](https://marketplace.visualstudio.com/it...,2020-02-23 14:03:24,2020-02-23,14:03:24
3,6,Looks great!!!,2020-02-23 14:38:29,2020-02-23,14:38:29
4,3,That's insanely impressive. I wish there was s...,2020-02-23 17:05:32,2020-02-23,17:05:32


In [10]:
df.date.value_counts()

2020-02-21    101
2020-02-22     94
2020-02-23     88
2020-02-20     69
2020-02-19      7
Name: date, dtype: int64

#### Creating my own ddbb of comments.

In [11]:
df.to_csv('23Feb_comments.csv')