In [1]:
from functools import lru_cache

import requests
import pandas as pd
from datetime import datetime

In [2]:
headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:66.0) Gecko/20100101 Firefox/66.0"}
REDDIT_ROOT_URL = "https://reddit.com"


def add_json_sufix(url):
    url = REDDIT_ROOT_URL + url if not url.startswith("http") else url
    return url if url.endswith('.json?limit=100') else url + '.json?limit=100'


def get_with_headers(url):
    return requests.get(url, headers=headers)


def get_subreddit_posts(subreddit_url):
    print(f"Getting posts from {subreddit_url}...")
    subreddit_url = add_json_sufix(subreddit_url)
    response = get_with_headers(subreddit_url)
    raw_posts = response.json()['data']['children']

    posts = []
    for raw_post in raw_posts:
        post = {}
        raw_post = raw_post['data']
        post['title'] = raw_post['title']
        post['score'] = raw_post['score']
        post['url'] = REDDIT_ROOT_URL + raw_post['permalink']
        post['nsfw'] = raw_post['over_18']

        posts.append(post)

    return posts


def get_comments_from_post(post_url):
    post_url = add_json_sufix(post_url)

    print(f"Getting comments from {post_url}...")
    response = get_with_headers(post_url)
    raw_comments = response.json()[1]['data']['children']

    comments = []

    for raw_comment in raw_comments:
        comment = {}
        try:
            raw_comment = raw_comment['data']
            comment['score'] = raw_comment.get('score', 0)
            comment['content'] = raw_comment['body']
            comment['created_utc'] = raw_comment['created_utc']
            comments.append(comment)
        except:
            pass

    return comments


@lru_cache(maxsize=32)
def get_all_comments_from_subreddit(subreddit_url):
    posts = get_subreddit_posts(subreddit_url)
    all_comments = []
    for post in posts:
        all_comments += get_comments_from_post(post['url'])
    return all_comments

In [3]:
subreddit_url = "/r/programming" # https://reddit.com/r/askreddit
comments = get_all_comments_from_subreddit(subreddit_url)

Getting posts from /r/programming...
Getting comments from https://reddit.com/r/programming/comments/f1tuwo/copyright_implications_of_brute_forcing_all/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1ivz0/uswds_us_govs_surprisingly_modern_web_design/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f18rii/someone_suggested_i_should_host_my_website_on_my/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1rvjb/apache_groovy_30_released/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1qmup/kotlin_census_2019_call_for_respondents/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1op1l/minimal_totp_generator_in_20_lines_of_python/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1sx3d/my_fp_journey/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1v8yu

Getting comments from https://reddit.com/r/programming/comments/f1mi4i/mean_stack_tutorial_build_a_full_stack_mean/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1ki3z/calculate_factorial_of_a_large_number_in_c_python/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1cu78/introducing_the_ly_language_the_universal_block/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f0l12r/timegov_was_overhauled_today_save_for_web/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1ks5k/i_couldnt_find_any_new_vr_dev_tutorials_for_unity/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1h50x/python_and_flask_project_management/.json?limit=100...
Getting comments from https://reddit.com/r/programming/comments/f1l5ao/if_you_are_a_student_here_are_a_lot_of_resources/.json?limit=100...
Getting comments from https://reddit.com/r/programm

In [4]:
df = pd.DataFrame(comments)
df.head()

Unnamed: 0,score,content,created_utc
0,247,&gt; ... they have copyrighted every possible ...,1581358000.0
1,15,"Ah yes, a video has three million views on You...",1581363000.0
2,28,Fascinating! I wonder how would things look li...,1581360000.0
3,14,I'm more surprised how this took that long to ...,1581361000.0
4,6,"You can compress that data by just saying ""Gen...",1581363000.0


In [5]:
df.shape

(339, 3)

In [6]:
df["created_utc"] = df["created_utc"].apply(datetime.fromtimestamp)

In [7]:
df.head()

Unnamed: 0,score,content,created_utc
0,247,&gt; ... they have copyrighted every possible ...,2020-02-10 19:03:22
1,15,"Ah yes, a video has three million views on You...",2020-02-10 20:31:01
2,28,Fascinating! I wonder how would things look li...,2020-02-10 19:34:01
3,14,I'm more surprised how this took that long to ...,2020-02-10 19:50:39
4,6,"You can compress that data by just saying ""Gen...",2020-02-10 20:32:14


In [8]:
df['date'] = [d.date() for d in df['created_utc']]
df['time'] = [d.time() for d in df['created_utc']]

In [9]:
df.head()

Unnamed: 0,score,content,created_utc,date,time
0,247,&gt; ... they have copyrighted every possible ...,2020-02-10 19:03:22,2020-02-10,19:03:22
1,15,"Ah yes, a video has three million views on You...",2020-02-10 20:31:01,2020-02-10,20:31:01
2,28,Fascinating! I wonder how would things look li...,2020-02-10 19:34:01,2020-02-10,19:34:01
3,14,I'm more surprised how this took that long to ...,2020-02-10 19:50:39,2020-02-10,19:50:39
4,6,"You can compress that data by just saying ""Gen...",2020-02-10 20:32:14,2020-02-10,20:32:14


In [10]:
df.date.value_counts()

2020-02-10    145
2020-02-08     92
2020-02-09     86
2020-02-07     16
Name: date, dtype: int64

#### Creating my own ddbb of comments.

In [11]:
df.to_csv('10Feb_comments.csv')