In [1]:
## Collect Reddit content via public API
import json
import praw

In [2]:
# IMPORTANT: enter your Reddit API access credentials in the config file (the config_reddit module);
# make sure that file is located in the same folder as this notebook
import config_reddit

In [3]:
# connect to the Reddit API with the application credentials from config_reddit;
# the trailing expression displays whether the client is in read-only mode
reddit = praw.Reddit(
    user_agent=f"Content exploration by {config_reddit.app_name}",
    client_id=config_reddit.app_id,
    client_secret=config_reddit.app_secret,
)
reddit.read_only

Version 7.7.1 of praw is outdated. Version 7.8.1 was released Friday October 25, 2024.


True

In [4]:
# name of the subreddit to analyze -- MODIFY this to your own choice
#
# Use the last part of the subreddit URL, e.g. for
#  https://www.reddit.com/r/travel/
# the name is 'travel'
query_subreddit: str = "travel"

In [5]:
# number of top "hot" posts to request from the subreddit;
# very active subreddits may run into data query limitations, so keep this moderate
nposts: int = 20

In [6]:
# gather the ids of the current "hot" posts of the chosen subreddit (at most nposts)
subreddit = reddit.subreddit(query_subreddit)
post_ids = [post.id for post in subreddit.hot(limit=nposts)]
# display the number of posts (submissions) that were collected
len(post_ids)

20

In [7]:
# preview one post: title and body text of the first collected submission
post_details = reddit.submission(id=post_ids[0])
print(post_details.title, post_details.selftext, sep="\n")

All Layover Questions - READ THIS NOTICE
**READ THE NEW LAYOVER FAQ:** [**https://www.reddit.com/r/travel/wiki/mfaq-flying/layovers**](https://www.reddit.com/r/travel/wiki/mfaq-flying/layovers)

All layover questions will be removed unless your situation is unique and cannot be answered by the wiki.

**Members of the community**: please report any layover questions that can be answered by the wiki and we will remove them promptly.

Self-transfers times are not covered under this new guideline and wiki.


In [8]:
# intended number of top comments to gather for each post
# (handed to collect_post_data as `ncomments`);
# NOTE: a larger value may dilute the content with irrelevant text
ncomments: int = 10

In [9]:
# function to collect post data
def collect_post_data(post_id, ncomments, reddit):
    """Collect title, body text and the top comments of a single submission.

    Args:
        post_id: id of the submission to fetch.
        ncomments: maximum number of top-level comments to keep; None keeps
            every top-level comment that was loaded, values below 0 count as 0.
        reddit: client object providing `submission(id=...)`
            (a `praw.Reddit` instance in this notebook).

    Returns:
        dict with keys 'id', 'title', 'text', 'comments_lev1' (bodies of the
        kept top-level comments) and 'comments_lev2' (bodies of the direct
        replies to those kept comments, in the same order).
    """
    psubm = reddit.submission(id = post_id)
    pdata = {'id': post_id, 'title': psubm.title, 'text': psubm.selftext}

    # `replace_more(limit=...)` controls how many "load more comments" placeholders
    # are expanded (one API request each) -- it is NOT a cap on the number of comments.
    # limit=0 only strips the placeholders, so no extra requests are made and every
    # remaining item in the forest is a real comment with a `.body`.
    psubm.comments.replace_more(limit = 0)

    # apply the requested cap to the top-level comments
    top_comments = list(psubm.comments)
    if ncomments is not None:
        top_comments = top_comments[:max(ncomments, 0)]

    # collect first- and second-level comments
    pcomm = []
    psubcomm = []
    for top_comment in top_comments:
        pcomm.append(top_comment.body)
        psubcomm.extend(lev2_comment.body for lev2_comment in top_comment.replies)

    # assemble the data together
    pdata['comments_lev1'] = pcomm
    pdata['comments_lev2'] = psubcomm

    return pdata

In [10]:
# build one record per collected post id;
# if the server returns an error, try lowering the number of posts requested
posts_all = list(
    collect_post_data(pid, ncomments, reddit)
    for pid in post_ids
)

In [11]:
# save collected data to a JSON file (you can inspect it in a web browser, load into pandas etc.)
file_out = "raw_post_comment_data.json"
# explicit encoding so the written file does not depend on the platform's locale default
with open(file_out, mode='w', encoding='utf-8') as f:
    json.dump(posts_all, f, indent = 2)