# Retrieve Post Comments

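In this notebook, we iterate through the weekly post lists saved under `../output/post_lists/` and retrieve the comments for each post, writing one output file per week to `../output/comments/`.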

In [3]:
import requests
import pandas as pd
import json
import praw
from praw.models import MoreComments
import re
from datetime import date, timedelta
import time
import os
from configparser import ConfigParser

## Set up Reddit Instance
This code reads info necessary for using the Reddit API from a config file (for an example of how I have the file set up, see `sampleconfig.ini`).

In [4]:
# Load the Reddit API credentials from the config file that lives one
# directory above this notebook.
config_path = "../config.ini"
config_object = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
# Fail here with a clear message instead of a later KeyError on 'REDDITINFO'.
if not config_object.read(config_path):
    raise FileNotFoundError(
        "Could not read Reddit credentials from " + os.path.abspath(config_path)
    )

reddit_info = config_object['REDDITINFO']
reddit = praw.Reddit(
    client_id=reddit_info['client_id'],
    client_secret=reddit_info['client_secret'],
    user_agent=reddit_info['user_agent'],
    username=reddit_info['username'],
    password=reddit_info['password']
)

## Setup

In [6]:
# Subreddit whose post lists are read and whose comments are written out.
subreddit = "naturalhair"

# Date window to process; daterange() walks it in 7-day steps and stops
# before reaching end_date.
start_date, end_date = date(2018, 1, 1), date(2022, 3, 28)

In [7]:
# Make sure the per-subreddit comment output directory exists.
# makedirs creates any missing parent directories ('../output',
# '../output/comments') and exist_ok=True makes this cell safe to re-run.
os.makedirs('../output/comments/' + subreddit, exist_ok=True)

In [10]:
def daterange(start_date, end_date):
    """Yield dates from start_date up to (but not including) end_date, 7 days apart."""
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 7

def get_post_comments(submission):
    """Gather a submission's text and its first two comment levels, grouped by author.

    Parameters
    ----------
    submission
        Object exposing ``id``, ``author``, ``title``, ``selftext`` and an
        iterable ``comments`` whose items expose ``author``, ``body`` and
        ``replies``.

    Returns
    -------
    dict
        ``{'id': submission.id, 'comments': {author_name: text}}`` where
        ``text`` is every piece of text written by that author (post title and
        selftext first for the poster, then comments in iteration order)
        joined with ``'&&&'``. ``'AutoModerator'`` is dropped. If the
        submission has no author, ``comments`` is an empty dict.
    """
    texts_by_author = {}

    def record(comment):
        """Store comment.body under its author; return False if it was skipped."""
        # Skips comments whose author is None, and any item with no ``author``
        # attribute at all (PRAW's ``MoreComments`` placeholders), so the
        # latter no longer raise AttributeError at the top level.
        author = getattr(comment, 'author', None)
        if author is None:
            return False
        texts_by_author.setdefault(author.name, []).append(comment.body)
        return True

    if submission.author is not None:
        texts_by_author[submission.author.name] = [submission.title, submission.selftext]

        for top_level_comment in submission.comments:
            # When a top-level comment is skipped, its replies are skipped too.
            if not record(top_level_comment):
                continue
            for second_level_comment in top_level_comment.replies:
                record(second_level_comment)

    # Collapse each author's list of texts into a single delimited string.
    post_comments = {
        author_name: '&&&'.join(texts)
        for author_name, texts in texts_by_author.items()
    }
    post_comments.pop('AutoModerator', None)

    return ({
        'id': submission.id,
        'comments': post_comments
    })

# Loop it up y'all

In [11]:
# Holds the per-post results for the week currently being processed.
all_posts = []

for single_date in daterange(start_date, end_date):
    date_label = single_date.strftime("%Y-%m-%d")
    output_filename = '../output/comments/' + subreddit + "/" + date_label + ".txt"
    postlist_filename = '../output/post_lists/' + subreddit + "/" + date_label + ".txt"

    # An existing output file means this week was already processed; skip it
    # so the cell can be re-run to resume after an interruption.
    if os.path.isfile(output_filename):
        continue

    print(date_label)

    # if we dont have a post list, skip
    if not os.path.isfile(postlist_filename):
        continue

    # get list of links from post_lists file; the context manager closes the
    # file handle, and blank lines are dropped so they are not sent as URLs
    with open(postlist_filename, 'r') as postlist_file:
        links = [line.strip() for line in postlist_file if line.strip()]
    if len(links) == 0:
        continue

    # fetch every post for this time frame
    all_posts = [get_post_comments(reddit.submission(url=url)) for url in links]

    # one row per (author, post): reset_index turns the author names into a column
    pd.concat(
        [pd.DataFrame.from_dict(post).reset_index() for post in all_posts]
    ).to_csv(output_filename, index=False)

2018-01-01
2018-01-08
2018-01-15
2018-01-22
2018-01-29
2018-02-05
2018-02-12
2018-02-19
2018-02-26
2018-03-05
2018-03-12
2018-03-19
2018-03-26
2018-04-02
2018-04-09
2018-04-16
2018-04-23
2018-04-30
2018-05-07
2018-05-14
2018-05-21
2018-05-28
2018-06-04
2018-06-11
2018-06-18
2018-06-25
2018-07-02
2018-07-09
2018-07-16
2018-07-23
2018-07-30
2018-08-06
2018-08-13
2018-08-20
2018-08-27
2018-09-03
2018-09-10
2018-09-17
2018-09-24
2018-10-01
2018-10-08
2018-10-15
2018-10-22
2018-10-29
2018-11-05
2018-11-12
2018-11-19
2018-11-26
2018-12-03
2018-12-10
2018-12-17
2018-12-24
2018-12-31
2019-01-07
2019-01-14
2019-01-21
2019-01-28
2019-02-04
2019-02-11
2019-02-18
2019-02-25
2019-03-04
2019-03-11
2019-03-18
2019-03-25
2019-04-01
2019-04-08
2019-04-15
2019-04-22
2019-04-29
2019-05-06
2019-05-13
2019-05-20
2019-05-27
2019-06-03
2019-06-10
2019-06-17
2019-06-24
2019-07-01
2019-07-08
2019-07-15
2019-07-22
2019-07-29
2019-08-05
2019-08-12
2019-08-19
2019-08-26
2019-09-02
2019-09-09
2019-09-16
2019-09-23