In [1]:
import ast
import pandas as pd
import scipy

from data_scraper import Scraper

In [2]:
# Load the Reddit API credentials, which are stored in a text file as a
# Python dictionary literal.

# The `with` block guarantees the file handle is closed even if reading raises.
with open("reddit_credentials.txt", "r") as file:
    contents = file.read()

# ast.literal_eval only parses Python literals, so the file's contents are
# never executed as code.
reddit_credentials = ast.literal_eval(contents)

In [3]:
# Scrape sentiment data

# Pull each credential out of the imported dictionary into its own variable.
# The key order below matches the order of the names on the left-hand side.
client_id, client_secret, user_agent, username, password = (
    reddit_credentials[key]
    for key in ("client_id", "client_secret", "user_agent", "username", "password")
)

# Scraper class takes 9 arguments:
# Date to start scraping data
# Date to stop scraping data
# Subreddit to scrape
# Query term (only comments that contain this term, on posts that also contain it, are collected)
# Also: client_id, client_secret, user_agent, username, and password

# Output is a CSV file containing the following columns for each scraped comment:
# ID of comment
# Date that comment was posted ('%Y-%m-%d')
# Body of the comment
# Score (upvotes) of comment
# Positive sentiment score
# Negative sentiment score
# Neutral sentiment score
# Compound sentiment score
# The CSV file will be named "<subreddit>_<query>.CSV"
# and is written to the same directory as the notebook.

# While running, the class prints the current date being collected. 
# I found this to be useful when debugging issues with the API hanging. 

# Here, we're collecting all comments made in the /r/conservative subreddit
# where the term "vaccine" was present in the initial post and the comment 
# (from the date range 2020-07-15 to 2021-07-15). For each date, only the first
# 100 posts containing the query are scraped, and for each post, 
# only the first 100 comments containing the query are scraped.
# This caps the total possible comments at 10,000 per day,
# which should be plenty of data to analyze without abusing the API too much.

# Scrape r/conservative for "vaccine" over the one-year window.
Scraper(
    "2020-07-15",   # start date
    "2021-07-15",   # stop date
    "conservative",  # subreddit
    "vaccine",      # query term
    client_id,
    client_secret,
    user_agent,
    username,
    password,
).main()

Version 7.0.0 of praw is outdated. Version 7.3.0 was released Thursday June 17, 2021.


2020-07-15
2020-07-16
2020-07-17
2020-07-18
2020-07-19
2020-07-20
2020-07-21
2020-07-22
2020-07-23
2020-07-24
2020-07-25
2020-07-26
2020-07-27
2020-07-28
2020-07-29
2020-07-30
2020-07-31
2020-08-01
2020-08-02
2020-08-03




2020-08-04
2020-08-05
2020-08-06
2020-08-07
2020-08-08




2020-08-09
2020-08-10
2020-08-11
2020-08-12
2020-08-13
2020-08-14
2020-08-15
2020-08-16
2020-08-17
2020-08-18
2020-08-19
2020-08-20
2020-08-21
2020-08-22
2020-08-23
2020-08-24
2020-08-25
2020-08-26
2020-08-27
2020-08-28
2020-08-29
2020-08-30
2020-08-31
2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30
2020-10-01
2020-10-02
2020-10-03
2020-10-04
2020-10-05
2020-10-06
2020-10-07
2020-10-08
2020-10-09
2020-10-10
2020-10-11
2020-10-12
2020-10-13
2020-10-14
2020-10-15
2020-10-16
2020-10-17
2020-10-18
2020-10-19
2020-10-20
2020-10-21
2020-10-22
2020-10-23
2020-10-24
2020-10-25
2020-10-26
2020-10-27
2020-10-28
2020-10-29
2020-10-30
2020-10-31
2020-11-01
2020-11-02
2020-11-03
2020-11-04
2020-11-05
2020-11-06
2020-11-07

In [4]:
# Collect the same data (same date range and query) from the r/politics subreddit.
Scraper(
    "2020-07-15",  # start date
    "2021-07-15",  # stop date
    "politics",    # subreddit
    "vaccine",     # query term
    client_id,
    client_secret,
    user_agent,
    username,
    password,
).main()

2020-07-15
2020-07-16
2020-07-17
2020-07-18
2020-07-19
2020-07-20
2020-07-21
2020-07-22
2020-07-23
2020-07-24
2020-07-25
2020-07-26
2020-07-27
2020-07-28
2020-07-29
2020-07-30
2020-07-31
2020-08-01
2020-08-02
2020-08-03
2020-08-04
2020-08-05
2020-08-06
2020-08-07
2020-08-08
2020-08-09
2020-08-10
2020-08-11
2020-08-12
2020-08-13
2020-08-14
2020-08-15
2020-08-16
2020-08-17
2020-08-18
2020-08-19
2020-08-20
2020-08-21
2020-08-22
2020-08-23
2020-08-24
2020-08-25
2020-08-26
2020-08-27
2020-08-28
2020-08-29
2020-08-30
2020-08-31
2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30
2020-10-01
2020-10-02
2020-10-03
2020-10-04
2020-10-05
2020-10-06
2020-10-07
2020-10-08
2020-10-09
2020-10-10
2020-10-11
2020-10-12
2020-10-13

In [5]:
# And also from the r/COVID19 subreddit (same date range and query).
Scraper(
    "2020-07-15",  # start date
    "2021-07-15",  # stop date
    "COVID19",     # subreddit
    "vaccine",     # query term
    client_id,
    client_secret,
    user_agent,
    username,
    password,
).main()

2020-07-15
2020-07-16
2020-07-17
2020-07-18
2020-07-19
2020-07-20
2020-07-21
2020-07-22
2020-07-23
2020-07-24
2020-07-25
2020-07-26
2020-07-27
2020-07-28
2020-07-29
2020-07-30
2020-07-31
2020-08-01
2020-08-02
2020-08-03
2020-08-04
2020-08-05
2020-08-06
2020-08-07
2020-08-08
2020-08-09
2020-08-10
2020-08-11
2020-08-12
2020-08-13
2020-08-14
2020-08-15
2020-08-16
2020-08-17
2020-08-18
2020-08-19
2020-08-20
2020-08-21
2020-08-22
2020-08-23
2020-08-24
2020-08-25
2020-08-26
2020-08-27
2020-08-28
2020-08-29
2020-08-30
2020-08-31
2020-09-01
2020-09-02
2020-09-03
2020-09-04
2020-09-05
2020-09-06
2020-09-07
2020-09-08
2020-09-09
2020-09-10
2020-09-11
2020-09-12
2020-09-13
2020-09-14
2020-09-15
2020-09-16
2020-09-17
2020-09-18
2020-09-19
2020-09-20
2020-09-21
2020-09-22
2020-09-23
2020-09-24
2020-09-25
2020-09-26
2020-09-27
2020-09-28
2020-09-29
2020-09-30
2020-10-01
2020-10-02
2020-10-03
2020-10-04
2020-10-05
2020-10-06
2020-10-07
2020-10-08
2020-10-09
2020-10-10
2020-10-11
2020-10-12
2020-10-13