In [1]:
import configparser
import datetime as dt
from pathlib import Path

import pandas as pd
# praw: Python Reddit API Wrapper
# https://praw.readthedocs.io/en/latest/getting_started/installation.html
import praw

In [2]:
# Load the personal Reddit settings from dwh.cfg. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes; a missing file still raises immediately.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# Values from the [PERSO] section. The client below does not receive them directly; they are
# only needed if credentials are passed to praw.Reddit explicitly (client_secret=SECRET,
# username=USER, password=PASSWORD). Keep the client id in configuration as well rather than
# hard-coding it in the notebook.
USER = config.get('PERSO','USER_REDDIT')
PASSWORD = config.get('PERSO','PWD_REDDIT')
SECRET = config.get('PERSO','SECRET_REDDIT')

# "bot1" is passed as the site name; presumably a [bot1] section in praw.ini supplies the
# client id/secret for this client (verify against the local praw.ini).
reddit = praw.Reddit("bot1", user_agent="bot1 user agent")

In [3]:
# Handle for the "all" subreddit, obtained from the `reddit` client created in the previous cell.
subreddit = reddit.subreddit('all')

In [7]:
# We are now really close to getting the data in our hands. Iterating over top_subreddit yields
# submissions, and each submission exposes attributes such as title, id and score.
# time_filter is passed by keyword: recent PRAW 7 releases deprecate passing it positionally.

top_subreddit = subreddit.top(time_filter='all', limit=1000)

# To check the results for yourself, uncomment these two lines:
#for submission in subreddit.top(time_filter='all', limit=1000):
#    print(submission.title, submission.id)

In [8]:
#For the project, Aleszu and I decided to scrape this information about the topics:
# title, score, url, id, number of comments, date of creation, body text.
# This can be done very easily with a for loop just like above,
# but first we need to create a place to store the data. On Python,
# that is usually done with a dictionary. Let’s create it with the following code:

# One independent empty list per scraped field, keyed by field name.
# Keys are inserted in the order listed here.
topics_dict = {
    field: []
    for field in ("title", "score", "id", "url", "comms_num", "created", "body")
}

In [9]:
# Now we are ready to start scraping the data from the Reddit API. We will iterate through our top_subreddit object
# and append the information to our dictionary.

# (key in topics_dict, attribute read from the submission), in the order the values are read.
field_sources = (
    ("title", "title"),
    ("score", "score"),
    ("id", "id"),
    ("url", "url"),
    ("comms_num", "num_comments"),
    ("created", "created"),
    ("body", "selftext"),
)

# Append one value per field for every submission, so all lists grow in step.
for submission in top_subreddit:
    for column, attribute in field_sources:
        topics_dict[column].append(getattr(submission, attribute))

In [10]:
#Python dictionaries, however, are not very easy for us humans to read.
# This is where the Pandas module comes in handy.
# We’ll finally use it to put the data into something that looks like a spreadsheet — in Pandas,
# we call those Data Frames.

# Build a DataFrame from topics_dict: each key becomes a column holding that key's list.
topics_data = pd.DataFrame(topics_dict)
# Bare last expression so the notebook renders the frame as this cell's output.
topics_data

Unnamed: 0,title,score,id,url,comms_num,created,body
0,I’ve found a few funny memories during lockdow...,438822,haucpf,https://i.redd.it/f58v4g8mwh551.jpg,19267,1.592411e+09,
1,Times Square right now,425381,l8rf4k,https://v.redd.it/x64z70f7eie61,12641,1.612030e+09,
2,Joe Biden elected president of the United States,365129,jptqj9,https://apnews.com/article/election-2020-joe-b...,28832,1.604767e+09,
3,The Senate. Upvote this so that people see it ...,349293,62sjuh,http://i.imgur.com/ChYwfMq.jpg,4493,1.491051e+09,
4,My cab driver tonight was so excited to share ...,307856,7mjw12,https://i.redd.it/tojcmbvjwk601.jpg,2446,1.514430e+09,
...,...,...,...,...,...,...,...
995,“If masks were necessary we would have evolved...,146017,i733ef,https://i.redd.it/h9zrzgmsu5g51.jpg,2166,1.597059e+09,
996,Metal Jesus COVID FREAKOUT,148372,k4of64,https://v.redd.it/t0a0dkxqsl261,3426,1.606841e+09,
997,He did it,145983,d079np,https://i.redd.it/dkfj1vnjhuk31.jpg,1094,1.567720e+09,
998,Tunak tunak tun,145982,gge3nl,https://i.redd.it/dtb2twg59qx41.jpg,1867,1.589025e+09,


In [11]:
#Fixing the date column

# Reddit uses UNIX timestamps to format date and time.
# Instead of manually converting all those entries, or using a site like www.unixtimestamp.com,
# we can easily write up a function in Python to automate that process. We define it, call it,
# and join the new column to dataset with the following code:

def get_date(created):
    """Convert a Unix epoch timestamp (seconds) to a timezone-aware UTC datetime.

    The epoch value is interpreted in UTC explicitly. Without ``tz``,
    ``datetime.fromtimestamp`` converts to the local time zone of the machine
    running the notebook, so the same scraped data would produce different
    timestamps on different machines.

    Args:
        created: Seconds since the Unix epoch, as an int or float.

    Returns:
        datetime.datetime: The corresponding moment with ``tzinfo`` set to UTC.
    """
    return dt.datetime.fromtimestamp(created, tz=dt.timezone.utc)

# Run get_date over every value of the "created" column to get one converted date per row.
_timestamp = topics_data["created"].apply(get_date)

# Attach the converted values as a "timestamp" column; the result of assign() is bound back to topics_data.
topics_data = topics_data.assign(timestamp = _timestamp)

title        1000
score        1000
id           1000
url          1000
comms_num    1000
created      1000
body         1000
timestamp    1000
dtype: int64

In [12]:
# Persist the scraped frame as JSON. The target directory is created first (a no-op when it
# already exists) so the write does not fail on a fresh checkout that has no Data/ folder.
output_path = Path("Data/reddit.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
topics_data.to_json(output_path)