In [15]:
import praw
# praw: Python Reddit API Wrapper
# https://praw.readthedocs.io/en/latest/getting_started/installation.html
import pandas as pd
import datetime as dt

In [16]:
# Create the Reddit API client used by the rest of the notebook.
# "bot1" is passed as the site name (presumably a section in praw.ini that
# holds the credentials — confirm); the user agent is supplied explicitly.
reddit = praw.Reddit(
    site_name="bot1",
    user_agent="bot1 user agent",
)

In [17]:
# For the project, Aleszu and I decided to scrape this information about the topics:
# title, score, url, id, number of comments, date of creation, body text.
# (This notebook stores only three fields: title, author and date of creation.)
# This can be done very easily with a for loop (see below),
# but first we need to create a place to store the data. In Python,
# that is usually done with a dictionary. Let’s create it with the following code:

# Column store for the scraped submissions: one independent, initially empty
# list per field, keyed by the column name.
topics_dict = {column: [] for column in ("title", "author", "created")}

In [18]:
# Stream reddit
# for submission in reddit.subreddit("all").stream.submissions():
#    print(submission.title)

In [19]:
# Now we are ready to start scraping the data from the Reddit API. We will iterate through the top
# submissions of the "all" subreddit and append the information to our dictionary.

# Empty every column first so that re-running this cell replaces the previous
# results instead of appending a second copy of every row to topics_dict.
for _column in topics_dict.values():
    _column.clear()

# Walk the top("all") listing of the "all" subreddit (at most 1000 items) and
# record the title, author and creation time of each submission.
for submission in reddit.subreddit("all").top("all", limit=1000):
    topics_dict["title"].append(submission.title)
    # Not every submission exposes author_fullname; store None in that case so
    # the three columns always stay the same length.
    topics_dict["author"].append(getattr(submission, "author_fullname", None))
    # NOTE(review): PRAW documents `created_utc`; confirm `created` is equivalent.
    topics_dict["created"].append(submission.created)

In [20]:
#Python dictionaries, however, are not very easy for us humans to read.
# This is where the Pandas module comes in handy.
# We’ll finally use it to put the data into something that looks like a spreadsheet — in Pandas,
# we call those Data Frames.

#print(topics_dict)

# Build a DataFrame with one column per dictionary key, then leave the frame
# as the cell's last expression so the notebook renders it.
topics_data = pd.DataFrame(data=topics_dict)
topics_data

Unnamed: 0,title,author,created
0,I’ve found a few funny memories during lockdow...,t2_11yd5w,1.592411e+09
1,Times Square right now,t2_cxhbp,1.612030e+09
2,Joe Biden elected president of the United States,t2_fkqop,1.604767e+09
3,The Senate. Upvote this so that people see it ...,t2_nifnj,1.491051e+09
4,My cab driver tonight was so excited to share ...,t2_aa1ng,1.514430e+09
...,...,...,...
995,“If masks were necessary we would have evolved...,t2_3p1sn3e4,1.597059e+09
996,Metal Jesus COVID FREAKOUT,t2_15yuxw,1.606841e+09
997,He did it,t2_38v4u3rb,1.567720e+09
998,Tunak tunak tun,t2_5sul6p80,1.589025e+09


In [21]:
#Fixing the date column

# Reddit uses UNIX timestamps to format date and time.
# Instead of manually converting all those entries, or using a site like www.unixtimestamp.com,
# we can easily write a function in Python to automate that process. We define it, call it,
# and join the new column to the dataset with the following code:

def get_date(created):
    """Convert a UNIX timestamp into a timezone-aware UTC datetime.

    Calling ``datetime.fromtimestamp`` without a timezone yields a naive
    datetime expressed in the local zone of whatever machine runs the
    notebook, so the same epoch value gives different results on different
    machines. Pinning the conversion to UTC makes the result reproducible.

    Parameters
    ----------
    created : int or float
        Seconds since the UNIX epoch.

    Returns
    -------
    datetime.datetime
        The corresponding instant, with ``tzinfo`` set to UTC.
    """
    return dt.datetime.fromtimestamp(created, tz=dt.timezone.utc)

# Run every value of the "created" column through get_date and keep the
# converted Series in _timestamp.
_timestamp = topics_data.loc[:, "created"].apply(get_date)

# assign() returns a new frame that carries the converted values as "timestamp".
topics_data = topics_data.assign(timestamp=_timestamp)

In [22]:
# Write the frame to disk as JSON, using the "records" orientation.
topics_data.to_json(path_or_buf="Data/Reddit/reddit.json", orient="records")
#topics_data.to_csv("Data/Reddit/reddit.csv", sep=";")