In [1]:
import pandas as pd
from psaw import PushshiftAPI
# PSAW: Python Pushshift.io API Wrapper (for comment/submission search)
# https://pypi.org/project/psaw/
# https://psaw.readthedocs.io/en/latest/
# https://github.com/pushshift/api
api = PushshiftAPI()

In [3]:
# The `search_comments` and `search_submissions` methods return generator objects
#gen = api.search_submissions(limit=1000)
#results = list(gen)

#First 10 submissions to /r/politics in 2017, filtering results to url/author/title/subreddit fields.
#The created_utc field will be added automatically (it’s used for paging).

import datetime as dt

start_epoch=int(dt.datetime(2020, 1, 1).timestamp())

results = list(api.search_submissions(after=start_epoch,
                            subreddit='politics',
                            limit=10000))
# filter=['url','author', 'title', 'subreddit']



In [4]:
topics_dict = { "title":[], \
                "score":[], \
                "id":[], \
                "url":[], \
                "comms_num": [], \
                "created": [], \
                "body":[]
            }

In [6]:
for submission in results:
    topics_dict["title"].append(submission.title)
    topics_dict["score"].append(submission.score)
    topics_dict["id"].append(submission.id)
    topics_dict["url"].append(submission.url)
    topics_dict["comms_num"].append(submission.num_comments)
    topics_dict["created"].append(submission.created)
    topics_dict["body"].append(submission.selftext)

In [7]:
# Transform dictionnary to pandas dataframe
topics_dict
#topics_data = pd.DataFrame(topics_dict)

{'title': ["GOP-led Arizona election review closely matches Biden's winning margin",
  'Buy Google Reviews',
  'Arizona audit results reveal Trump lost to Biden by even bigger margin',
  'Opinion | Chris Cuomo Sexually Harassed Me. I Hope He’ll Use His Power to Make Change.',
  'Regulators issue standards to prevent another Texas grid freeze',
  "Trump's election lies would have died out without help from his lackeys",
  'Buy Facebook Ads Accounts',
  'Opinion: Republicans prove they never really wanted police reform. They’re on the wrong side of history.',
  'Ron DeSantis Was a Slam Dunk. Until He Wasn’t.',
  'Buy Facebook Page Reviews - buy Facebook 5 star reviews',
  'The Jan. 6 Plotters Had a Mob. They Also Had a Plan.',
  'Despite his victory in Texas and no credible evidence of widespread fraud, Donald Trump calls for election audit legislation',
  'The Anti-vaccine Con Job Is Becoming Untenable',
  'Chris Cuomo Sexually Harassed Me. I Hope He’ll Use His Power to Make Change.',
 

In [None]:
#Fixing the date column

# Reddit uses UNIX timestamps to format date and time.
# Instead of manually converting all those entries, or using a site like www.unixtimestamp.com,
# we can easily write up a function in Python to automate that process. We define it, call it,
# and join the new column to dataset with the following code:

def get_date(created):
    return dt.datetime.fromtimestamp(created)

_timestamp = topics_data["created"].apply(get_date)

topics_data = topics_data.assign(timestamp = _timestamp)

topics_data.to_json("Data/reddit.json")