# Data collection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
from datetime import date
import time

import pickle
import os
import sys

In [2]:
def save_obj(obj, filename):
    """Pickle *obj* to the file ``filename + '.pkl'``.

    The file is opened in binary write mode (replacing any existing file)
    and the object is serialized with ``pickle.HIGHEST_PROTOCOL``.
    """
    target = filename + '.pkl'
    with open(target, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in the file ``name + '.pkl'``.

    ``name`` is the path without the ``.pkl`` suffix; the suffix is appended
    here before the file is opened in binary read mode.

    Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when the
    file does not exist.
    """
    # The body previously referenced an undefined ``filename`` instead of the
    # ``name`` parameter, so every call failed with NameError.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [3]:
import time
import requests

class JsonCollector:
    """Collect a subreddit's submissions from the Pushshift search endpoint.

    Posts are fetched in batches from a moving lower time bound up to the
    current time. Each batch is stored in ``self.data`` keyed by post id and
    the lower bound is advanced to the ``created_utc`` of the last post in
    the batch.
    """

    # Seconds to wait on a single HTTP request before giving up, so a stalled
    # connection cannot hang the collection indefinitely. Kept as a class
    # attribute (not set in __init__) so that instances restored from an
    # older pickle still resolve it.
    REQUEST_TIMEOUT = 60

    def __init__(self, mintime, subreddit):
        """Create a collector for ``subreddit`` starting at Unix time ``mintime``."""
        self.url = "https://api.pushshift.io/reddit/search/submission/"
        self.subreddit = subreddit

        # Unix time up to which we have already searched. Each fetch covers
        # self.min up to the current time, and self.min is advanced after
        # every non-empty batch.
        self.min = mintime

        # Dictionary keyed on post id, with value = that post's JSON block.
        self.data = {}

    def fetch_json(self, lower, upper):
        """Return the ``"data"`` list of one search request.

        ``lower`` and ``upper`` are sent as the ``after`` and ``before``
        query parameters; at most 500 results are requested.

        Raises ``requests.HTTPError`` when the response status is not 200.
        """
        r = requests.get(
            self.url,
            params={
                "subreddit": self.subreddit,
                "before": upper,
                "after": lower,
                "size": 500,
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        # Explicit check instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let error responses through.
        if r.status_code != 200:
            raise requests.HTTPError(
                "Pushshift request failed with HTTP status {}".format(
                    r.status_code
                ),
                response=r,
            )
        return r.json()["data"]

    def read_forward(self):
        """Fetch one batch from ``self.min`` to now and record it.

        Returns the fetched list of posts; an empty list means there was
        nothing newer to collect and no state was changed.
        """
        jsonlist = self.fetch_json(self.min, int(time.time()))

        if jsonlist:
            for post in jsonlist:
                self.data[post["id"]] = post
            # NOTE(review): this assumes the API returns the batch oldest
            # first, so the last post carries the newest timestamp; confirm
            # against the API's default sort order.
            self.min = jsonlist[-1]["created_utc"]
        return jsonlist

    def populate(self):
        """Call ``read_forward`` repeatedly until a fetch returns no posts."""
        while self.read_forward():
            # The pushshift api limit is apparently 3Hz, so pause between
            # consecutive requests.
            time.sleep(0.5)

In [4]:
# The following subreddits were founded at certain Unix times:
# /r/nottheonion: 1224968000
# /r/TheOnion: 1206303030
# /r/OnionHeadlines: 1314636831

In [5]:
# Directory holding the pickled collectors; the base file names below are
# appended to it directly, so it ends with a path separator.
DIR = "C:\\Users\\AzNsAnTaGiN\\DSI\\Projects\\project_3\\"

# Base file names (no extension), one per subreddit collector.
FILE1, FILE2, FILE3 = "theonion", "nottheonion", "onionheadlines"

In [6]:
def _load_or_create(basename, mintime, subreddit):
    """Return the collector pickled under DIR + basename, or a fresh one.

    A previously saved collector is reused so collection resumes from its
    stored state; otherwise a new JsonCollector starting at ``mintime`` is
    created for ``subreddit``.
    """
    # save_obj appends '.pkl' to the name it is given, so the existence check
    # has to look for the suffixed path. Checking the bare name never matches
    # and silently discards earlier progress.
    path = DIR + basename + '.pkl'
    if os.path.isfile(path):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook wrote itself.
        with open(path, 'rb') as f:
            return pickle.load(f)
    return JsonCollector(mintime=mintime, subreddit=subreddit)


theonion = _load_or_create(FILE1, mintime=1206303030, subreddit="TheOnion")
nottheonion = _load_or_create(FILE2, mintime=1224968000, subreddit="nottheonion")
# NOTE(review): this start time equals the /r/nottheonion one, while the
# founding-time notes in this notebook list 1314636831 for /r/OnionHeadlines.
# Left unchanged here; confirm which value is intended.
onionheadlines = _load_or_create(FILE3, mintime=1224968000, subreddit="OnionHeadlines")

In [8]:
# Fetch /r/TheOnion posts batch by batch until a request returns no posts
# (populate sleeps 0.5 s between requests).
theonion.populate()

In [9]:
# Fetch /r/nottheonion posts batch by batch until a request returns no posts
# (populate sleeps 0.5 s between requests).
nottheonion.populate()

In [10]:
# Fetch /r/OnionHeadlines posts batch by batch until a request returns no
# posts (populate sleeps 0.5 s between requests).
onionheadlines.populate()

In [11]:
# Show how many distinct posts each collector holds (data is keyed by post id).
display(len(theonion.data))
display(len(nottheonion.data))
display(len(onionheadlines.data))

17039

441717

6531

In [12]:
# Persist each collector under DIR; save_obj appends the '.pkl' suffix.
save_obj(obj=theonion, filename=DIR + FILE1)
save_obj(obj=nottheonion, filename=DIR + FILE2)
save_obj(obj=onionheadlines, filename=DIR + FILE3)

In [13]:
# Total number of posts collected across the three subreddits.
sum(len(collector.data) for collector in (theonion, nottheonion, onionheadlines))

465287