# Importing Modules

In [1]:
import pandas as pd
from pandas import DataFrame
import requests
from requests.exceptions import Timeout
import json
import time
import os
import pickle

from ratelimiter import RateLimiter

# DATA COLLECTION

In [2]:
# The id file holds one submission id per line. Recent pandas versions reject
# '\n' as a read_csv delimiter, so read the lines directly and build the same
# single-column frame (column label 0) from them.
with open('data/full_ids.txt') as f:
    ids_list = [line.strip() for line in f if line.strip()]  # skip blank lines
ids = pd.DataFrame(ids_list)
print("Total Number of ids: ", len(ids_list))
print(ids.head(5))

Total Number of ids:  116426
        0
0  2m9g6k
1  3b5gc8
2  1vm334
3  5nwxsb
4  3f3qr7


In [3]:
# Small sample for trial runs: label-based .loc slicing includes the end
# label, so this keeps rows 0 through 5 (six ids).
temp_ids = ids.loc[:5, :]
temp_ids_list = temp_ids[0].values.tolist()
# Loop variable is not called `id`, which would shadow the builtin for the
# rest of the kernel session.
for temp_id in temp_ids_list:
    print(temp_id)

2m9g6k
3b5gc8
1vm334
5nwxsb
3f3qr7
23m3im


### Fetching the Data

I am using both endpoints. We are able to retrieve comments by submission id from the comments endpoint **if and only if** the post has comments. If the post doesn't have any comments, the comments endpoint returns no data even though the post exists. So, to still get the post itself when it has no comments, I fall back to the submission endpoint.

In [2]:
def _pushshift_get(service_url, params, submission_id):
    """Send one GET request with a 30-second timeout.

    A timeout is reported on stdout and then re-raised, so the caller never
    carries on with a missing or stale response object.
    """
    try:
        return requests.get(service_url, params=params, timeout=30)
    except Timeout:
        print("The request Time Out", submission_id)
        raise


def _raise_bad_response(response, submission_id):
    """Print the status diagnostics for an unusable response and raise RuntimeError."""
    print("Status code is: {} and the Length of Text is: {}".format(response.status_code, len(response.text)))
    raise RuntimeError(
        "Pushshift request for {} failed: status {}, body length {}".format(
            submission_id, response.status_code, len(response.text)))


def fetchdata(submission_id):
    """Fetch a submission and its comments from the Pushshift API.

    The comments endpoint is queried first. When it answers with status 200
    and a body longer than 100 characters, the list under ``'data'`` is
    returned, with the post record from ``js['aggs']['link_id'][0]['data']``
    prepended when that aggregation is present. Otherwise (except for a 429)
    the submission endpoint is queried, so that a post without comments is
    still retrieved.

    Parameters
    ----------
    submission_id : str
        Id used as ``link_id`` for the comments query and as ``ids`` for the
        submission query.

    Returns
    -------
    list or None
        The ``'data'`` list of whichever endpoint produced a usable body, or
        ``None`` when the submission endpoint answers 200 with a body shorter
        than 20 characters (treated as "nothing found").

    Raises
    ------
    Timeout
        If either request times out.
    RuntimeError
        If an endpoint answers 429, or the submission endpoint returns any
        other response that cannot be used.
    ValueError, KeyError
        If a usable-looking body is not valid JSON or has no ``'data'`` key.
    """
    # parameters for the comment endpoint query
    parms_comments = {
        'sort_type': 'created_utc',
        'sort': 'asc',
        'limit': 30000,
        'aggs': 'link_id',
        'link_id': submission_id
    }

    # parameters for the submission endpoint query
    parms_submission = {
        'sort_type': 'created_utc',
        'sort': 'asc',
        'size': 500,
        'ids': submission_id
    }

    # === COMMENTS END POINT === #
    response = _pushshift_get('https://api.pushshift.io/reddit/search/comment/?',
                              parms_comments, submission_id)

    # A body of 100 characters or fewer is treated as carrying no comments.
    if response.status_code == 200 and len(response.text) > 100:
        js = json.loads(response.text)
        comments_data = js['data']
        try:
            user_post_data = js['aggs']['link_id'][0]['data']
        except (KeyError, IndexError):
            # no post record in the aggregation: return the comments alone
            return comments_data
        return [user_post_data] + comments_data

    if response.status_code == 429:
        # Rate limited: fail explicitly instead of returning an unbound name.
        _raise_bad_response(response, submission_id)

    # === SUBMISSION END POINT === #
    response = _pushshift_get('https://api.pushshift.io/reddit/search/submission/?',
                              parms_submission, submission_id)

    if response.status_code == 200:
        if len(response.text) > 100:
            return json.loads(response.text)['data']
        if len(response.text) < 20:
            # near-empty body: the submission was not found
            return None

    # 429, any other status, or a 200 whose body is between 20 and 100 chars
    _raise_bad_response(response, submission_id)

In [3]:
# Rate Limit of the API
# A timeout keeps this cell from hanging if the meta endpoint does not answer;
# 30 seconds matches the limit fetchdata uses for its own requests.
meta_url = requests.get('https://api.pushshift.io/meta', timeout=30)
meta_url.raise_for_status()  # fail loudly instead of parsing an error page
js_data = json.loads(meta_url.text)
print("Rate Limit /minute:" , js_data['server_ratelimit_per_minute'])

Rate Limit /minute: 120


In [None]:
# Try the fetcher on a single submission id; the cell displays whatever it returns.
fetchdata('2wbu57')

### Extracting the Data

In [4]:
def extract_data(ids_list, output_path="posts_and_comments_new.json", traversed_path='traversed_ids.txt'):
    """Fetch every id in ``ids_list`` and append the results to a JSON-lines file.

    Each id is fetched with ``fetchdata`` after a one-second pause. A
    successful result is written as one ``json.dumps`` line to
    ``output_path`` (opened in append mode). An id whose fetch raises is
    recorded as failed and skipped after a further half-second pause.

    When the loop ends, whether normally or through an exception such as a
    kernel interrupt, a dict ``{'failed_ids': [...], 'success_ids': [...]}``
    is pickled to ``traversed_path``, overwriting any previous content.

    Parameters
    ----------
    ids_list : sequence of str
        Submission ids to fetch, in order.
    output_path : str, optional
        JSON-lines file the fetched data is appended to.
    traversed_path : str, optional
        File that receives the pickled success/failure record.

    Returns
    -------
    None
    """
    # Saving the ids of successful and failed retrievals
    failed_ids = []
    success_ids = []
    dict_ids = {'failed_ids': failed_ids, 'success_ids': success_ids}
    start_time = time.time()

    # `with` guarantees the output file is flushed and closed on every exit path
    with open(output_path, 'a') as js_file:
        try:
            for _n, _id in enumerate(ids_list):
                print(_n, end="\r")
                time.sleep(1)
                try:
                    fetched_data = fetchdata(_id)
                except Exception:
                    # `Exception`, not a bare except, so an interrupt still
                    # stops the run instead of being logged as a failed id
                    failed_ids.append(_id)
                    print("Error at: ", _id, '\n')
                    time.sleep(0.5)
                    continue

                # Use the enumerate index: a list.index() lookup is linear per
                # call and reports the wrong position for duplicate ids.
                if _n % 100 == 0:
                    print("Current Id: ", _n)
                    print("Time Elapsed so far: ", time.time() - start_time)
                success_ids.append(_id)

                # save the returned json data to the opened file
                print(json.dumps(fetched_data), file=js_file)
        finally:
            # Saving the ids, even when the loop was cut short
            with open(traversed_path, 'wb') as f:  # pickling
                pickle.dump(dict_ids, f)

In [6]:
# open('traversed_ids.txt', 'w').close() #erasing the file
# os.remove('temp.json')
# extract_data(failed_ids_list)
# extract_data(temp_ids_list)

In [7]:
# Load the remaining ids, one per line, with surrounding whitespace removed
with open("remaining_ids.txt") as f:
    ids = [line.strip() for line in f]

In [None]:
# Run the extraction on `ids` from position 250000+172645 onward.
# NOTE(review): the offset presumably skips ids handled in earlier runs — confirm.
extract_data(ids[250000+172645:])

In [None]:
# Load the pickled traversal record and report how many ids succeeded / failed
with open('traversed_ids.txt', 'rb') as f:
    items = pickle.load(f)
failed_ids_list = items['failed_ids']
print("Number of Successful Ids: ", len(items['success_ids']))
print("Number of Failed Ids: ", len(failed_ids_list))

In [21]:
# Everything in `ids` from position 250000 onward
s_ids = ids[250000:]

In [None]:
# Number of distinct ids in s_ids (duplicates collapse in the set)
len(set(s_ids))

In [5]:
# Read the collected output back in: one raw line of JSON text per list entry
with open("posts_and_comments_new.json") as f:
    j = list(f)

In [8]:
# Parse every stored line from position 172645 onward into Python objects
res = [json.loads(each) for each in j[172645:]]

In [7]:
# Number of records parsed into res
len(res)

172644