# Reddit Subreddit Classification
---

## Problem Statement
---

## Import Libraries
---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import datetime

## Data Acquisition

In [83]:
# Pushshift endpoint for searching Reddit submissions; extract_reddit_data
# appends the query string (subreddit, size, after, sort) to this base URL.
base_url = "https://api.pushshift.io/reddit/search/submission"

In [114]:
# Function to request submissions from the Pushshift API and return them as a list of dicts
# sleep code inspired by: https://realpython.com/python-sleep/

def extract_reddit_data(subreddit, size, after, timeout=30, max_retries=None):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    The request is repeated (with a 5 second pause between attempts) until
    the server answers with HTTP 200. Network-level failures such as
    timeouts or dropped connections are retried the same way as non-200
    responses.

    Parameters
    ----------
    subreddit : str
        Value for the ``subreddit`` query parameter.
    size : str or int
        Value for the ``size`` query parameter (number of submissions requested).
    after : str or int
        Value for the ``after`` query parameter. Callers in this notebook pass
        either a relative string such as ``"180d"`` or an epoch-seconds string.
    timeout : float, optional
        Seconds to wait for the server before treating the attempt as failed.
        Without a timeout a stalled connection would block forever.
    max_retries : int or None, optional
        Maximum number of failed attempts before giving up. ``None`` (the
        default) retries indefinitely.

    Returns
    -------
    list of dict
        One dict per submission with the keys ``id``, ``created_utc``,
        ``title``, ``selftext``, ``subreddit``, ``subreddit_id`` and ``url``.
        Fields missing from the API response default to ``""``.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and that many attempts have failed.
    """
    url = base_url + f"?subreddit={subreddit}&size={size}&after={after}&sort=asc"
    failed_attempts = 0
    while True:
        print(f'Making request at: {url}')
        try:
            res = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as err:
            # Treat transport errors like a bad status code: log and retry.
            print(f'Request failed: {err}')
        else:
            print(f'Request response code: {res.status_code}')
            if res.status_code == 200:
                break
        failed_attempts += 1
        if max_retries is not None and failed_attempts >= max_retries:
            raise RuntimeError(
                f'Giving up on {url} after {failed_attempts} failed attempts'
            )
        print('Trying again')
        time.sleep(5)

    data = res.json()['data']
    print(f'Length of Data: {len(data)}')

    # Only these fields are kept; the tuple order fixes the column order of
    # any DataFrame built from the returned records.
    fields = (
        "id",
        "created_utc",
        "title",
        "selftext",
        "subreddit",
        "subreddit_id",
        "url",
    )
    return [
        {field: submission.get(field, "") for field in fields}
        for submission in data
    ]

In [115]:
def extract_all_submissions(subreddit, after):
    """Download a subreddit's submissions page by page and save each page as a CSV.

    Starting from ``after``, pages of up to 100 submissions are requested via
    ``extract_reddit_data``. Each page is written to
    ``./data/{subreddit}-{start}.csv`` where ``start`` is the ``after`` value
    (first page) or the timestamp of the last submission of the previous page.
    Paging continues until the last submission seen was created today (local
    time) or the API returns an empty page.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to download.
    after : str
        Starting point passed through as the ``after`` query parameter of the
        first request (e.g. ``"180d"``).

    Returns
    -------
    None
        Results are only written to disk.
    """
    import os  # local import so this cell does not depend on edits to the import cell

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs("./data", exist_ok=True)

    # Get first set of submissions
    print('Getting first set of submissions')
    first = extract_reddit_data(subreddit, "100", after)
    if not first:
        # Nothing to save and no timestamp to continue from.
        print('No submissions returned; nothing to download')
        return
    pd.DataFrame(first).to_csv(f"./data/{subreddit}-{after}.csv")

    last_timestamp = datetime.datetime.fromtimestamp(first[-1]['created_utc'])

    while last_timestamp.date() < datetime.date.today():
        # Pause between requests to avoid hammering the API.
        time.sleep(5)
        print(f'Getting submissions after {last_timestamp}')
        next_submissions = extract_reddit_data(subreddit, "100", str(int(last_timestamp.timestamp())))
        if not next_submissions:
            # An empty page has no last element to read a timestamp from;
            # treat it as the end of the available data.
            print('No more submissions returned; stopping')
            break
        pd.DataFrame(next_submissions).to_csv(f"./data/{subreddit}-{last_timestamp}.csv")
        last_timestamp = datetime.datetime.fromtimestamp(next_submissions[-1]['created_utc'])
        

In [119]:
# Download r/AskMen submissions starting from the "180d" `after` value
# (presumably 180 days ago — confirm against the Pushshift docs); each page
# of results is written to its own CSV under ./data/.
extract_all_submissions("AskMen", "180d")

Getting first set of submissions
Making request at: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=100&after=180d&sort=asc
Request response code: 200
Length of Data: 99
Getting submissions after 2021-09-25 17:41:05
Making request at: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=100&after=1632609665&sort=asc
Request response code: 200
Length of Data: 99
Getting submissions after 2021-09-25 22:06:06
Making request at: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=100&after=1632625566&sort=asc
Request response code: 200
Length of Data: 100
Getting submissions after 2021-09-26 05:51:51
Making request at: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=100&after=1632653511&sort=asc
Request response code: 200
Length of Data: 100
Getting submissions after 2021-09-26 10:42:30
Making request at: https://api.pushshift.io/reddit/search/submission?subreddit=AskMen&size=100&after=1632670950&sort=as

KeyboardInterrupt: 