# Getting Reddit Data
https://www.reddit.com/prefs/apps/

There are two options for extracting data from Reddit:

* The `requests` library, which will allow us to interface directly with the Reddit API.

* The PRAW library, which is a wrapper library that adds an extra layer of abstraction in accessing the Reddit API.

Here we will cover the first option, using the `requests` library to interface directly with the API.

The final extraction script will look like this:

In [42]:
import requests
import pandas as pd


class Reddit:
    def __init__(self, client_id, secret_token, username, password):
        # first create authentication object
        auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
        # build login dictionary
        login = {'grant_type': 'password',
                 'username': username,
                 'password': password}
        # setup header info (incl description of API)
        headers = {'User-Agent': 'MyBot/0.0.1'}
        # send request for OAuth token
        res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                            auth=auth, data=login, headers=headers)
        # pull auth bearer token from response
        token = res.json()['access_token']
        # add authorization to headers dictionary
        headers['Authorization'] = f'bearer {token}'
        print(f"Headers: {headers}")
        # add headers dict to internal attributes
        self.headers = headers
        # and api
        self.api = 'https://oauth.reddit.com'

    def get_new(self, subreddit, iters):
        # initialize dataframe to store data
        df = pd.DataFrame()
        # initialize parameters dictionary
        params = {'limit': 100}
        # iterate through several times to make sure we get all the data available
        for i in range(iters):
            # make request
            res = requests.get(f'{self.api}/r/{subreddit}/new',
                               headers=self.headers,
                               params=params)
            
            if i == 0:
                print(f"Json responce:\n{res.json()}")
            # check that we returned something (if not we reached end)
            if len(res.json()['data']['children']) == 0:
                print('No more found')
                return df
            # iterate through each thread recieved
            for thread in res.json()['data']['children']:
                # add info to dataframe
                df = df.append({
                    'id': thread['data']['name'],
                    'created_utc': int(thread['data']['created_utc']),
                    'subreddit': thread['data']['subreddit'],
                    'title': thread['data']['title'],
                    'selftext': thread['data']['selftext'],
                    'upvote_ratio': thread['data']['upvote_ratio'],
                    'ups': thread['data']['ups'],
                    'downs': thread['data']['downs'],
                    'score': thread['data']['score']
                }, ignore_index=True)
            # get earliest ID
            earliest = df['id'].iloc[len(df)-1]
            # add earliest ID to params
            params['after'] = earliest
        return df

In [43]:
SUB = 'investing'

In [44]:
CLIENT_ID = 'Us9dDIEW6kAg75bgmCGWWg'
SECRET_TOKEN = 'qTElaTJKnmEneu8UcRO5PD1U6COTcg'

In [45]:
auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_TOKEN)

In [46]:
USER = 'de_vulpes'
PWD = '1905Sartr!'

In [47]:
reddit = Reddit(CLIENT_ID, SECRET_TOKEN, USER, PWD)

Headers: {'User-Agent': 'MyBot/0.0.1', 'Authorization': 'bearer 30304274887754--aLqy3SxUuUcjA7AxqLRzrmouedJIg'}


In [50]:
data = reddit.get_new(SUB, 20)

Json responce:


  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.ap

No more found


In [51]:
data.tail()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
900,t3_11wvqew,1679346031,investing,Are dividend and high yield mutual funds a goo...,Their share prices have declined as bond rates...,0.5,0,0,0
901,t3_11wvm0t,1679345788,investing,Stock Market News from Today (03/20/2033),***Earnings:***\n\n**Footlocker (FL)**\n\n* EP...,0.43,0,0,0
902,t3_11wvdwf,1679345345,investing,Roth IRA Megabackdoor Contribution Withdrawal,Say I contribute $2000 after-tax money into my...,0.5,0,0,0
903,t3_11wrvly,1679338543,investing,Is there any chance for FRC?,Had an account with them and loved their servi...,0.76,29,0,29
904,t3_11wqvln,1679336753,investing,Negative cash cycle businesses?,What are some businesses with a negative cash ...,0.44,0,0,0


In [52]:
data = data.replace({'|': ''}, regex=True)

In [53]:
data.to_csv(f'./data/reddit_{SUB}.csv', sep='|', index=False)