### Imports

**THIS NOTEBOOK USES REQUESTS TO GET DATA FROM REDDIT'S API**

In [1]:
import getpass
import os
import time

import numpy as np
import pandas as pd
import requests

### API credentials 

In [2]:
client_id = 'NSeSVr03ZJ8ov2YBVvK6vw'
client_secret =  'VJbMqoBH0yMzhUXtvrtBUY6IgwLcXQ'
user_agent = 'navi dsb-826'
username = 'OkCommunity6752'
password =  getpass.getpass() # reddit password

 ········


In [12]:
# Create function to get access token 
def get_token():
    auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
    data = {
        'grant_type': 'password',
        'username': username,
        'password': password
    }
    # Create an informative header for your application
    headers = {'User-Agent': 'navina/0.0.1'}

    res = requests.post(
        'https://www.reddit.com/api/v1/access_token',
        auth=auth,
        data=data,
        headers=headers)
    
    # Retrieve access token
    token = res.json()['access_token']
    return token 

In [4]:
# Create function to get posts from subreddit
def get_posts(subreddit, token, limit=100, before = None):
    headers = {'Authorization': f'bearer {token}', 'User-Agent': user_agent}
    url = f'https://oauth.reddit.com/r/{subreddit}/new'
    params = {'limit': limit, 'before': before, 'sort': 'created', 'sort_order': 'asc'}

    print(f"Requesting posts from r/{subreddit}")
    res = requests.get(url, headers=headers, params=params)
    # Check if the request was successful
    if res.status_code != 200:
        print(f"Error {res.status_code}: {res.text}")
        return pd.DataFrame(), None  # Return an empty DataFrame on error

    print(f"Received response from r/{subreddit}")
    posts = res.json()['data']['children']
    after = res.json()['data']['before'] 

    posts_list = []
    for post in posts: 
        post_list = post['data']
        if post_list['title'] and post_list['selftext']:
            posts_list.append({
                'subreddit': subreddit, 
                'title': post_list['title'],
                'body': post_list['selftext'],
                'created': post_list['created_utc'],
                'score': post_list['score'],
                'num_comments': post_list['num_comments'],
                'id': post_list['id']
            })
    print(f"Parsed {len(posts_list)} posts from r/{subreddit}")
    return pd.DataFrame(posts_list), before

In [5]:
# Function to filter unique posts
def unique_filter(df, new_posts):
    # filtering based on ID rather than all columns, two posts can be created at the same time but ID is unique for every post 
    return new_posts[~new_posts['id'].isin(df['id'])]

In [6]:
# Read the existing csv
try:
    all_posts = pd.read_csv('subreddit_posts.csv')
    print(f'{len(all_posts)} posts loaded from subreddit_posts.csv')
# Create new dataframe if file is not found
except FileNotFoundError:
    all_posts = pd.DataFrame(columns=['subreddit', 'title', 'body', 'created', 'score', 'num_comments', 'id'])
    print('File not found. Creating a new file.')

2099 posts loaded from subreddit_posts.csv


In [7]:
# Identify what the last post IDs for each subreddit were
last_personalfinance_id = all_posts[all_posts['subreddit'] == 'personalfinance']['id'].min() if 'id' in all_posts.columns else None
last_investing_id = all_posts[all_posts['subreddit'] == 'investing']['id'].min() if 'id' in all_posts.columns else None

### Pull data 

In [9]:
# Loop to collect posts
while True:
    try:
        # Print statement to make sure the loop is looping
        print('Beginning Loop...')
        # Get access token
        token = get_token()

        # Pull posts from both subreddits
        personalfinance_posts, last_personalfinance_id = get_posts('personalfinance', token, limit=100, before=last_personalfinance_id)
        investing_posts, last_investing_id = get_posts('investing', token, limit=100, before= last_investing_id)
        
        # Filter unique posts
        personalfinance_unique = unique_filter(all_posts, personalfinance_posts)
        investing_unique = unique_filter(all_posts, investing_posts)

        # Append the unique posts to the main DataFrame
        all_posts = pd.concat([all_posts, personalfinance_unique, investing_unique], ignore_index=True)
        
        # Remove duplicates 
        all_posts.drop_duplicates(subset=['id'], inplace=True)

        # Save data to CSV after each iteration
        all_posts.to_csv('subreddit-data.csv', mode='w', header=True, index=False)

        # Print progress
        print(f"Collected {len(all_posts[all_posts['subreddit'] == 'personalfinance'])} unique posts from r/personalfinance")
        print(f"Collected {len(all_posts[all_posts['subreddit'] == 'investing'])} unique posts from r/investing")
        print(f"Total posts collected so far: {len(all_posts)}")

        # Check if both subreddits have 3500 unique posts
        if len(all_posts[all_posts['subreddit'] == 'personalfinance']) >= 3000 and len(all_posts[all_posts['subreddit'] == 'investing']) >= 3000:
            print("Collected 3000 unique posts from both subreddits. Stopping the script.")
            break

        # Waiting before looping 
        print("Test: Break Time!")
        time.sleep(61)

    except Exception as e:
        print(f"Error: {e}")
        # Retry after 45 seconds if there is an error
        time.sleep(45)

Beginning Loop...
Requesting posts from r/personalfinance
Received response from r/personalfinance
Parsed 100 posts from r/personalfinance
Requesting posts from r/investing
Received response from r/investing
Parsed 100 posts from r/investing
Collected 1001 unique posts from r/personalfinance
Collected 943 unique posts from r/investing
Total posts collected so far: 1944
Test: Break Time!
Beginning Loop...
Requesting posts from r/personalfinance
Received response from r/personalfinance
Parsed 100 posts from r/personalfinance
Requesting posts from r/investing
Received response from r/investing
Parsed 100 posts from r/investing
Collected 1001 unique posts from r/personalfinance
Collected 943 unique posts from r/investing
Total posts collected so far: 1944
Test: Break Time!


KeyboardInterrupt: 