# Reddit Data Scraper

### Import Libraries

In [2]:
import os
import time
import datetime
import pandas as pd
import praw
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from io import StringIO
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
from tqdm import tqdm
import json

## First let's get S&P 500 Tickers from Wikipedia. This way we have a scope for our analysis

In [24]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(url)
soup = BeautifulSoup(response.text)
table = soup.find("table", {"class": "wikitable"})
sp500 = pd.read_html(StringIO(str(table)))[0]
sp500.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


# Reddit API Client Initialization
First let's load the api environment to collect all the necessary data. 
The idea is to download posts and comments from different subreddits. 
- Subreddits: 'wallstreetbets', 'stocks', 'investing', 'StockMarket'
- General Search & Stock Specific search

In [3]:
# First we will load the environment
load_dotenv('api.env')
# Import the ids and secret keys from our environment
id = os.getenv('REDDIT_CLIENT_ID')
secret = os.getenv('REDDIT_CLIENT_SECRET')
agent = os.getenv('REDDIT_USER_AGENT', 'StockDataScraper v1.0')
reddit = praw.Reddit(client_id = id,
                     client_secret = secret,
                     user_agent = agent)

## Let's now collect all of the data
#### Data Collection Function

In [None]:
def get_reddit_data(reddit, subreddit_name, data_type='posts', search_term=None, 
                   time_filter='year', limit=200, comment_limit=30):
    """This is the general function that I will loop through in order to download
      all the Reddit Data"""
    
    subreddit = reddit.subreddit(subreddit_name)
    posts_list = []
    comments_list = []
    
    try:
        # Determine which data to fetch
        if data_type == 'search' and search_term:
            print(f"Searching for '{search_term}' in r/{subreddit_name}...")
            posts = subreddit.search(search_term, limit=limit)
            search_keywords = [search_term.lower(), f"${search_term.lower()}"]
        else:
            print(f"Getting top posts from r/{subreddit_name} for {time_filter}...")
            posts = subreddit.top(time_filter=time_filter, limit=limit)
            search_keywords = None
            
        # Process posts
        for i, post in enumerate(posts):
            # Filter search results if needed
            if search_keywords and not any(kw in (post.title + " " + post.selftext).lower() for kw in search_keywords):
                continue
                
            # Extract post data
            post_data = {
                'post_id': post.id,
                'title': post.title,
                'selftext': post.selftext,
                'score': post.score,
                'upvote_ratio': post.upvote_ratio,
                'created_utc': datetime.datetime.fromtimestamp(post.created_utc),
                'num_comments': post.num_comments,
                'author': str(post.author),
                'permalink': post.permalink,
                'url': post.url,
                'is_self': post.is_self,
                'flair': post.link_flair_text,
                'subreddit': subreddit_name,
                'category': 'stock_specific' if data_type == 'search' else 'general'
            }
            
            # Add search term if applicable
            if search_term:
                post_data['search_term'] = search_term
                
            posts_list.append(post_data)
            
            # Get comments
            try:
                post.comments.replace_more(limit=0)
                for comment in post.comments.list()[:comment_limit]:
                    # Filter comments for search terms if needed
                    if search_keywords and not any(kw in comment.body.lower() for kw in search_keywords):
                        continue
                        
                    comment_data = {
                        'comment_id': comment.id,
                        'post_id': post.id,
                        'parent_id': comment.parent_id,
                        'body': comment.body,
                        'score': comment.score,
                        'created_utc': datetime.datetime.fromtimestamp(comment.created_utc),
                        'author': str(comment.author),
                        'subreddit': subreddit_name,
                        'category': 'stock_specific' if data_type == 'search' else 'general'
                    }
                    
                    # Add search
                    if search_term:
                        comment_data['search_term'] = search_term
                        
                    comments_list.append(comment_data)
            except Exception as e:
                print(f"Error processing comments for post {post.id}: {e}")
                
            # Be nice to Reddit's servers
                
        print(f"Found {len(posts_list)} posts and {len(comments_list)} comments")
        return pd.DataFrame(posts_list) if posts_list else pd.DataFrame(), \
               pd.DataFrame(comments_list) if comments_list else pd.DataFrame()
               
    except Exception as e:
        print(f"Error fetching data from r/{subreddit_name}: {e}")
        return pd.DataFrame(), pd.DataFrame()

#### Here we are downloading General top Posts from the following subreddits. These posts are not stock specific rather, they are the top posts a user might see when looking at the following subreddits for the past year

In [29]:
# Get S&P 500 tickers
sp500_tickers = sp500['Symbol'].tolist()
# List of finance subreddits
subreddits = ['Economics','finance','wallstreetbets', 'stocks', 'investing']
# Initialize aggregated DataFrames

all_posts = pd.DataFrame()
all_comments = pd.DataFrame()

# Loop through all the subreddits defined above and collect the posts and comments data
for subreddit in subreddits:
    posts_df, comments_df = get_reddit_data(reddit, subreddit,data_type='posts',
                                            limit=200,comment_limit=30)
    all_posts = pd.concat([all_posts, posts_df],ignore_index=True)
    all_comments = pd.concat([all_comments, comments_df],ignore_index=True)
    time.sleep(1)

Getting top posts from r/Economics for year...
Found 200 posts and 5935 comments
Getting top posts from r/finance for year...
Found 100 posts and 1870 comments
Getting top posts from r/wallstreetbets for year...
Found 200 posts and 6000 comments
Getting top posts from r/stocks for year...
Found 200 posts and 5985 comments
Getting top posts from r/investing for year...
Found 200 posts and 5876 comments


In [31]:
# Safe all posts and comments into our file
all_posts.to_csv('./data/reddit/general_posts.csv',index = False)
all_comments.to_csv('./data/reddit/general_comments.csv',index = False)

#### Let's now use the same function to collect stock specific reddit data

In here we don't want to scrape data for all of the subreddits but only keeping the few popular ones. However, we will download for all the stocks in the S&P500
I will chose r/stocks and r/wallstreetbets due to it's recent popularity

In [32]:
subreddits = ['stocks','wallstreetbets']
all_posts = pd.DataFrame()
all_comments = pd.DataFrame()
for subreddit in subreddits:
    for ticker in sp500_tickers:
        posts_df , comments_df = get_reddit_data(
            reddit, subreddit, data_type='search',search_term=ticker,
            limit=50, comment_limit=20
        )
        all_posts= pd.concat([all_posts, posts_df],ignore_index = True)
        all_comments = pd.concat([all_comments,comments_df],ignore_index=True)
        time.sleep(1)    

Searching for 'MMM' in r/stocks...
Found 37 posts and 48 comments
Searching for 'AOS' in r/stocks...
Found 14 posts and 1 comments
Searching for 'ABT' in r/stocks...
Found 39 posts and 17 comments
Searching for 'ABBV' in r/stocks...
Found 43 posts and 91 comments
Searching for 'ACN' in r/stocks...
Found 46 posts and 3 comments
Searching for 'ADBE' in r/stocks...
Found 46 posts and 56 comments
Searching for 'AMD' in r/stocks...
Found 50 posts and 400 comments
Searching for 'AES' in r/stocks...
Found 23 posts and 6 comments
Searching for 'AFL' in r/stocks...
Found 18 posts and 5 comments
Searching for 'A' in r/stocks...
Found 50 posts and 919 comments
Searching for 'APD' in r/stocks...
Found 21 posts and 16 comments
Searching for 'ABNB' in r/stocks...
Found 40 posts and 74 comments
Searching for 'AKAM' in r/stocks...
Found 39 posts and 15 comments
Searching for 'ALB' in r/stocks...
Found 32 posts and 43 comments
Searching for 'ARE' in r/stocks...
Found 50 posts and 278 comments
Searching

In [33]:
all_posts.to_csv('./data/reddit/posts_stock_specific.csv')
all_comments.to_csv('./data/reddit/comments_stock_specific.csv')

### Now that we have the reddit data we will be downloading the Financial Data from EDGAR so we can analyze and compute Financial data mainly from the 10K