In [1]:
import praw
from psaw import PushshiftAPI
import datetime as dt
import pandas as pd
import shutil
import os
from tqdm.notebook import tqdm_notebook
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Reddit API credentials are read from the environment so that secrets are not
# stored in (or committed with) the notebook. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before running this cell.
# NOTE(review): the values that used to be hardcoded here should be treated as
# leaked and rotated.
REDDIT_CLIENT_ID = os.environ.get('REDDIT_CLIENT_ID')
REDDIT_CLIENT_SECRET = os.environ.get('REDDIT_CLIENT_SECRET')
REDDIT_USER_AGENT = 'desktop:DSE203 (by u/Life_is_Life)'

if not (REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET):
    # Warn instead of raising: only the PRAW client consumes these values.
    print('WARNING: REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET are not set in the environment.')

# Subreddits whose submissions and comments are searched; joined with ',' when
# passed to the Pushshift search calls.
SUBREDDITS_TO_EXTRACT = (
    # Sources:
    #  - https://thehiveindex.com/topics/investing/platform/reddit/
    #  - https://www.investopedia.com/reddit-top-investing-and-trading-communities-5189322
    'stocks',
    'wallstreetbets',
    'pennystocks',
    'investing',
    'Wallstreetbetsnew',
    'StockMarket',
    'options',
    'RobinHood',
    'RobinHoodPennyStocks',
    'weedstocks',
    'smallstreetbets',
    'SecurityAnalysis',
    'CanadianInvestor',
    'SPACs',
    'InvestmentClub',
    'ValueInvesting',
    'investing_discussion',
    'stonks',
    'shroomstocks',
)

In [3]:
# PRAW client for the Reddit API, configured from the REDDIT_* constants.
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT,
)

# Pushshift client; this is the `api` object used by the search helpers below.
api = PushshiftAPI()

# Get top submissions (Reddit posts)

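Note: Make sure the `top_submissions_and_comments/top_submissions` directory exists before running this (the comments section further below writes to `top_submissions_and_comments/top_comments` in the same way). Any data inside this folder may be overwritten.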

In [4]:
def construct_search_q(stock_symbol, companies_data=None):
    """Build the search query string for one stock symbol.

    The query consists of '$<symbol>', the bare symbol, and every additional
    term listed for the symbol in the companies mapping, joined with '|'
    (presumably interpreted as OR by the search backend -- confirm).

    Args:
        stock_symbol: Ticker symbol; must be a key of the companies mapping.
        companies_data: Optional mapping of symbol -> iterable of extra search
            terms. When omitted, it is loaded from
            '../dse203_final_project/companies.json'.

    Returns:
        The '|'-joined query string.

    Raises:
        KeyError: If stock_symbol is not present in the companies mapping.
    """
    if companies_data is None:
        # Context manager so the file handle is closed on every call; this
        # function is invoked once per job from multiple worker threads.
        with open('../dse203_final_project/companies.json') as companies_file:
            companies_data = json.load(companies_file)

    search_terms = ['$' + stock_symbol, stock_symbol]

    for other_term in companies_data[stock_symbol]:
        # Terms containing a space are wrapped in double quotes
        # (presumably so they are matched as a phrase -- confirm).
        if ' ' in other_term:
            search_terms.append('"' + other_term + '"')
        else:
            search_terms.append(other_term)

    return '|'.join(search_terms)

In [5]:
# Date window for the extraction, as ISO date strings. Both values are passed
# to pd.date_range(..., freq='1MS') further down to generate one month-start
# per month in the window.
START_DATE = '2020-01-01'  # first month to pull
END_DATE = '2021-12-01'    # start of the last month to pull

# Top Submissions

In [7]:
# Maximum number of submissions requested per (symbol, month) search.
# NOTE: the worker function reads this global when it runs, not when it is defined.
LIMIT = 500

# Mapping of stock symbol -> extra search terms. Opened with a context manager
# so the file handle is closed instead of being leaked.
with open('../dse203_final_project/companies.json') as companies_file:
    companies_data = json.load(companies_file)

# One month-start timestamp per month in the extraction window.
month_begins = pd.date_range(START_DATE, END_DATE, freq='1MS')


def get_top_sumbissions(stock_symbol, month_begin):
    """Download one month of top submissions for a stock and save them as JSON.

    Searches the subreddits in SUBREDDITS_TO_EXTRACT through the Pushshift
    client `api`, sorted by number of comments (descending) and capped at the
    module-level LIMIT (looked up when the function runs). The raw result
    dicts are written to
    top_submissions_and_comments/top_submissions/<YYYY_MM_DD>__<symbol>.json.

    Args:
        stock_symbol: Ticker symbol; must be a key of the companies JSON read
            by construct_search_q.
        month_begin: pandas Timestamp for the first day of the month to fetch.

    Returns:
        None. If the destination file already exists the job is skipped, so an
        interrupted run can be resumed by calling again.
    """
    destination_dir = os.path.join('top_submissions_and_comments', 'top_submissions')
    destination_json_file_path = os.path.join(
        destination_dir,
        month_begin.strftime('%Y_%m_%d') + '__' + stock_symbol + '.json',
    )

    if os.path.exists(destination_json_file_path):
        return

    # Both bounds go through Timestamp.timestamp(), which treats a naive
    # Timestamp as UTC. Mixing in datetime.timestamp() for the start bound
    # (local time for naive values) would skew the window by the machine's
    # UTC offset.
    start_epoch = int(month_begin.timestamp())
    # Last second of the month: next month start minus one second.
    end_epoch = int((month_begin + pd.offsets.MonthBegin() - pd.offsets.Second()).timestamp())

    top_submissions_gen = api.search_submissions(
        after = start_epoch,
        before = end_epoch,
        q = construct_search_q(stock_symbol),
        subreddit = ','.join(SUBREDDITS_TO_EXTRACT),
        sort_type = 'num_comments',
        sort = 'desc',
        filter = ['id', 'url', 'subreddit', 'created', 'author', 'num_comments', 'num_crossposts', 'title', 'selftext'],
        limit = LIMIT
    )

    # `.d_` is the per-result attribute that gets serialised to JSON.
    top_submissions = [submission.d_ for submission in top_submissions_gen]

    # Create the output directory on demand so a fresh checkout does not fail
    # after the (slow) download has already completed.
    os.makedirs(destination_dir, exist_ok=True)
    with open(destination_json_file_path, 'w') as f:
        json.dump(top_submissions, f, indent=4)


with ThreadPoolExecutor() as executor:
    # Map every future back to its (symbol, month) so that a failure can be
    # attributed to a specific job.
    futures = {
        executor.submit(get_top_sumbissions, stock_symbol, month_begin): (stock_symbol, month_begin)
        for month_begin in month_begins
        for stock_symbol in companies_data
    }

    failed_jobs = []
    for future in tqdm_notebook(as_completed(futures), total=len(futures)):
        try:
            future.result()
        except Exception as exc:
            # Keep draining the remaining futures so every failure is reported,
            # not just the first one.
            failed_symbol, failed_month = futures[future]
            month_label = failed_month.strftime('%Y-%m')
            failed_jobs.append((failed_symbol, failed_month, exc))
            print(f'Submissions job failed for {failed_symbol} ({month_label}): {exc!r}')

# Still fail the cell when any job failed, but only after all jobs were
# processed. Jobs whose output file already exists are skipped on a re-run.
if failed_jobs:
    raise RuntimeError(
        f'{len(failed_jobs)} of {len(futures)} submission jobs failed; re-run this cell to retry them'
    ) from failed_jobs[0][2]

  0%|          | 0/1800 [00:00<?, ?it/s]



# Top Comments

In [32]:
# Maximum number of comments requested per (symbol, month) search.
# NOTE: this rebinds the same global LIMIT used by the submissions cell; the
# worker functions read it when they run, not when they are defined.
LIMIT = 1_500

# Mapping of stock symbol -> extra search terms. Opened with a context manager
# so the file handle is closed instead of being leaked.
with open('../dse203_final_project/companies.json') as companies_file:
    companies_data = json.load(companies_file)

# One month-start timestamp per month in the extraction window.
month_begins = pd.date_range(START_DATE, END_DATE, freq='1MS')

def get_top_comments(stock_symbol, month_begin):
    """Download one month of top comments for a stock and save them as JSON.

    Searches the subreddits in SUBREDDITS_TO_EXTRACT through the Pushshift
    client `api`, sorted by score (descending) and capped at the module-level
    LIMIT (looked up when the function runs). The raw result dicts are written
    to top_submissions_and_comments/top_comments/<YYYY_MM_DD>__<symbol>.json.

    Args:
        stock_symbol: Ticker symbol; must be a key of the companies JSON read
            by construct_search_q.
        month_begin: pandas Timestamp for the first day of the month to fetch.

    Returns:
        None. If the destination file already exists the job is skipped, so an
        interrupted run can be resumed by calling again.
    """
    destination_dir = os.path.join('top_submissions_and_comments', 'top_comments')
    destination_json_file_path = os.path.join(
        destination_dir,
        month_begin.strftime('%Y_%m_%d') + '__' + stock_symbol + '.json',
    )

    if os.path.exists(destination_json_file_path):
        return

    # Both bounds go through Timestamp.timestamp(), which treats a naive
    # Timestamp as UTC. Mixing in datetime.timestamp() for the start bound
    # (local time for naive values) would skew the window by the machine's
    # UTC offset.
    start_epoch = int(month_begin.timestamp())
    # Last second of the month: next month start minus one second.
    end_epoch = int((month_begin + pd.offsets.MonthBegin() - pd.offsets.Second()).timestamp())

    top_comments_gen = api.search_comments(
        after = start_epoch,
        before = end_epoch,
        q = construct_search_q(stock_symbol),
        subreddit = ','.join(SUBREDDITS_TO_EXTRACT),
        sort_type = 'score',
        sort = 'desc',
        filter = ['id', 'url', 'subreddit', 'created', 'author', 'body', 'score'],
        limit = LIMIT
    )

    # `.d_` is the per-result attribute that gets serialised to JSON.
    top_comments = [comment.d_ for comment in top_comments_gen]

    # Create the output directory on demand so a fresh checkout does not fail
    # after the (slow) download has already completed.
    os.makedirs(destination_dir, exist_ok=True)
    with open(destination_json_file_path, 'w') as f:
        json.dump(top_comments, f, indent=4)
        
with ThreadPoolExecutor() as executor:
    # Map every future back to its (symbol, month) so that a failure can be
    # attributed to a specific job.
    futures = {
        executor.submit(get_top_comments, stock_symbol, month_begin): (stock_symbol, month_begin)
        for month_begin in month_begins
        for stock_symbol in companies_data
    }

    failed_jobs = []
    for future in tqdm_notebook(as_completed(futures), total=len(futures)):
        try:
            future.result()
        except Exception as exc:
            # Best-effort: keep going, but say which job failed and why
            # instead of printing a generic message.
            failed_symbol, failed_month = futures[future]
            month_label = failed_month.strftime('%Y-%m')
            failed_jobs.append((failed_symbol, failed_month))
            print(f'Comments job failed for {failed_symbol} ({month_label}): {exc!r}')

# Jobs whose output file already exists are skipped on a re-run.
if failed_jobs:
    print(f'{len(failed_jobs)} of {len(futures)} comment jobs failed; re-run this cell to retry them.')

  0%|          | 0/1800 [00:00<?, ?it/s]



In [30]:
# Manual single-job run for one symbol and month. This returns immediately
# without downloading if the corresponding output JSON file already exists.
get_top_comments(
    stock_symbol='AAPL',
    month_begin=pd.to_datetime('2021-01-01'),
)