In [1]:
import praw
from psaw import PushshiftAPI
import json
import pandas as pd
import gzip
import glob
import os
from tqdm.notebook import tqdm_notebook
import re
import datetime
import time
import re

from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
REDDIT_CLIENT_ID = '4FEO91ch16g-P_n8AxU2_A'
REDDIT_CLIENT_SECRET = 'Q2cyR53G5a6IpsSAzLbevNtBiITYhw'
REDDIT_USER_AGENT = 'desktop:DSE203 (by u/Life_is_Life)'

In [3]:
reddit = praw.Reddit(
    client_id = REDDIT_CLIENT_ID,
    client_secret = REDDIT_CLIENT_SECRET,
    user_agent = REDDIT_USER_AGENT
)

api = PushshiftAPI()

In [4]:
# This function will be a decorator for disk caching of API calls
def persist_to_file(file_name):
    def decorator(original_func):

        try:
            cache = json.load(open(file_name, 'r'))
        except (IOError, ValueError):
            cache = {}

        def new_func(param):
            if param not in cache:
                cache[param] = original_func(param)
                json.dump(cache, open(file_name, 'w'))
            return cache[param]

        return new_func

    return decorator

# Get company search queries

In [5]:
def construct_regex_filter(stock_symbol):
    companies_data = json.load(open('../dse203_final_project/companies.json'))
    
    search_q = list()
    
    search_q.append('$' + stock_symbol)
    search_q.append(stock_symbol)
    
    for other_term in companies_data[stock_symbol]:
        if ' ' in other_term:
            search_q.append('"' + other_term + '"')
        else:
            search_q.append(other_term)
    
    return '|'.join(search_q)

# Dataset 1: Reddit Posts

In [6]:
reddit_posts = list()

In [7]:
# Get all posts from manually extracted dataset
for json_file in tqdm_notebook(glob.glob('top_submissions_and_comments/top_submissions/*.json')):
    # Extract stock symbol from file name
    stock_symbol = os.path.basename(json_file)[12:-5]
    
    # Read in JSON file data
    with open(json_file) as f:
        json_file_data = json.load(f)
    
    for record in json_file_data:
        reddit_posts.append({
            'stock_symbol': stock_symbol,
            'author': record['author'],
            'created_utc': datetime.datetime.fromtimestamp(int(record['created_utc'])).isoformat(),
            'id': record['id'],
            'num_comments': record['num_comments'],
            'num_crossposts': record['num_crossposts'],
            'selftext': record.get('selftext'),
            'subreddit': record['subreddit'],
            'title': record['title'],
            'url': record['url'],
            'source': 'Custom Reddit API Query',
        })

  0%|          | 0/1800 [00:00<?, ?it/s]

In [8]:
# Retrieve matching company posts from kaggle posts data
@persist_to_file(os.path.join('all_data', 'submission_ids_without_author_info.json'))
def get_submission_author(submission_id):
    author = reddit.submission(id=submission_id).author
    time.sleep(0.5)
    if author is not None:
        return author.name
    else:
        return None

companies_data = json.load(open('../dse203_final_project/companies.json'))
kaggle_posts_df = pd.read_csv('kaggle_data/wsb-aug-2021-posts.csv')

for stock_symbol in tqdm_notebook(companies_data):
    selector = (
        kaggle_posts_df['title'].str.match(construct_regex_filter(stock_symbol), flags=re.I) | \
        kaggle_posts_df['selftext'].str.match(construct_regex_filter(stock_symbol), flags=re.I)
    )
    for _, record in kaggle_posts_df[selector].iterrows():
        reddit_posts.append({
            'stock_symbol': stock_symbol,
            'author': get_submission_author(record['id']),
            'created_utc': datetime.datetime.fromtimestamp(int(record['created_utc'])).isoformat(),
            'id': record['id'],
            'num_comments': None,
            'num_crossposts': None,
            'selftext': record['selftext'],
            'subreddit': 'wallstreetbets',
            'title': record['title'],
            'url': record['url'],
            'source': 'Kaggle',
        })

  0%|          | 0/77 [00:00<?, ?it/s]

# Dataset 2: Reddit Comments

In [25]:
reddit_comments = list()

In [26]:
# Get all comments from manually extracted dataset
for json_file in tqdm_notebook(glob.glob('top_submissions_and_comments/top_comments/*.json')):
    # Extract stock symbol from file name
    stock_symbol = os.path.basename(json_file)[12:-5]
    
    # Read in JSON file data
    with open(json_file) as f:
        json_file_data = json.load(f)
    
    for record in json_file_data:
        reddit_comments.append({
            'stock_symbol': stock_symbol,
            'author': record['author'],
            'created_utc': datetime.datetime.fromtimestamp(int(record['created_utc'])).isoformat(),
            'id': record['id'],
            'score': record['score'],
            'body': record['body'],
            'subreddit': record['subreddit'],
            'sentiment': None,
            'source': 'Custom Reddit API Query',
            'post_id': re.findall(r'^\/r\/.+?\/comments\/(.+?)\/.+$', json_file_data[0]['permalink'])[0]
        })

  0%|          | 0/1812 [00:00<?, ?it/s]

In [29]:
# Retrieve matching company comments from kaggle posts data
@persist_to_file(os.path.join('all_data', 'comment_ids_without_author_info.json'))
def get_comment_author(comment_id):
    try:
        return next(api.search_comments(ids=comment_id)).author
    except StopIteration:
        return None
    
companies_data = json.load(open('../dse203_final_project/companies.json'))
kaggle_comments_df = pd.read_csv('kaggle_data/wsb-aug-2021-comments.csv')
kaggle_comments_df['body'] = kaggle_comments_df['body'].fillna('')

for stock_symbol in tqdm_notebook(companies_data):
    selector = (
        kaggle_comments_df['body'].str.match(construct_regex_filter(stock_symbol), flags=re.I)
    )
    for _, record in kaggle_comments_df[selector].iterrows():
        reddit_comments.append({
            'stock_symbol': stock_symbol,
            'author': None,
            'created_utc': datetime.datetime.fromtimestamp(int(record['created_utc'])).isoformat(),
            'id': record['id'],
            'score': record['score'],
            'body': record['body'],
            'subreddit': 'wallstreetbets',
            'sentiment': record['sentiment'],
            'source': 'Kaggle',
            'post_id': re.findall(r'^\/r\/.+?\/comments\/(.+?)\/.+$', json_file_data[0]['permalink'])[0],
        })

  0%|          | 0/77 [00:00<?, ?it/s]

In [30]:
# Get Kaggle dataset authors
with ThreadPoolExecutor() as executor:
    futures = set()
    comment_ids = [i['id'] for i in reddit_comments if i['source'] == 'Kaggle']
    for comment_id in comment_ids:
        futures.add(executor.submit(get_comment_author, comment_id))
    
    for future in tqdm_notebook(as_completed(futures), total=len(comment_ids)):
        try:
            future.result()
        except Exception as e:
            pass
        
# Add Kaggle dataset authors to reddit_comments
for reddit_comment in tqdm_notebook([i for i in reddit_comments if (i['source'] == 'Kaggle') and (i['author'] is None)]):
    reddit_comment['author'] = get_comment_author(reddit_comment['id'])

  0%|          | 0/43333 [00:00<?, ?it/s]

  0%|          | 0/43333 [00:00<?, ?it/s]

# Write to Disk

In [31]:
with open(os.path.join('all_data', 'reddit_submissions.json'), 'w') as f:
    json.dump(reddit_posts, f, indent=4)

In [32]:
with open(os.path.join('all_data', 'reddit_comments.json'), 'w') as f:
    json.dump(reddit_comments, f, indent=4)