In [1]:
import spacy
import pytextrank
import json
import gzip
from tqdm.notebook import tqdm_notebook
import pandas as pd

In [2]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('textrank')

<pytextrank.base.BaseTextRankFactory at 0x24f6687f370>

In [3]:
def spacy_process(text):
    doc = nlp(text)
    
    # Lowercase
    filtered_list = []
    for token in doc:
        filtered_list.append(token.lower_)
    
    # Remove punctuation
    punctuations = "?:!.,;()"
    for word in filtered_list:
        if word in punctuations:
            filtered_list.remove(word)
    
    # Remove whitespace characters
    white_space = ['\n', '\t', '\n\n', '\n\n\n']
    for word in filtered_list:
        if word in white_space:
            filtered_list.remove(word)
    
    doc = nlp(' '.join(filtered_list))
    
    return doc._.phrases, ' '.join(filtered_list)

In [4]:
text_rank_data = list()

In [5]:
with gzip.open('all_data/reddit_submissions.json.gz', 'rb') as f:
    reddit_submissions = json.load(f)

relevant_submissions = filter(
    lambda i: (
        i['source'] == 'Kaggle' and \
        i['selftext'] is not None and \
        not ( i['selftext'] == '[removed]' or i['selftext'] == '[deleted]' )
    ),
    reddit_submissions
)

for reddit_submission in tqdm_notebook(list(relevant_submissions)):
    text_rank_phrases, filtered_doc = spacy_process(reddit_submission['selftext'])
    
    for phrase in text_rank_phrases[0:10]:
        text_rank_data.append({
            'id': reddit_submission['id'],
            'document_type': 'submission',
            'phrase': phrase.text,
            'rank': phrase.rank,
            'count': phrase.count,
        })

  0%|          | 0/56 [00:00<?, ?it/s]

In [6]:
with gzip.open('all_data/reddit_comments.json.gz', 'rb') as f:
    reddit_comments = json.load(f)

relevant_comments = [i for i in reddit_comments if i['source'] == 'Kaggle' and i['body'] is not None]
stock_symbols = set(i['stock_symbol'] for i in relevant_comments)

for stock_symbol in tqdm_notebook(stock_symbols):
    stock_top_comments = sorted([i for i in relevant_comments if i['stock_symbol'] == 'AAPL'], key=lambda i: i['score'], reverse=True)
    
    for comment in stock_top_comments:
        text_rank_phrases, filtered_doc = spacy_process(comment['body'])
        
        for phrase in text_rank_phrases[0:10]:
            text_rank_data.append({
                'id': reddit_submission['id'],
                'document_type': 'comment',
                'phrase': phrase.text,
                'rank': phrase.rank,
                'count': phrase.count,
            })

  0%|          | 0/56 [00:00<?, ?it/s]

In [7]:
text_rank_data[0]

{'id': 'p22oxd',
 'document_type': 'submission',
 'phrase': 'line fib trends',
 'rank': 0.08219014293954903,
 'count': 1}

In [8]:
len(text_rank_data)

163688

In [9]:
with open('all_data/text_rank_data.json', 'w') as f:
    json.dump(text_rank_data, f, indent=4)