In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# reloaded before code runs (useful while editing local .py files).
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import timeit
import itertools
import matplotlib.pyplot as plt

from pathlib import Path
from typing import *
import torch
import torch.optim as optim
from functools import partial
from overrides import overrides

#from allennlp.data import Instance
#from allennlp.data.token_indexers import TokenIndexer
#from allennlp.data.tokenizers import Token
#from allennlp.nn import util as nn_util

import praw
from psaw import PushshiftAPI

import stanfordnlp

In [None]:
# Pushshift client (psaw); used by the submission and comment queries below.
api = PushshiftAPI()

In [None]:
# make dataframe of submissions

# Anchor the cut-off date to UTC. A naive datetime's .timestamp() is interpreted
# in the local timezone of whatever machine runs the notebook, which would make
# the query window (and therefore the results) differ from machine to machine.
start_epoch = int(dt.datetime(2019, 6, 1, tzinfo=dt.timezone.utc).timestamp())

start_time = timeit.default_timer()
# `limit` is set far above any realistic result count so nothing is truncated.
sub_results = list(api.search_submissions(after=start_epoch,
                            subreddit='bookclub',
                            filter=['url','author', 'title', 'subreddit', 'id'],
                            limit=10000000))

# Each result carries its fields as a dict on `.d_`; one row per submission.
submission_df = pd.DataFrame([sub.d_ for sub in sub_results])

# The reported time covers both the API query and the DataFrame construction.
process_time = timeit.default_timer() - start_time
print(str(len(sub_results)) + ' submissions, query took ' + str(process_time) + ' seconds')

In [None]:
# make dataframe of comments

# Anchor the cut-off date to UTC. A naive datetime's .timestamp() is interpreted
# in the local timezone of whatever machine runs the notebook, which would make
# the query window (and therefore the results) differ from machine to machine.
start_epoch = int(dt.datetime(2019, 6, 1, tzinfo=dt.timezone.utc).timestamp())

start_time = timeit.default_timer()
# `limit` is set far above any realistic result count so nothing is truncated.
comment_results = list(api.search_comments(after=start_epoch,
                            subreddit='bookclub',
                            filter=['url','author', 'subreddit', 'body', 'score', 'link_id', 'id'],
                            limit=10000000))

# Each result carries its fields as a dict on `.d_`; one row per comment.
comment_df = pd.DataFrame([comment.d_ for comment in comment_results])

# Drop the first three characters of link_id so it can be compared with the
# submission 'id' column (presumably the Reddit "t3_" type prefix - confirm).
comment_df['link_id_trunc'] = [link[3:] for link in comment_df['link_id']]


# The reported time covers the API query and the DataFrame construction.
process_time = timeit.default_timer() - start_time
print(str(len(comment_results)) + ' comments, query took ' + str(process_time) + ' seconds')

In [None]:
# Collect, for every submission, the comment bodies that belong to it.
#
# The comments are grouped by parent submission ONCE up front; filtering the
# whole comment_df again for every submission is O(submissions x comments).
comments_by_submission = {
    link_id: group for link_id, group in comment_df.groupby('link_id_trunc')
}

# Planned processing of the collected text (not implemented yet):
#   1. tokenize
#   2. lemmatize
#   3. parts-of-speech classification
#   4. Noun extraction
#   5. fuzzy-matching to correct spelling, and join different forms of the noun
#   6. tf-idf
#   7. normalize by historical frequency (novelty detection)?

all_author_text = []  # per submission: flat list of comment bodies, or None
num_posts = []        # per submission: see NOTE below
for sub_id in submission_df['id']:
    sub_df = comments_by_submission.get(sub_id)
    if sub_df is None:
        # No comments were found for this submission.
        all_author_text.append(None)
        num_posts.append(0)
        continue

    # One Series of bodies per author, authors in order of first appearance,
    # so the flattened text keeps each author's comments together.
    authors = list(sub_df['author'].unique())
    author_text = [sub_df.loc[sub_df['author'] == author, 'body'] for author in authors]
    all_author_text.append(list(itertools.chain.from_iterable(author_text)))
    # NOTE(review): len(author_text) is the number of distinct authors, not the
    # number of comments - confirm this is what "num_posts" is meant to hold.
    num_posts.append(len(author_text))

In [None]:
# Attach the per-submission aggregates as new columns (mutates submission_df
# in place). Both lists were built by iterating submission_df['id'], so they
# are aligned with the frame's row order.
submission_df['text'] = all_author_text
submission_df['num_posts'] = num_posts
submission_df

# StanfordNLP

In [None]:
# NOTE(review): download() is called every time this cell runs - confirm how it
# behaves when the models are already present (re-download / prompt / no-op).
# NOTE: the name `nlp` is rebound to a spaCy model further down the notebook.
stanfordnlp.download('en')   # This downloads the English models for the neural pipeline
nlp = stanfordnlp.Pipeline(lang='en') # This sets up a default neural pipeline in English

In [None]:
# Build one text blob from all comments of the selected submission.
submission_num = 0
text_list = submission_df['text'].iloc[submission_num]
# - Submissions without comments store None in 'text'; treat that as "no text"
#   instead of letting str.join raise TypeError.
# - Join with a blank line rather than '' so the last word of one comment is
#   not fused with the first word of the next before tokenization.
# - Skip non-string entries (e.g. missing bodies), which str.join cannot handle.
contrib_text = '\n\n'.join(body for body in (text_list or []) if isinstance(body, str))

In [None]:
def show_word_relations(doc):
    """Print a per-sentence table of word annotations for a parsed document.

    For every sentence in ``doc.sentences`` a ``[Sentence N]`` header (1-based)
    is printed, followed by one tab-separated row per word containing its
    text, lemma, POS tag, governor index and dependency relation, and then an
    empty line.

    Args:
        doc: object exposing ``sentences``; each sentence exposes ``words``,
            and each word exposes ``text``, ``lemma``, ``pos`` (strings),
            ``governor`` (int) and ``dependency_relation`` (string).

    Returns:
        None. The table is written to stdout.
    """
    row_template = "{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}"
    for sentence_number, sentence in enumerate(doc.sentences, start=1):
        print("[Sentence {}]".format(sentence_number))
        for token in sentence.words:
            fields = (
                token.text,
                token.lemma,
                token.pos,
                token.governor,
                token.dependency_relation,
            )
            print(row_template.format(*fields))
        print("")

In [None]:
# Quick check: parse the first submission's title and print its word table.
# Requires `nlp` to still be the StanfordNLP pipeline (show_word_relations reads
# doc.sentences / word.governor); `nlp` is rebound to spaCy later in the notebook.
txt = nlp(submission_df['title'].iloc[0])
show_word_relations(txt)

In [None]:
# Time a single pipeline pass over the concatenated comment text.
start_time = timeit.default_timer()

# `doc` is reused by the following cell to print the word table.
doc = nlp(contrib_text)

process_time = timeit.default_timer() - start_time
print('NLP took ' + str(process_time) + ' seconds')

In [None]:
# Print the word table for the parsed comment text (one row per word).
show_word_relations(doc)

# Spacy

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm

In [None]:
#nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_sm')
# NOTE: this rebinds `nlp`, replacing the StanfordNLP pipeline created earlier.
# Cells that expect StanfordNLP output (e.g. those calling show_word_relations)
# will not work if re-run after this cell without re-creating that pipeline.
nlp = spacy.load('en_core_web_lg')
# if problems, see https://stackoverflow.com/questions/54334304/spacy-cant-find-model-en-core-web-sm-on-windows-10-and-python-3-5-3-anacon

In [None]:
# Peek at the first collected comment body of the first submission.
# NOTE(review): raises TypeError when that submission has no comments, because
# its 'text' entry is None in that case.
print(submission_df['text'].iloc[0][0])

In [None]:
def _entity_texts(parsed, label):
    """Return the texts of all entities in ``parsed`` tagged ``label``, or None if there are none."""
    matches = [ent.text for ent in parsed.ents if ent.label_ == label]
    return matches or None


all_names = []   # per submission: PERSON entities from its first comment, or None
all_works = []   # per submission: WORK_OF_ART entities from its first comment, or None
all_topics = []  # per submission: PERSON (else WORK_OF_ART) entities from its title, or None

for _, row in submission_df.iterrows():
    # Comment text: only the first collected comment (row.text[0]) is analysed.
    if row.text:
        comment_doc = nlp(row.text[0])
        all_names.append(_entity_texts(comment_doc, 'PERSON'))
        all_works.append(_entity_texts(comment_doc, 'WORK_OF_ART'))
    else:
        all_names.append(None)
        all_works.append(None)

    # Title (a plain string, not a list): people take precedence over works.
    if row.title:
        title_doc = nlp(row.title)
        title_topic = _entity_texts(title_doc, 'PERSON') or _entity_texts(title_doc, 'WORK_OF_ART')
    else:
        title_topic = None
    all_topics.append(title_topic)

In [None]:
# Attach the extracted entities as new columns (mutates submission_df in place).
# Each list has one entry per row, appended while iterating submission_df.iterrows().
submission_df['names'] = all_names
submission_df['works'] = all_works
submission_df['topic'] = all_topics

In [None]:
# Preview the first ten submissions together with the newly added columns.
submission_df.head(10)

In [None]:
# Per-submission counts as NumPy arrays so they can be added and plotted.
num_posts = np.array(submission_df['num_posts'].values)

# Entries are either a list of entity strings or None; None/empty counts as 0.
num_names = np.array([len(found) if found else 0 for found in submission_df['names']])
num_works = np.array([len(found) if found else 0 for found in submission_df['works']])

In [None]:
# Scatter of per-submission activity against the number of entities found.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(num_posts, num_names + num_works)
ax.set_xlabel('num_posts per submission')
ax.set_ylabel('PERSON + WORK_OF_ART entities found (num_names + num_works)')
ax.set_title('Entities found vs. num_posts, one point per submission')
plt.show()

In [None]:
# Spot-check a single submission (row position 7): its URL.
submission_df['url'].iloc[7]

In [None]:
# Spot-check the same submission (row position 7): its collected comment text.
submission_df['text'].iloc[7]

In [None]:
# Flatten the per-submission entity lists into one list each.
# all_names / all_works contain None for submissions where nothing was found,
# and chain.from_iterable raises TypeError on a None element, so those entries
# are filtered out before flattening.
names = list(itertools.chain.from_iterable(found for found in all_names if found))
art = list(itertools.chain.from_iterable(found for found in all_works if found))
print(art)

In [None]:
# Print every PERSON entity found in the first comment of each submission,
# together with the vector spaCy assigns to that entity span.
for _, row in submission_df.iterrows():
    if not row.text:
        continue  # submission has no collected comments
    doc = nlp(row.text[0])
    people = [ent for ent in doc.ents if ent.label_ == 'PERSON']
    for person in people:
        print(person.text, person.vector)

# ALLEN NLP

# GENSIM