In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET=os.getenv('CLIENT_SECRET')
APP_NAME=os.getenv('APP_NAME')
REDDIT_USERNAME=os.getenv('REDDIT_USERNAME')
REDDIT_PASSWORD=os.getenv('REDDIT_PASSWORD')

# Load Reddit API stuff

In [5]:
import praw
import pandas as pd
import datetime as dt
import numpy as np

In [6]:
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=APP_NAME, username=REDDIT_USERNAME, password=REDDIT_PASSWORD)

# Get dataframe of topics from neutral news

In [9]:
subreddit = reddit.subreddit('neutralnews')
top_subreddit = subreddit.top()
topics_dict = {'title':[], 'score':[], 'id':[], 'url':[], 'comms_num': [], 'created':[], 'body':[]}
for submission in top_subreddit:
    topics_dict['title'].append(submission.title)
    topics_dict['score'].append(submission.score)
    topics_dict['id'].append(submission.id)
    topics_dict['url'].append(submission.url)
    topics_dict['comms_num'].append(submission.num_comments)
    topics_dict['created'].append(submission.created)
    topics_dict['body'].append(submission.selftext)
topics_data = pd.DataFrame(topics_dict)

# Make comment df

In [11]:
comments_dict = {"action": [], "content": [], "author": [], "details": [], "submissionId": [], "commentId": []}
for submission in subreddit.top(limit=20):
    #print(submission.title, submission.id)
    submission.comments.replace_more(limit=100)
    for comment in submission.comments:
        #print(top_level_comment.body)
    
        comments_dict["action"].append(np.nan)
        comments_dict["content"].append(comment.body)
        comments_dict["author"].append(comment.author)
        comments_dict["details"].append(np.nan)
        comments_dict["submissionId"].append(submission.id)
        comments_dict["commentId"].append(comment.id)

comment_data = pd.DataFrame(comments_dict)

In [12]:
comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId
0,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,5uy6s0,ddxqtmg
1,,Just a quick reminder what neutral means here ...,BundleOfHiss,,5uy6s0,ddy27rm
2,,Already on the campaign trail Trump wanted to ...,samuelsamvimes,,5uy6s0,ddxttda
3,,This may be true but criticizing the press isn...,RufusRocks,,5uy6s0,ddxzpjb
4,,Ever since [propaganda](http://www.businessins...,cheekygorilla,,5uy6s0,ddyajye
...,...,...,...,...,...,...
233,,What bill are they actually talking about? I’d...,HarryPotterAMA,,7ee4u9,dq5hn1i
234,,[removed],,,7ee4u9,dq4ws7x
235,,Doing their darndest to hold onto that Alabama...,Ginger_Lord,,7ee4u9,dq5gczf
236,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,ahxiba,eej17bt


# Import NER and Wordnet

In [21]:
from newspaper import Article
from newspaper import Config

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent

import spacy
from spacy import displacy
from collections import Counter
!python -m spacy download en_core_web_lg
!pip install spacy-wordnet

import en_core_web_lg
nlp = en_core_web_lg.load()

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')


STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')
Collecting spacy-wordnet
  Downloading spacy-wordnet-0.0.4.tar.gz (648 kB)
Collecting nltk<3.4,>=3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
Building wheels for collected packages: spacy-wordnet, nltk
  Building wheel for spacy-wordnet (setup.py): started
  Building wheel for spacy-wordnet (setup.py): finished with status 'done'
  Created wheel for spacy-wordnet: filename=spacy_wordnet-0.0.4-py2.py3-none-any.whl size=650298 sha256=bca6807e9773ee97c577c326295569193ca0bca58e54817b216edf7baed5163e
  Stored in directory: c:\users\mattc\appdata\local\pip\cache\wheels\78\65\76\5a98dae47d1e1ac05010b1da0e935cc3573675cad15713963d
  Building wheel for nltk (setup.py): started
  Building wheel for nltk (setup.py): finished with status 'done'
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394473 sha256=f694876cb3c06a0cbf85eb278d04a64eccae6fd8e8f1ac8b3b2a45da471bc435
  Stored in di

In [22]:
def clean_articles(topics_data, comment_data, text_list):
    for url in topics_data['url']:
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())
            text_list.append(art_doc)
        #if there is an exception, remove comments that go with this article
        except:
            text_list.append("error")
            comment_data = comment_data[comment_data['submissionId'] != url]
            continue
    
    topics_data['text'] = text_list
    
clean_articles(topics_data, comment_data, [])

In [12]:
def getSimWordScore(comment_data, topics_data, simWordScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId'] #get submission id from comment table
        post = topics_data[topics_data['id'] == subID]
        art_doc = post['text']
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
        #get comment content and ner
        comment_text = comment['content']
        doc = nlp(comment_text.lower())

        items = [x.text for x in doc.ents]

        #initialize list of scores
        score = 0
        #for each token, get a score
        for (item, count) in Counter(items).most_common(5):

            #get token
            token = nlp(item)#[0]

            wordScores = []

            #for each article item
            for art_word in art_tokens:

                #add similarity score to list of scores
                wordScores += [art_word.similarity(token)]
            #get average score    
            score += sum(wordScores)/len(wordScores)
    return simWordScore.append(score)

wordScoreList = getSimWordScore(comment_data, topics_data, [])
comment_data['WordScore'] = wordScoreList

TypeError: string indices must be integers

# Run NER to get similarity scores

In [13]:
currID = ''
simWordScore = []
simWholeScore = []
for i in range(len(comment_data)):
    subID = comment_data.iloc[i]['submissionId'] #get submission id from comment table
    
    #if this submission id is different than the current submission id
    if subID != currID:
        #get the submission and set current ID to this ID
        submission = reddit.submission(subID)
        currID = subID
        
        #get article from url, and begin NER
        #try to get article
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())
        
        #if there is an exception, remove comments that go with this article
        except:
            i = i + len(comment_data['submissionId'] == currID) - 1
            comment_data = comment_data[comment_data['submissionId'] != currID]
            continue
            
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
    
    #get comment content and ner
    comment = comment_data.iloc[i]['content']
    doc = nlp(comment.lower())
    
    simWholeScore.append(art_doc.similarity(doc))
    
    items = [x.text for x in doc.ents]
    
    #initialize list of scores
    score = 0
    #for each token, get a score
    for (item, count) in Counter(items).most_common(5):
        
        #get token
        token = nlp(item)#[0]
        
        wordScores = []
        
        #for each article item
        for art_word in art_tokens:
            
            #add similarity score to list of scores
            wordScores += [art_word.similarity(token)]
        #get average score    
        score += sum(wordScores)/len(wordScores)
    simWordScore.append(score)

  wordScores += [art_word.similarity(token)]


In [14]:
simWordScore

[0.09441389301463617,
 0.11348354043332853,
 1.7990530497831565,
 0,
 0.32879050306342766,
 0,
 0,
 0,
 0,
 0,
 0.1300755099780606,
 0,
 1.7319029868229805,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.5136267198118085,
 0,
 1.1564207035194456,
 0,
 0,
 0,
 0.15607552927063809,
 0.0,
 0.4328569392468269,
 1.3801175422472913,
 1.6057585759584638,
 1.6071434793112063,
 0,
 0,
 0,
 1.3444732538840762,
 0,
 0,
 0,
 0.10882211567132397,
 0,
 0.6521660287698077,
 0,
 0,
 0,
 0,
 0,
 0,
 0.7249424792612424,
 0,
 0,
 0,
 0,
 0,
 0.05235959057817251,
 0,
 0.17197840302213716,
 0,
 0,
 0,
 0,
 0,
 0.6739583685205697,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.16881270807455917,
 0.4609321759765649,
 1.2395381522125635,
 0.9995642195247768,
 0.4792478192123574,
 0,
 0.013521386083370475,
 0.7395408672278698,
 0.6884347605031518,
 0,
 0.00200104848186242,
 0.7929269080035706,
 0.7665493173889317,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0.7526473486531605,
 -0.014854178339347907,
 0,
 -0.022

In [15]:
simWholeScore

[0.9098790417345315,
 0.9471640103896963,
 0.9338509722251349,
 0.9346357066654838,
 0.8964103092682771,
 0.32742976809043195,
 0.9612970895119729,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.8688051475414897,
 0.39428346148548477,
 0.9710721380914878,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.9318763457255054,
 0.9545589361852977,
 0.39428346148548477,
 0.39428346148548477,
 0.9753841262423627,
 0.39428346148548477,
 0.9207612312904487,
 0.39428346148548477,
 0.39428346148548477,
 0.39428346148548477,
 0.915669323999118,
 0.9377015374323769,
 0.9215286690128374,
 0.8654823667647782,
 0.942527985799112,
 0.9594020668763508,
 0.40974601719576337,
 0.40974601719576337,
 0.40974601719576337,
 0.9600298296308796,
 0.40974601719576337,
 0.409746

In [16]:
comment_data['SimilarWordScore'] = simWordScore
comment_data['SimilarWholeScore'] = simWholeScore

In [17]:
comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId,SimilarWordScore,SimilarWholeScore
0,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,5uy6s0,ddxqtmg,0.094414,0.909879
1,,Just a quick reminder what neutral means here ...,BundleOfHiss,,5uy6s0,ddy27rm,0.113484,0.947164
2,,Already on the campaign trail Trump wanted to ...,samuelsamvimes,,5uy6s0,ddxttda,1.799053,0.933851
3,,This may be true but criticizing the press isn...,RufusRocks,,5uy6s0,ddxzpjb,0.000000,0.934636
4,,Ever since [propaganda](http://www.businessins...,cheekygorilla,,5uy6s0,ddyajye,0.328791,0.896410
...,...,...,...,...,...,...,...,...
233,,What bill are they actually talking about? I’d...,HarryPotterAMA,,7ee4u9,dq5hn1i,0.000000,0.899272
234,,[removed],,,7ee4u9,dq4ws7x,0.000000,0.413061
235,,Doing their darndest to hold onto that Alabama...,Ginger_Lord,,7ee4u9,dq5gczf,0.153568,0.894425
236,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,ahxiba,eej17bt,0.128309,0.900826


In [None]:
def getSimWholeScore(currID, simWholeScore):
    subID = comment_data.iloc[i]['submissionId'] #get submission id from comment table

    #if this submission id is different than the current submission id
    if subID != currID:
        #get the submission and set current ID to this ID
        submission = reddit.submission(subID)
        currID = subID

        #get article from url, and begin NER
        #try to get article
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())

        #if there is an exception, remove comments that go with this article
        except:
            i = i + len(comment_data['submissionId'] == currID) - 1
            comment_data = comment_data[comment_data['submissionId'] != currID]
            continue
            
    #get comment content and ner
    comment = comment_data.iloc[i]['content']
    doc = nlp(comment.lower())

    simWholeScore.append(art_doc.similarity(doc))
