In [6]:
import os
from dotenv import load_dotenv
load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET=os.getenv('CLIENT_SECRET')
APP_NAME=os.getenv('APP_NAME')
REDDIT_USERNAME=os.getenv('REDDIT_USERNAME')
REDDIT_PASSWORD=os.getenv('REDDIT_PASSWORD')

# Load Reddit API stuff

In [7]:
import praw
import pandas as pd
import datetime as dt
import numpy as np

In [8]:
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=CLIENT_SECRET, user_agent=APP_NAME, username=REDDIT_USERNAME, password=REDDIT_PASSWORD)

# Get dataframe of topics from neutral news

In [16]:
subreddit = reddit.subreddit('neutralnews')
top_subreddit = subreddit.top()
topics_dict = {'title':[], 'score':[], 'id':[], 'url':[], 'comms_num': [], 'created':[], 'body':[]}
for submission in top_subreddit:
    topics_dict['title'].append(submission.title)
    topics_dict['score'].append(submission.score)
    topics_dict['id'].append(submission.id)
    topics_dict['url'].append(submission.url)
    topics_dict['comms_num'].append(submission.num_comments)
    topics_dict['created'].append(submission.created)
    topics_dict['body'].append(submission.selftext)
topics_data = pd.DataFrame(topics_dict)

TypeError: 'ListingGenerator' object is not subscriptable

# Make comment df

In [10]:
comments_dict = {"action": [], "content": [], "author": [], "details": [], "submissionId": [], "commentId": []}
for submission in subreddit.top(limit=20):
    #print(submission.title, submission.id)
    submission.comments.replace_more(limit=100)
    for comment in submission.comments:
        #print(top_level_comment.body)
    
        comments_dict["action"].append(np.nan)
        comments_dict["content"].append(comment.body)
        comments_dict["author"].append(comment.author)
        comments_dict["details"].append(np.nan)
        comments_dict["submissionId"].append(submission.id)
        comments_dict["commentId"].append(comment.id)

comment_data = pd.DataFrame(comments_dict)

In [11]:
comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId
0,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,5uy6s0,ddxqtmg
1,,Just a quick reminder what neutral means here ...,BundleOfHiss,,5uy6s0,ddy27rm
2,,Already on the campaign trail Trump wanted to ...,samuelsamvimes,,5uy6s0,ddxttda
3,,This may be true but criticizing the press isn...,RufusRocks,,5uy6s0,ddxzpjb
4,,Ever since [propaganda](http://www.businessins...,cheekygorilla,,5uy6s0,ddyajye
...,...,...,...,...,...,...
233,,What bill are they actually talking about? I’d...,HarryPotterAMA,,7ee4u9,dq5hn1i
234,,[removed],,,7ee4u9,dq4ws7x
235,,Doing their darndest to hold onto that Alabama...,Ginger_Lord,,7ee4u9,dq5gczf
236,,---- **/r/NeutralNews is a curated space.**\nI...,AutoModerator,,ahxiba,eej17bt


# Import NER and Wordnet

In [12]:
from newspaper import Article
from newspaper import Config

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent

import spacy
from spacy import displacy
from collections import Counter
!python -m spacy download en_core_web_lg
!pip install spacy-wordnet

import en_core_web_lg
nlp = en_core_web_lg.load()

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')


STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [13]:
def clean_articles(topics_data, comment_data, text_list):
    for url in topics_data['url']:
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())
            text_list.append(art_doc)
        #if there is an exception, remove comments that go with this article
        except:
            text_list.append("error")
            comment_data = comment_data[comment_data['submissionId'] != url]
            continue
    
    topics_data['text'] = text_list
    
clean_articles(topics_data, comment_data, [])

In [14]:
def getSimWordScore(comment_data, topics_data, simWordScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId'] #get submission id from comment table
        post = topics_data[topics_data['id'] == subID]
        art_doc = post['text']
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
        #get comment content and ner
        comment_text = comment['content']
        doc = nlp(comment_text.lower())

        items = [x.text for x in doc.ents]

        #initialize list of scores
        score = 0
        #for each token, get a score
        for (item, count) in Counter(items).most_common(5):

            #get token
            token = nlp(item)#[0]

            wordScores = []

            #for each article item
            for art_word in art_tokens:

                #add similarity score to list of scores
                wordScores += [art_word.similarity(token)]
            #get average score    
            score += sum(wordScores)/len(wordScores)
        simWordScore.append(score)
    return simWordScore

wordScoreList = getSimWordScore(comment_data, topics_data, [])
comment_data['WordScore'] = wordScoreList

AttributeError: 'Series' object has no attribute 'ents'

In [None]:
comment_data

# Run NER to get similarity scores

In [None]:
currID = ''
simWordScore = []
simWholeScore = []
for i in range(len(comment_data)):
    subID = comment_data.iloc[i]['submissionId'] #get submission id from comment table
    
    #if this submission id is different than the current submission id
    if subID != currID:
        #get the submission and set current ID to this ID
        submission = reddit.submission(subID)
        currID = subID
        
        #get article from url, and begin NER
        #try to get article
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())
        
        #if there is an exception, remove comments that go with this article
        except:
            i = i + len(comment_data['submissionId'] == currID) - 1
            comment_data = comment_data[comment_data['submissionId'] != currID]
            continue
            
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
    
    #get comment content and ner
    comment = comment_data.iloc[i]['content']
    doc = nlp(comment.lower())
    
    simWholeScore.append(art_doc.similarity(doc))
    
    items = [x.text for x in doc.ents]
    
    #initialize list of scores
    score = 0
    #for each token, get a score
    for (item, count) in Counter(items).most_common(5):
        
        #get token
        token = nlp(item)#[0]
        
        wordScores = []
        
        #for each article item
        for art_word in art_tokens:
            
            #add similarity score to list of scores
            wordScores += [art_word.similarity(token)]
        #get average score    
        score += sum(wordScores)/len(wordScores)
    simWordScore.append(score)

In [None]:
def getSimWholeScore(currID, simWholeScore):
    subID = comment_data.iloc[i]['submissionId'] #get submission id from comment table

    #if this submission id is different than the current submission id
    if subID != currID:
        #get the submission and set current ID to this ID
        submission = reddit.submission(subID)
        currID = subID

        #get article from url, and begin NER
        #try to get article
        try:
            article = Article(submission.url, language='en', fetch_images=False, config = config)
            article.download()
            article.parse()
            art_text = article.text
            art_doc = nlp(art_text.lower())

        #if there is an exception, remove comments that go with this article
        except:
            i = i + len(comment_data['submissionId'] == currID) - 1
            comment_data = comment_data[comment_data['submissionId'] != currID]
            continue
            
    #get comment content and ner
    comment = comment_data.iloc[i]['content']
    doc = nlp(comment.lower())

    simWholeScore.append(art_doc.similarity(doc))
wholeScoreList = getSimWholeScore(comment_data, topics_data, [])
comment_data['WholeScore'] = wholeScoreList


In [17]:
arr = ['4o2o29',
 '4op948',
 '4sef35',
 '5x0k84',
 '64zsim',
 '68qqfz',
 '6907e3',
 '6mmw91',
 '6n161i',
 '6oftmx',
 '6xapy0',
 '7307wv',
 '76z4xh',
 '7ht8tj',
 '7hyg56',
 '7z5dyt',
 '804o3f',
 '862sn6',
 '8d0t2m',
 '8e7lrz',
 '8go559',
 '8hz10r',
 '8uby76',
 '8x2571',
 '8zp0uh',
 '91gwrp',
 '94q8j7',
 '96nrmf',
 '99fccg',
 '9c88xn',
 '9f2i6b',
 '9fj3m8',
 '9h0mo6',
 '9h82yb',
 '9ht83g',
 '9iqed1',
 '9j3gvd',
 '9jpdh7',
 '9kq8xy',
 '9ood1n',
 '9rbpt8',
 '9s3gd8',
 '9smhsl',
 '9xyyt0',
 '9zonfn',
 'a4tn3i',
 'a67z1j',
 'aab5gq',
 'ab7m8y',
 'ac7d56',
 'acz2f9',
 'ahxiba',
 'ailn92',
 'aj96gr',
 'ajvsq6',
 'alzqd0',
 'an9m5u',
 'andfa8',
 'apss2q',
 'asacv6',
 'asrdao',
 'auz29y',
 'axnhrj',
 'b47a1d',
 'b7e8o4',
 'b8cd1q',
 'b9d62h',
 'bj3tgb',
 'bjngf5',
 'bka66t',
 'blxd09',
 'bpi7u6',
 'busrz0',
 'bv8dl8',
 'bvk72a',
 'bxtj3y',
 'byiwll',
 'c109ai',
 'c3swuk',
 'c75b2i',
 'c7nqp7',
 'cess35',
 'cfa7tc',
 'cfa99a',
 'cfc5qa',
 'cfzkky',
 'cgd7ut',
 'cgks97',
 'cgkspq',
 'cgktg9',
 'cgkueu',
 'cgl6x6',
 'cim6kf',
 'f8says',
 'fd2f6z',
 'fuhknm',
 'gcz931',
 'gw3uey',
 'h9pyv5',
 'hi25c6',
 'hi274b',
 'hi2ab8',
 'hi2bdj',
 'hi2ckj',
 'hi3rm3',
 'hi3vwz',
 'hi3wk8',
 'hi4bgg',
 'hi4evw',
 'hi4x3g',
 'hibdl7',
 'hibxec',
 'hic2by',
 'hic40i',
 'hiepzp',
 'hioh3o',
 'hiokda',
 'hirg3c',
 'hivrgb',
 'hiwou8',
 'hj1nrq',
 'hjbexy',
 'hjbkf4',
 'hjimwh',
 'hjjxr5',
 'hjlda4',
 'hjm3wf',
 'hjm6py',
 'hjukp3',
 'hjxeu0',
 'hk0qme',
 'hk2e32',
 'hk6kwj',
 'hkjxz4',
 'hkoist',
 'hkt524',
 'hkx96j',
 'hl6bt7',
 'hl9cux',
 'hlaf18',
 'hlnh4l',
 'hlp04m',
 'hlpg7q',
 'hlxf66',
 'hly9t3',
 'hm7yu1',
 'hm8nfw',
 'hm8qvv',
 'hm9zu6',
 'hmhvvu',
 'hmk5k2',
 'hmor9m',
 'hmqkok',
 'hmy4jo',
 'hn1me3',
 'hnh39s',
 'hnhbls',
 'hnid7x',
 'hnovi8',
 'hnqx5w',
 'ho28c4',
 'ho3359',
 'ho42my',
 'ho59s9',
 'ho5jt8',
 'ho63fl',
 'hoo2zh',
 'horcro',
 'hosttw',
 'hp0ehw',
 'hp0eso',
 'hpjk4b',
 'hpx1r9',
 'hqbqsp',
 'hqfa6c',
 'hqj618',
 'hr3a77',
 'hr8126',
 'hr9uva',
 'hrcipo',
 'hrf0fa',
 'hrmbwc',
 'hrmo0b',
 'hrpxqn',
 'hruu5h',
 'hs5x4y',
 'hsacrg',
 'hsal1n',
 'hse1gz',
 'hsjsqx',
 'hstslg',
 'hsvpjz',
 'hsw9jm',
 'ht1vcp',
 'ht7wcf',
 'htasdr',
 'hti9ls',
 'htol34',
 'htrt98',
 'hu0n3u',
 'hu2rko',
 'huke8e',
 'hul38u',
 'hul9at',
 'hulpli',
 'huqzl7',
 'huw0tt',
 'huw3oh',
 'hv149r',
 'hv6rn4',
 'hvczsj',
 'hvdh5m',
 'hvgskd',
 'hvhcop',
 'hvtzb6',
 'hw2lr9',
 'hwakq3',
 'hwawu3',
 'hwg5me',
 'hwjoxu',
 'hx0e81',
 'hx9vlf',
 'hxfb5w',
 'hxpm0c',
 'hxq0u6',
 'hxypm9',
 'hy64fx',
 'hyf828',
 'hyg7f6',
 'hyscrn',
 'hyta9l',
 'hyubiw',
 'hyxdk8',
 'hzevob',
 'hzhcai',
 'hzl8el',
 'hzp4ts',
 'hztuwl',
 'i00tfi',
 'i02pzj',
 'i03xkr',
 'i07pbd',
 'i0m39k',
 'i0mxrh',
 'i0n6q4',
 'i0nct3',
 'i12kut',
 'i16cum',
 'i17yor',
 'i18yzj',
 'i1gv43',
 'i1hkfg',
 'i1syu7',
 'i21mky',
 'i21pw5',
 'i22uxu',
 'i28qla',
 'i28yme',
 'i292mp',
 'i2bp46',
 'i2dogn',
 'i2f8b3',
 'i33x8h',
 'i38x2w',
 'i3il30',
 'i3jcbd',
 'i3jfjv',
 'i3k5uu',
 'i3k6x6',
 'i3uttm',
 'i3wcwt',
 'i46ep0',
 'i4fnx0',
 'i4jlno',
 'i4l57l',
 'i4lo6w',
 'i4qm6n',
 'i4ucjp',
 'i5d7ed',
 'i5e8p7',
 'i5ex5l',
 'i5p8xz',
 'i5r8av',
 'i6ijq7',
 'i6oah7',
 'i6q3rl',
 'i6s6e4',
 'i6uvoc',
 'i739ub',
 'i74yaf',
 'i75f5j',
 'i77lwd',
 'i7a2fj',
 'i7gahj',
 'i7q4sp',
 'i7safq',
 'i7such',
 'i7wc60',
 'i7zc4g',
 'i7zica',
 'i80kio',
 'i8drdk',
 'i8dxuw',
 'i8nfh1',
 'i8oo52',
 'i8w5x2',
 'i9210g',
 'i929nd',
 'i9d54j',
 'i9mtu0',
 'i9otlg',
 'i9smf6',
 'i9vled',
 'ia0qvf',
 'ia7d83',
 'ia8tf4',
 'iabirm',
 'iakuxb',
 'iaua5h',
 'ib12pg',
 'ib2axf',
 'ib4jlg',
 'ibd7pw',
 'ibfjjq',
 'ibg1d0',
 'ibnlsa',
 'ibyrmx',
 'ic10hd',
 'ic1wap',
 'ic2sln',
 'ic4w7z',
 'ice94z',
 'ichwt4',
 'icmpee',
 'icnnua',
 'ics0fc',
 'icxrsk',
 'icy3z3',
 'id3ug9',
 'id6naf',
 'idcpll',
 'ie0flp',
 'ie53mq',
 'ieb9em',
 'ieje3j',
 'ieoobz',
 'iepdjp',
 'iewpfo',
 'if5nbq',
 'ifij8p',
 'ifp07u',
 'ifq1l9',
 'ifty7c',
 'ifwln1',
 'igc7uc',
 'igo5yh',
 'igpjip',
 'igqpm2',
 'igwngr',
 'ih1d3o',
 'ih2rah',
 'ih481a',
 'ih9mqu',
 'ihf2wt',
 'ihiuif',
 'ihnvuu',
 'ihpxh7',
 'ihsoih',
 'ihz2hg',
 'ii9aju',
 'ii9z1s',
 'iicu9x',
 'iifznc',
 'iinnom',
 'iiyfmg',
 'ij02pj',
 'ijgy6t',
 'ijn55e',
 'ijnwn0',
 'ijxzxa',
 'ik07zh',
 'ik1agl',
 'ik4afi',
 'ikbrh9',
 'ikh1vf',
 'ikjcx2',
 'ikolfk',
 'iktlis',
 'ikv1gj',
 'il0iua',
 'il5vfn',
 'il6dom',
 'il6q46',
 'il6sty',
 'il742b',
 'il7in0',
 'il9n4p',
 'ilajy8',
 'ilba8s',
 'ilbgd7',
 'ilbv07',
 'ilenv6',
 'ilg27f',
 'ilgw6c',
 'ilh8t5',
 'ilhx68',
 'ilicps',
 'ilixe5',
 'iljeq8',
 'ilk2hb',
 'illuuz',
 'ilozqt',
 'ilwrx6',
 'ily09p',
 'im36cp',
 'im62jx',
 'im6tnl',
 'im8jmm',
 'imbj93',
 'imkbta',
 'imkt2m',
 'immelc',
 'imo2d5',
 'imorci',
 'imsl4y',
 'imufs1',
 'in14m8',
 'ink4j7',
 'inm0is',
 'inqhx7',
 'io7gfn',
 'ioluws',
 'iovspa',
 'iowrrf',
 'ioxz26',
 'ip7po2',
 'ipci0w',
 'ipgnvk',
 'iphfy0',
 'ipjzsk',
 'ipmgnu',
 'iq2ppp',
 'iq3aul',
 'iq4crh',
 'iqbfy1',
 'iqus5t',
 'iqw6zx',
 'ir69ug',
 'iratrz',
 'irm7ex',
 'irx2mk',
 'irxd3c',
 'irzhex',
 'is1rno',
 'is5xo0',
 'isaozz',
 'isbhkv',
 'isj07n',
 'isjxc3',
 'ismoay',
 'ispld0',
 'isqv6m',
 'istm8e',
 'isum8d',
 'isupz8',
 'it7r7w',
 'it8wz9',
 'it9q8d',
 'itdjjf',
 'itgby4',
 'itmffo',
 'ituljf',
 'itys1b',
 'iu0b63',
 'iu4oda',
 'iu914j',
 'iujhay',
 'iujn20',
 'iupmy2',
 'iuqbld',
 'iuu472',
 'iuvl82',
 'iv3az1',
 'iv534g',
 'iv5s63',
 'iv6drn',
 'iv6t7c',
 'ivgtgl',
 'ivhf6s',
 'ivjfcw',
 'ivn7hd',
 'ivskes',
 'ivy8qp',
 'ivys3w',
 'iwdven',
 'iwh281',
 'iwmawa',
 'iwohni',
 'iwse12',
 'ix2wbk',
 'ix2xbx',
 'ix3rw3',
 'ix409e',
 'ix6rxb',
 'ixllnj',
 'ixmqbx',
 'ixqs7j',
 'ixtt0y',
 'ixwxg6',
 'iy2mv7',
 'iy2phk',
 'iy9574',
 'iydbkj',
 'iyfyeu',
 'iyjm83',
 'iyoo0x',
 'iyorls',
 'iz7am2',
 'iz951v',
 'izf05k',
 'izijv0',
 'izktke',
 'izku6m',
 'izmzt3',
 'izp6yc',
 'izrbbb',
 'izt4sk',
 'izul0p',
 'j04eqv',
 'j06hcq',
 'j06lwq',
 'j078ii',
 'j099m5',
 'j09b0v',
 'j0h85d',
 'j0ijwn']

In [20]:
len(arr)

536

In [21]:
new_arr = ['banana']
new_arr += ['apple']
new_arr

['banana', 'apple']