First, we import everything we need and connect to the Reddit API.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
# Suppress every warning category for the remainder of the notebook.
warnings.filterwarnings('ignore')

import os
from dotenv import load_dotenv
# Load key/value pairs from a .env file into the process environment so the
# os.getenv() lookups below can find them.
load_dotenv()

# Reddit API credentials, read from the environment; each is None when unset.
_env = os.environ
CLIENT_ID = _env.get('CLIENT_ID')
CLIENT_SECRET = _env.get('CLIENT_SECRET')
APP_NAME = _env.get('APP_NAME')
REDDIT_USERNAME = _env.get('REDDIT_USERNAME')
REDDIT_PASSWORD = _env.get('REDDIT_PASSWORD')

import praw
import pandas as pd
import datetime as dt

# Authenticated PRAW client built from the credentials read above.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=APP_NAME,
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
)

# Printing the logged-in user confirms that authentication succeeded.
print(reddit.user.me())

%matplotlib inline

mattcat26


Next, we are going to read in the bad comment data given to us by a moderator of the 'neutralnews' subreddit.

In [34]:
import gzip
import shutil
# Decompress the gzipped moderation log to a plain JSON file next to the notebook.
with gzip.open('./neutralnews-2020-09-27.json.gz', 'rb') as f_in, \
        open('neutralnews-2020-09-27.json', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [35]:
# Load the decompressed moderation log (a JSON array of records) into a DataFrame.
comment_df = pd.read_json(
    path_or_buf='neutralnews-2020-09-27.json',
    orient='records',
)

In [36]:
# Keep only the columns used downstream and rename the log's "id" to "commentId".
kept_columns = ["action", "content", "author", "details", "submissionId", "id"]
new_comment_df = comment_df.loc[:, kept_columns]
bad_comment_data = new_comment_df.rename(columns={"id": "commentId"})
bad_comment_data


Unnamed: 0,action,content,author,details,submissionId,commentId
0,removecomment,"How is this""neutral news""? Literally, it's rep...",b756df5867ce042f3a07f3037e5eeeb9,Rule 5: top-level comment has no links,cgd7ut,5dea4ee94af43200093a2f4c
1,removecomment,Just wondering if you have any updates.,58843f7430c71f72208766a295eaae5e,Low effort top-level comment,cim6kf,5dea4ee94af43200093a2f4d
2,removecomment,.,f9dcef98ec140a8f44d5691d34081408,Rule 5: top-level comment has no links,cfzkky,5dea4ee94af43200093a2f4e
3,removecomment,I went to park,aab8641b0dc26f59e9c8fea95f470138,Low effort top-level comment,asrdao,5dea4ee94af43200093a2f4f
4,removecomment,"Hi, u/SFepicure\n\n[I think you may enjoy this...",e48fe0135108ee9985e260caf57cc6e0,remove,b47a1d,5dea4ee94af43200093a2f50
...,...,...,...,...,...,...
5997,removecomment,Oh I thought we were posting arbitrary facts t...,38e2216dc3dc2cb196cbf4bab0d83541,remove,j0ijwn,5f70e968dbe1ef0009d976e7
5998,removecomment,No. The senate is not wasting time to fill th...,c4f22a6532a4ca274c5027e4bc3d3d99,remove,j0ijwn,5f70e968dbe1ef0009d976e9
5999,approvecomment,"My point was not addressed, regarding providin...",062512f6041e1c40256c31f31a1138fc,unspam,j04eqv,5f70ee18dbe1ef0009d97717
6000,removecomment,Voting has begun in some states but the debate...,c696933b8cbd4b625077d6b577bac0fe,remove,j0ijwn,5f70f3f4dbe1ef0009d9775e


In [37]:
# Sorted list of the distinct submission ids that appear in the moderation log.
new_arr = np.asarray(bad_comment_data['submissionId'].tolist())
submissions = np.unique(new_arr).tolist()
submissions

['4o2o29',
 '4op948',
 '4sef35',
 '5x0k84',
 '64zsim',
 '68qqfz',
 '6907e3',
 '6mmw91',
 '6n161i',
 '6oftmx',
 '6xapy0',
 '7307wv',
 '76z4xh',
 '7ht8tj',
 '7hyg56',
 '7z5dyt',
 '804o3f',
 '862sn6',
 '8d0t2m',
 '8e7lrz',
 '8go559',
 '8hz10r',
 '8uby76',
 '8x2571',
 '8zp0uh',
 '91gwrp',
 '94q8j7',
 '96nrmf',
 '99fccg',
 '9c88xn',
 '9f2i6b',
 '9fj3m8',
 '9h0mo6',
 '9h82yb',
 '9ht83g',
 '9iqed1',
 '9j3gvd',
 '9jpdh7',
 '9kq8xy',
 '9ood1n',
 '9rbpt8',
 '9s3gd8',
 '9smhsl',
 '9xyyt0',
 '9zonfn',
 'a4tn3i',
 'a67z1j',
 'aab5gq',
 'ab7m8y',
 'ac7d56',
 'acz2f9',
 'ahxiba',
 'ailn92',
 'aj96gr',
 'ajvsq6',
 'alzqd0',
 'an9m5u',
 'andfa8',
 'apss2q',
 'asacv6',
 'asrdao',
 'auz29y',
 'axnhrj',
 'b47a1d',
 'b7e8o4',
 'b8cd1q',
 'b9d62h',
 'bj3tgb',
 'bjngf5',
 'bka66t',
 'blxd09',
 'bpi7u6',
 'busrz0',
 'bv8dl8',
 'bvk72a',
 'bxtj3y',
 'byiwll',
 'c109ai',
 'c3swuk',
 'c75b2i',
 'c7nqp7',
 'cess35',
 'cfa7tc',
 'cfa99a',
 'cfc5qa',
 'cfzkky',
 'cgd7ut',
 'cgks97',
 'cgkspq',
 'cgktg9',
 'cgkueu',

In [38]:
# Output column -> attribute read from each PRAW submission, in column order.
submission_fields = {
    'title': 'title',
    'score': 'score',
    'id': 'id',
    'url': 'url',
    'comms_num': 'num_comments',
    'created': 'created',
    'body': 'selftext',
}
topics_dict = {column: [] for column in submission_fields}

# One row per submission referenced by the moderation log.
for list_submission in submissions:
    submission = reddit.submission(id=list_submission)
    for column, attribute in submission_fields.items():
        topics_dict[column].append(getattr(submission, attribute))

topics_data = pd.DataFrame(topics_dict)

Now, we are going to collect the good comment data by scraping the 'neutralnews' subreddit.

In [39]:
# Column-wise accumulator for scraped comments; same schema as bad_comment_data.
comments_dict = {
    column: []
    for column in ("action", "content", "author", "details", "submissionId", "commentId")
}

In [40]:
subreddit = reddit.subreddit('neutralnews')

# Walk the top-level comments of every submission seen in the moderation log.
for list_submission in submissions:
    submission = reddit.submission(id=list_submission)
    # Resolve up to 100 "load more comments" placeholders before iterating.
    submission.comments.replace_more(limit=100)
    for comment in submission.comments:
        # "action" and "details" only exist for moderated comments, so they are NaN here.
        scraped_row = {
            "action": np.nan,
            "content": comment.body,
            "author": comment.author,
            "details": np.nan,
            "submissionId": submission.id,
            "commentId": comment.id,
        }
        for column, value in scraped_row.items():
            comments_dict[column].append(value)

good_comment_data = pd.DataFrame(comments_dict)

In [41]:
# Discard comments posted by the AutoModerator account.
good_comment_data = good_comment_data.loc[good_comment_data['author'] != 'AutoModerator']

In [42]:
# Renumber rows from 0; the old index is kept as an "index" column.
good_comment_data = good_comment_data.reset_index(drop=False)

In [43]:
# Remove the "index" column that reset_index() added.
good_comment_data = good_comment_data.drop(columns=['index'])
good_comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId
0,,"Just wanted to say ""thanks"" publicly. I'm ver...",zaphnod,,4o2o29,d498hvk
1,,"Hi, thanks for making this sub. A couple quest...",niugnep24,,4o2o29,d49b7t2
2,,"I'd like to say thanks to our ""focus group"" pa...",nosecohn,,4o2o29,d498cxc
3,,[removed],,,4o2o29,d4997am
4,,"Can we get ""Moderator Activity Log"" for /r/Neu...",bloodguard,,4o2o29,d49d04t
...,...,...,...,...,...,...
2606,,"r/NeutralNews is a **curated space**, but desp...",NeutralverseBot,,j0ijwn,g6rh4no
2607,,__I'm a bot. Here are The Factual credibility ...,TheFactualBot,,j0ijwn,g6rhg7f
2608,,[removed],,,j0ijwn,g6ud0cs
2609,,[removed],,,j0ijwn,g6ukt21


We can now combine the good comment data and bad comment data into one dataframe.

In [44]:
from datetime import datetime
# Wall-clock checkpoint (HH:MM:SS) for timing the previous step.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print('current time = ', current_time)

current time =  21:41:29


After combining the good and bad comment data, run Sam's code on the whole comment dataset, and then run the upvote predictor code on the resulting features.

In [47]:
# Write both comment frames to CSV, without the index column.
for frame, csv_path in (
    (good_comment_data, "good_comment_data_119.csv"),
    (bad_comment_data, "bad_comment_data_119.csv"),
):
    frame.to_csv(csv_path, index=False)

In [6]:
from newspaper import Article
from newspaper import Config

# newspaper Config whose browser user agent is set to a desktop Chrome string.
config = Config()
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config.browser_user_agent = user_agent

import spacy
from spacy import displacy
from collections import Counter
#!python -m spacy download en_core_web_lg
#!pip install spacy-wordnet

import en_core_web_lg
# Shared spaCy pipeline built from the large English model.
nlp = en_core_web_lg.load()

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
# Insert the WordNet annotator right after the 'tagger' component.
# NOTE(review): passing a component instance to add_pipe looks like the spaCy v2 API — confirm the pinned version.
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')


# spaCy's built-in English stop-word set.
STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_lg')


In [52]:
# Stack good comments on top of moderated ones under a fresh 0..n-1 index.
frames = [good_comment_data, bad_comment_data]
comment_data = pd.concat(objs=frames, axis=0, ignore_index=True)
comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId
0,,"Just wanted to say ""thanks"" publicly. I'm ver...",zaphnod,,4o2o29,d498hvk
1,,"Hi, thanks for making this sub. A couple quest...",niugnep24,,4o2o29,d49b7t2
2,,"I'd like to say thanks to our ""focus group"" pa...",nosecohn,,4o2o29,d498cxc
3,,[removed],,,4o2o29,d4997am
4,,"Can we get ""Moderator Activity Log"" for /r/Neu...",bloodguard,,4o2o29,d49d04t
...,...,...,...,...,...,...
8608,removecomment,Oh I thought we were posting arbitrary facts t...,38e2216dc3dc2cb196cbf4bab0d83541,remove,j0ijwn,5f70e968dbe1ef0009d976e7
8609,removecomment,No. The senate is not wasting time to fill th...,c4f22a6532a4ca274c5027e4bc3d3d99,remove,j0ijwn,5f70e968dbe1ef0009d976e9
8610,approvecomment,"My point was not addressed, regarding providin...",062512f6041e1c40256c31f31a1138fc,unspam,j04eqv,5f70ee18dbe1ef0009d97717
8611,removecomment,Voting has begun in some states but the debate...,c696933b8cbd4b625077d6b577bac0fe,remove,j0ijwn,5f70f3f4dbe1ef0009d9775e


In [7]:
# Reload the saved comment frames from CSV and rebuild the combined frame.
good_comment_data, bad_comment_data = (
    pd.read_csv(csv_path)
    for csv_path in ('good_comment_data_119.csv', 'bad_comment_data_119.csv')
)
frames = [good_comment_data, bad_comment_data]
comment_data = pd.concat(objs=frames, axis=0, ignore_index=True)
comment_data

Unnamed: 0,action,content,author,details,submissionId,commentId
0,,"Just wanted to say ""thanks"" publicly. I'm ver...",zaphnod,,4o2o29,d498hvk
1,,"Hi, thanks for making this sub. A couple quest...",niugnep24,,4o2o29,d49b7t2
2,,"I'd like to say thanks to our ""focus group"" pa...",nosecohn,,4o2o29,d498cxc
3,,[removed],,,4o2o29,d4997am
4,,"Can we get ""Moderator Activity Log"" for /r/Neu...",bloodguard,,4o2o29,d49d04t
...,...,...,...,...,...,...
8608,removecomment,Oh I thought we were posting arbitrary facts t...,38e2216dc3dc2cb196cbf4bab0d83541,remove,j0ijwn,5f70e968dbe1ef0009d976e7
8609,removecomment,No. The senate is not wasting time to fill th...,c4f22a6532a4ca274c5027e4bc3d3d99,remove,j0ijwn,5f70e968dbe1ef0009d976e9
8610,approvecomment,"My point was not addressed, regarding providin...",062512f6041e1c40256c31f31a1138fc,unspam,j04eqv,5f70ee18dbe1ef0009d97717
8611,removecomment,Voting has begun in some states but the debate...,c696933b8cbd4b625077d6b577bac0fe,remove,j0ijwn,5f70f3f4dbe1ef0009d9775e


In [93]:
import nltk
# Download the NLTK WordNet corpus.
# NOTE(review): presumably required by the spacy-wordnet annotator in the nlp pipeline — confirm.
nltk.download('wordnet')

from datetime import datetime
# Wall-clock checkpoint (HH:MM:SS) taken before the article download starts.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print('current time = ', current_time)

def clean_articles(topics_data, comment_data, text_list=None):
    """Download and parse the article linked from every submission.

    Each URL in ``topics_data['url']`` is fetched with newspaper's ``Article``
    (using the module-level ``config``). The article text is lower-cased and
    run through the module-level spaCy pipeline ``nlp``. The result is stored
    in a new ``text`` column of ``topics_data``. When an article cannot be
    downloaded or parsed, the placeholder string ``"error"`` is stored
    instead, and the comments belonging to that submission are excluded from
    the returned frame.

    Parameters
    ----------
    topics_data : pandas.DataFrame
        Needs ``id`` and ``url`` columns. Mutated in place: a ``text`` column
        is added or overwritten.
    comment_data : pandas.DataFrame
        Needs a ``submissionId`` column. Not mutated.
    text_list : list, optional
        List that receives one entry per article, in row order. It becomes the
        ``text`` column, so it should be empty on entry. Defaults to a new list.

    Returns
    -------
    pandas.DataFrame
        ``comment_data`` without the rows whose submission's article failed.
        Filtering a DataFrame creates a new object, so the caller must use
        this return value; the frame passed in is never changed.
    """
    if text_list is None:
        text_list = []

    failed_ids = []
    # Walk (id, url) pairs so a failure is attributed to its own submission
    # even when two submissions link to the same URL.
    for sub_id, url in zip(topics_data['id'], topics_data['url']):
        try:
            article = Article(url, language='en', fetch_images=False, config=config)
            article.download()
            article.parse()
            text_list.append(nlp(article.text.lower()))
        except Exception:
            # Best effort: one unreachable or unparsable article must not
            # abort the whole run. Catching Exception rather than using a bare
            # except keeps Ctrl-C able to stop this long loop.
            text_list.append("error")
            failed_ids.append(sub_id)

    topics_data['text'] = text_list
    return comment_data[~comment_data['submissionId'].isin(failed_ids)]

# Fetch and parse every linked article; this adds a 'text' column to topics_data in place.
# NOTE(review): the notebook-level comment_data is not filtered by this call.
clean_articles(topics_data, comment_data, [])
# url = 'https://www.washingtonpost.com/world/national-security/sessions-spoke-twice-with-russian-ambassador-during-trumps-presidential-campaign-justice-officials-say/2017/03/01/77205eda-feac-11e6-99b4-9e613afeb09f_story.html?hpid=hp_rhp-top-table-main_no-name%3Ahomepage%2Fstory&utm_term=.f2a14c329123'
# subId = topics_data[topics_data['url'] == url]['id']
# subId = subId.to_numpy()
# subId = subId[0]
# subId

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mattc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
current time =  23:16:59


In [94]:
from datetime import datetime
# Wall-clock checkpoint (HH:MM:SS) taken after the article download.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print('current time = ', current_time)

current time =  23:40:41


In [96]:
# Write the submissions, including the article 'text' column, to CSV.
topics_data.to_csv(path_or_buf="topics_data_119.csv", index=False)

In [8]:
# Load the saved submissions; the 'text' column comes back as plain strings.
topics_data = pd.read_csv(filepath_or_buffer='topics_data_119.csv')
topics_data

Unnamed: 0,title,score,id,url,comms_num,created,body,text
0,[META] Welcome to NeutralNews,225,4o2o29,https://www.reddit.com/r/neutralnews/comments/...,66,1.465955e+09,The goal of /r/NeutralNews is to provide a pla...,the goal of r/neutralnews is to provide a plac...
1,Wall Street has been rocked by an $8 billion h...,183,4op948,http://www.businessinsider.com/visium-asset-ma...,10,1.466297e+09,,"jake gottlieb, the founder of visium. reuters/..."
2,How a $2 Roadside Drug Test Sends Innocent Peo...,91,4sef35,http://www.nytimes.com/2016/07/10/magazine/how...,11,1.468315e+09,,field tests provide quick answers. but if thos...
3,Sessions spoke twice with Russian ambassador d...,668,5x0k84,https://www.washingtonpost.com/world/national-...,84,1.488450e+09,,error
4,The Russia story just keeps getting worse for ...,90,64zsim,http://www.cnn.com/2017/04/12/politics/trump-c...,31,1.492048e+09,,washington (cnn) two stories dealing with russ...
...,...,...,...,...,...,...,...,...
531,Trump falsely claimed an incident where an ele...,9,j078ii,https://www.businessinsider.com/trump-falsely-...,39,1.601160e+09,,president donald trump falsely claimed that an...
532,‘It’s like every red flag’: Trump-ordered HHS ...,5,j099m5,https://www.politico.com/news/2020/09/25/trump...,4,1.601167e+09,,"the $15 million contract, which has not been p..."
533,Fact-Checking Falsehoods on Mail-In Voting - V...,15,j09b0v,https://www.nytimes.com/article/fact-checking-...,3,1.601167e+09,,if you are among the tens of millions of ameri...
534,How a Pledge to Dismantle the Minneapolis Poli...,3,j0h85d,https://www.nytimes.com/2020/09/26/us/politics...,3,1.601195e+09,[deleted],"minneapolis — over three months ago, a majorit..."


In [18]:
#nlp = spacy.load('en_core_web_lg')
def getSimWordScore(comment_data, topics_data, simWordScore):
    """Score each comment by entity similarity to its submission's article.

    For every comment, the (up to) five most frequent named entities of the
    lower-cased comment text are compared with the (up to) five most frequent
    named entities of the article text. Each comment entity contributes its
    mean similarity against the article entity tokens, and the contributions
    are summed. A comment scores 0 when either side has no entities or when
    its submission is absent from ``topics_data``.

    Parameters
    ----------
    comment_data : pandas.DataFrame
        Needs ``submissionId`` and ``content`` columns.
    topics_data : pandas.DataFrame
        Needs ``id`` and ``text`` columns.
    simWordScore : list
        List the scores are appended to, one per comment in row order.

    Returns
    -------
    list
        The same ``simWordScore`` list.
    """
    # submission id -> first tokens of the article's top entities. The article
    # is parsed once per submission rather than once per comment; the scores
    # are unchanged because they depend only on the article text.
    article_tokens = {}

    def tokens_for(sub_id):
        if sub_id not in article_tokens:
            texts = topics_data.loc[topics_data['id'] == sub_id, 'text'].to_numpy()
            if len(texts) == 0:
                # Unknown submission: treat it like an article with no entities.
                article_tokens[sub_id] = []
            else:
                art_doc = nlp(str(texts[0]))
                top_entities = Counter(ent.text for ent in art_doc.ents).most_common(5)
                # Only the first token of each entity is kept.
                article_tokens[sub_id] = [nlp(name)[0] for name, _ in top_entities]
        return article_tokens[sub_id]

    for _, comment in comment_data.iterrows():
        art_tokens = tokens_for(comment['submissionId'])
        if not art_tokens:
            # Nothing to compare against, so skip parsing the comment.
            simWordScore.append(0)
            continue

        doc = nlp(str(comment['content']).lower())
        top_items = Counter(ent.text for ent in doc.ents).most_common(5)

        score = 0
        for item, _count in top_items:
            # The comment entity is compared as a whole doc, not a single token.
            token = nlp(item)
            similarities = [art_word.similarity(token) for art_word in art_tokens]
            score += sum(similarities) / len(similarities)
        simWordScore.append(score)
    return simWordScore

# Compute one entity-similarity score per comment and attach it as 'WordScore'.
wordScoreList = getSimWordScore(
    comment_data=comment_data,
    topics_data=topics_data,
    simWordScore=[],
)
comment_data['WordScore'] = wordScoreList

In [19]:
from datetime import datetime
# Wall-clock checkpoint (HH:MM:SS) taken after the WordScore pass.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print('current time = ', current_time)

current time =  11:16:40


In [21]:
# Write the comments with the WordScore feature to CSV.
comment_data.to_csv(path_or_buf="comment_data_1110.csv", index=False)

In [29]:
def getSimWholeScore(comment_data, topics_data, simWholeScore):
    """Score each comment by whole-document similarity to its article.

    The article text of the comment's submission and the lower-cased comment
    text are each run through the module-level spaCy pipeline ``nlp``, and the
    similarity of the two docs is recorded.

    Parameters
    ----------
    comment_data : pandas.DataFrame
        Needs ``submissionId`` and ``content`` columns.
    topics_data : pandas.DataFrame
        Needs ``id`` and ``text`` columns.
    simWholeScore : list
        List the scores are appended to, one per comment in row order.

    Returns
    -------
    list
        The same ``simWholeScore`` list.
    """
    # submission id -> parsed article doc, so each article is parsed once.
    article_docs = {}

    for _, comment in comment_data.iterrows():
        sub_id = comment['submissionId']
        if sub_id not in article_docs:
            # Take the actual cell value. str() of the selected Series would
            # yield pandas' display repr (index, truncated text, dtype footer)
            # rather than the article text.
            texts = topics_data.loc[topics_data['id'] == sub_id, 'text'].to_numpy()
            article_text = str(texts[0]) if len(texts) else ""
            article_docs[sub_id] = nlp(article_text)

        # Use the comment body only, not the string form of the whole row.
        comment_text = str(comment['content']).lower()
        doc = nlp(comment_text)

        simWholeScore.append(article_docs[sub_id].similarity(doc))
    return simWholeScore

# Compute one whole-document similarity score per comment and attach it as 'WholeScore'.
wholeScoreList = getSimWholeScore(
    comment_data=comment_data,
    topics_data=topics_data,
    simWholeScore=[],
)
comment_data['WholeScore'] = wholeScoreList

In [30]:
from datetime import datetime
# Wall-clock checkpoint (HH:MM:SS) taken after the WholeScore pass.
now = datetime.now()
current_time = f"{now:%H:%M:%S}"
print('current time = ', current_time)

current time =  12:15:46


In [32]:
# Overwrite the earlier CSV now that the WholeScore column has been added.
comment_data.to_csv(path_or_buf="comment_data_1110.csv", index=False)

In [33]:
# Write the same frame under a second file name.
comment_data.to_csv(path_or_buf='full_comment_data_with_features119.csv', index=False)