In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import praw
import datetime as dt
import random
import re
import string

import pickle
from compress_pickle import dump, load

import spacy
from spacy import displacy
from collections import Counter

import en_core_web_lg
nlp = en_core_web_lg.load()

STOP_WORDS = spacy.lang.en.stop_words.STOP_WORDS


In [3]:
def contains_url_feature(comment):
    urlarr = []
    urlarr = re.findall('[(]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(comment))
    if not urlarr:
        return False
    else:
        return True

#comment_text is the the comment and art_doc is the cleaned text of the article
def wordscore_feature(comment_text, art_doc):
    art_doc = nlp(str(art_doc))
    art_items = [x.text for x in art_doc.ents]
    #get tokens
    art_tokens = []
    for (item, count) in Counter(art_items).most_common(5):
        token = nlp(item)[0]
        art_tokens += [token]
        
    doc = nlp(str(comment_text).lower())
    items = [x.text for x in doc.ents]
    
    score = 0
    
    for (item, count) in Counter(items).most_common(5):
        
        token = nlp(item)
        
        wordScores = []
        
        for art_word in art_tokens:
            
            wordScores += [art_word.similarity(token)]
            
            if len(wordScores) != 0:
                score += sum(wordScores)/len(wordScores)
            else:
                score = 0
    return score

#comment_text is the the comment and art_doc is the cleaned text of the article
def wholescore_feature(comment_text, art_doc):
    art_doc = nlp(str(art_doc))
    comment_text = str(comment_text).lower()
    doc = nlp(comment_text)
    score = art_doc.similarity(doc)
    return score

def remove_urls(text):
    urlarr = re.findall('[(]?http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(text))
    for url in urlarr:
        text = text.replace(url, '')
    return text

def remove_stopwords(text):
    text = nlp(str(text))
    token_list = []
    for token in text:
        token_list.append(token.text)
        
    filtered_text = ''
    
    for word in token_list:
        lexeme = nlp.vocab[word]
        if lexeme.is_stop == False:
            if word in string.punctuation:
                filtered_text += word
            else:
                filtered_text += (' ' + word)
    return filtered_text

def adjwordscore_feature(comment_text, art_doc):
    adjustments = {"PERSON":0.4, "NORP":0.25, "FAC": 0.35, "ORG": 0.075, "GPE": 0.2, "LOC": 0.15, "PRODUCT": 0, "EVENT": 0.2, "WORK_OF_ART": 0, "LAW": 0, "LANGUAGE": 0.2, "DATE": 0.4, "TIME": 0.6, "PERCENT": 0.8, "MONEY": 0.5, "QUANTITY": 0.5, "ORDINAL": 0.25, "CARDINAL": 0.2}
    art_doc = nlp(str(art_doc))
    art_items = [x.text for x in art_doc.ents]
    art_labels = [x.label_ for x in art_doc.ents]

    art_tokens = []
    for (item, count) in Counter(art_items).most_common(5):
        token = nlp(item)[0]
        art_tokens += [token]
    comment_text = str(comment_text)
    doc = nlp(str(comment_text).lower())

    items = [x.text for x in doc.ents]
    labels = [x.label_ for x in doc.ents]

    score = 0

    for (item, count) in Counter(items).most_common(5):
        com_label = labels[items.index(item)]
        token = nlp(item)

        wordScores = []

        for art_word in art_tokens:
            art_label = art_labels[art_tokens.index(art_word)]

            if art_label == com_label:
                amt = adjustments[art_label]
                wordScores += [art_word.similarity(token) - amt]
            else:
                wordScores += [art_word.similarity(token)]
        
        if len(wordScores) != 0:
            score += sum(wordScores)/len(wordScores)
        else:
            score = 0

    return score

def ner_feature(comment_text, art_doc):
    art_doc = nlp(str(art_doc))
    art_items = [x.text for x in art_doc.ents]
    art_tokens = []
    for item in Counter(art_items):
        token = nlp(item)[0]
        art_tokens += [token]
    comment_text = str(comment_text)
    doc = nlp(str(comment_text).lower())
    items = [x.text for x in doc.ents]

    score = 0

    for item in Counter(items):
        token = nlp(item)
        if str(token) in str(art_tokens):
            score += 1
    return len(items), score

def length_feature(comment):
    comment = str(comment)
    length = len(comment)
    return length

#Send in comment text, reddit url, and feature list, change to give features
def big_func(comments, articles, swearwords, url_list, length_list, wordScore_list, 
             wholeScore_list, adjWordScore_list, no_url_wordScore_list, no_url_wholeScore_list, wordScore_noStop_list,
             wholeScore_noStop_list, no_url_stops_wordScore_list,  no_url_stops_wholeScore_list, ner_count_list,
             ner_match_list):
    for index, row in comments.iterrows():
        comment_text = row['Comment text']
        article_num = articles[articles['Article Number'] == row['Article Number']]
        article = articles[articles['Article Number'] == article_num]
        article_text = article['text']
        
        url_list.append(contains_url_feature(comment_text))

        length_list.append(length_feature(comment_text))

        wordScore_list.append(wordscore_feature(comment_text, article_text))
        wholeScore_list.append(wholescore_feature(comment_text, article_text))

        adjWordScore_list.append(adjwordscore_feature(comment_text, article_text))

        no_url_comment_text = remove_urls(comment_text)
        no_url_article_text = remove_urls(article_text)

        no_url_wordScore_list.append(wordscore_feature(no_url_comment_text, no_url_article_text))
        no_url_wholeScore_list.append(wholescore_feature(no_url_comment_text, no_url_article_text))

        comment_text = remove_stopwords(comment_text)
        cleaned_article_text = remove_stopwords(article_text)

        wordScore_noStop_list.append(wordscore_feature(comment_text, cleaned_article_text))
        wholeScore_noStop_list.append(wholescore_feature(comment_text, cleaned_article_text))

        comment_text = remove_urls(comment_text)
        cleaned_article_text = remove_urls(article_text)

        no_url_stops_wholeScore_list.append(wholescore_feature(comment_text, cleaned_article_text))
        no_url_stops_wordScore_list.append(wordscore_feature(comment_text, cleaned_article_text))

        NER_count, NER_match = ner_feature(comment_text, cleaned_article_text)
        ner_count_list.append(NER_count)
        ner_match_list.append(NER_match)

    
    return (url_list, length_list, wordScore_list, 
            wholeScore_list, adjWordScore_list, no_url_wordScore_list, no_url_wholeScore_list, 
            wordScore_noStop_list, wholeScore_noStop_list, no_url_stops_wordScore_list,  
            no_url_stops_wholeScore_list, ner_count_list, ner_match_list)

In [4]:
comments = pd.read_csv('Validation Comments - Sheet1.csv')
comments

Unnamed: 0,Article Number,Comment text,related score,composure score,source score,gut feeling,total,Unnamed: 7,Label
0,1,"What a farce. If not 19 then 20, if not 20 the...",7.0,3.0,0.0,3.0,13.0,,Bad
1,1,"""When the people find that they can vote thems...",7.0,5.0,3.0,3.0,18.0,,Bad
2,1,Interesting spin by the WSJ to note that the $...,8.0,4.0,4.0,5.0,21.0,,Good
3,1,"""Mr. Biden said Saturday that the checks would...",3.0,2.0,3.0,0.0,8.0,,Bad
4,1,"Free money Free money, get your free money... ...",8.0,1.0,0.0,0.0,9.0,,Bad
...,...,...,...,...,...,...,...,...,...
371,74,\nThe Democrats are no longer connected to the...,3.0,6.0,,3.0,12.0,,Bad
372,74,"The GOP acts like Biden owes them something, t...",3.0,6.0,,3.0,12.0,,Bad
373,74,The democrats left Joe Manchin years ago and I...,9.0,6.0,,7.0,22.0,,Good
374,74,Joe Biden said on the campaign trail that mine...,6.0,6.0,,6.0,18.0,,Bad


In [5]:
comments = comments.drop(['Unnamed: 7'], axis = 1)

In [6]:
articles = pd.read_csv('Validation Articles - Sheet1.csv')
bad = articles[articles['text'] == 'Error']
bad

Unnamed: 0.1,Unnamed: 0,Article Number,Article URL/text,text


In [7]:
swearwords_df = pd.read_csv('../files/edited-swear-words.csv')
swearwords = swearwords_df.swear.tolist()

url_list, length_list, wordScore_list, wholeScore_list, adjWordScore_list, no_url_wordScore_list, no_url_wholeScore_list, wordScore_noStop_list, wholeScore_noStop_list, no_url_stops_wordScore_list,  no_url_stops_wholeScore_list, ner_count_list, ner_match_list = big_func(comments, articles, swearwords, [], [], [], [], [], [], [], [], [], [], [], [], [])
comments['WordScore'] = wordScore_list
comments['WholeScore'] = wholeScore_list
comments['contains_url'] = url_list
comments['adjWordScore'] = adjWordScore_list
comments['no_url_WordScore'] = no_url_wordScore_list
comments['no_url_WholeScore'] = no_url_wholeScore_list
comments['WordScoreNoStop'] = wordScore_noStop_list
comments['WholeScoreNoStop'] = wholeScore_noStop_list
comments['no_url_or_stops_WholeScore'] = no_url_stops_wholeScore_list
comments['no_url_or_stops_WordScore'] = no_url_stops_wordScore_list
comments['NER_count'] = ner_count_list
comments['NER_match'] = ner_match_list
comments['length'] = length_list


  wordScores += [art_word.similarity(token)]
  wordScores += [art_word.similarity(token)]
  wordScores += [art_word.similarity(token) - amt]


In [8]:
comments

Unnamed: 0,Article Number,Comment text,related score,composure score,source score,gut feeling,total,Label,WordScore,WholeScore,...,adjWordScore,no_url_WordScore,no_url_WholeScore,WordScoreNoStop,WholeScoreNoStop,no_url_or_stops_WholeScore,no_url_or_stops_WordScore,NER_count,NER_match,length
0,1,"What a farce. If not 19 then 20, if not 20 the...",7.0,3.0,0.0,3.0,13.0,Bad,6.989692,0.455088,...,1.273692,6.989692,0.455088,4.826901,0.457190,0.467852,4.826901,2,0,72
1,1,"""When the people find that they can vote thems...",7.0,5.0,3.0,3.0,18.0,Bad,0.000000,0.352507,...,0.000000,0.000000,0.352507,1.246337,0.339320,0.361756,1.246337,2,0,120
2,1,Interesting spin by the WSJ to note that the $...,8.0,4.0,4.0,5.0,21.0,Good,2.059889,0.415445,...,0.433141,2.059889,0.415445,1.526402,0.379267,0.391046,1.526402,10,0,259
3,1,"""Mr. Biden said Saturday that the checks would...",3.0,2.0,3.0,0.0,8.0,Bad,0.290054,0.406714,...,0.067596,0.290054,0.406714,0.448589,0.421750,0.443767,0.448589,3,0,183
4,1,"Free money Free money, get your free money... ...",8.0,1.0,0.0,0.0,9.0,Bad,-0.240884,0.393341,...,-0.047110,-0.240884,0.393341,-0.240884,0.336735,0.354071,-0.240884,1,0,165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,74,\nThe Democrats are no longer connected to the...,3.0,6.0,,3.0,12.0,Bad,-1.080726,0.344537,...,-0.240814,-1.080726,0.344537,-0.649964,0.301120,0.322979,-0.649964,6,0,961
372,74,"The GOP acts like Biden owes them something, t...",3.0,6.0,,3.0,12.0,Bad,-0.663746,0.359682,...,-0.108143,-0.663746,0.359682,-0.745070,0.328813,0.348438,-0.745070,8,0,515
373,74,The democrats left Joe Manchin years ago and I...,9.0,6.0,,7.0,22.0,Good,0.967275,0.366591,...,0.220943,0.967275,0.366591,0.967275,0.314389,0.327972,0.967275,3,0,111
374,74,Joe Biden said on the campaign trail that mine...,6.0,6.0,,6.0,18.0,Bad,-0.427453,0.354744,...,-0.102493,-0.427453,0.354744,-0.427453,0.319018,0.335050,-0.427453,3,0,163


In [9]:
comments.to_csv('Validation Comments - Sheet1.csv', index = False)

# Used file to featurize comments, stuff below is from analyzing previous model

In [13]:
comments['model label'] = comments['model label'].replace({True : 'Good', False: "Bad"})
comments

Unnamed: 0,Article Number,Comment text,related score,composure score,source score,gut feeling,total,Label,model label
0,1,"What a farce. If not 19 then 20, if not 20 the...",7.0,3.0,0.0,3.0,13.0,Bad,Bad
1,1,"""When the people find that they can vote thems...",7.0,5.0,3.0,3.0,18.0,Bad,Good
2,1,Interesting spin by the WSJ to note that the $...,8.0,4.0,4.0,5.0,21.0,Good,Good
3,1,"""Mr. Biden said Saturday that the checks would...",3.0,2.0,3.0,0.0,8.0,Bad,Good
4,1,"Free money Free money, get your free money... ...",8.0,1.0,0.0,0.0,9.0,Bad,Bad
...,...,...,...,...,...,...,...,...,...
371,74,\nThe Democrats are no longer connected to the...,3.0,6.0,,3.0,12.0,Bad,Good
372,74,"The GOP acts like Biden owes them something, t...",3.0,6.0,,3.0,12.0,Bad,Good
373,74,The democrats left Joe Manchin years ago and I...,9.0,6.0,,7.0,22.0,Good,Good
374,74,Joe Biden said on the campaign trail that mine...,6.0,6.0,,6.0,18.0,Bad,Good


In [85]:
count = 0
for index, row in comments.iterrows():
    if row['Label'] == row['model label']:
        count += 1



In [86]:
count

205

In [88]:
205/376

0.5452127659574468

In [89]:
from sklearn.metrics import confusion_matrix

In [90]:
tn, fp, fn, tp = confusion_matrix(comments['Label'], comments['model label']).ravel()
print("True negative " + str(tn))
print("False Positive " + str(fp))
print("False negative " + str(fn))
print("True positive " + str(tp))

True negative 91
False Positive 130
False negative 41
True positive 114


A lot of false positives, it looks like we were harsher than our model

In [71]:
acc = 0
value = 0
for i in range(40):
    comments['Adjust Label'] = np.where(comments['total'] > i, 'Good', 'Bad')
    tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()
    if acc <  (tn + tp)/376:
        acc = (tn + tp)/376
        value = i
comments['Adjust Label'] = np.where(comments['total'] > value, 'Good', 'Bad')
tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()
print("True negative " + str(tn))
print("False Positive " + str(fp))
print("False negative " + str(fn))
print("True positive " + str(tp))
print("accuracy " + str(acc))
print("Good/bad cutoff " + str(value))

True negative 18
False Positive 14
False negative 114
True positive 230
accuracy 0.6595744680851063
Good/bad cutoff 5


Most accurate when good is greater than 5 at 65%. But that just lowers our bar so far that all the good model predictions are right. Our model predicted more good comments than bad comments

In [72]:
diff = 400
value = 0
for i in range(40):
    comments['Adjust Label'] = np.where(comments['total'] > i, 'Good', 'Bad')
    tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()
    if diff >  abs(fp - fn):
        diff = abs(fp - fn)
        value = i

comments['Adjust Label'] = np.where(comments['total'] > value, 'Good', 'Bad')
tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()

print("True negative " + str(tn))
print("False Positive " + str(fp))
print("False negative " + str(fn))
print("True positive " + str(tp))
print("difference " + str(diff))
print("accuracy " + str((tp + tn)/376))
print("Good/bad cutoff " + str(value))

True negative 56
False Positive 77
False negative 76
True positive 167
difference 1
accuracy 0.5930851063829787
Good/bad cutoff 14


Get the smallest difference between false positives and false negatives at cutoff of 14. Shows the cutoff that would be most balanced I think, and leads to a 60% accuracy.

In [73]:
diff = 400
value = 0
for i in range(40):
    comments['Adjust Label'] = np.where(comments['total'] > i, 'Good', 'Bad')
    tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()
    if diff >  abs(tp - tn):
        diff = abs(tp - tn)
        value = i

comments['Adjust Label'] = np.where(comments['total'] > value, 'Good', 'Bad')
tn, fp, fn, tp = confusion_matrix(comments['Adjust Label'], comments['model label']).ravel()

print("True negative " + str(tn))
print("False Positive " + str(fp))
print("False negative " + str(fn))
print("True positive " + str(tp))
print("difference " + str(diff))
print("accuracy " + str((tp + tn)/376))
print("Good/bad cutoff " + str(value))

True negative 100
False Positive 153
False negative 32
True positive 91
difference 9
accuracy 0.5079787234042553
Good/bad cutoff 21


In [80]:
count = 0
for index, row in comments.iterrows():
    if row['Label'] == 'Good':
        count += 1
count

155