# Sim score feature. Gets a similiarity score based on NERed words from the comment and article

In [1]:
import spacy
from spacy import displacy
from collections import Counter
#!python -m spacy download en_core_web_lg
#!pip install spacy-wordnet

import en_core_web_lg
nlp = en_core_web_lg.load()

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

import pandas as pd
import numpy as np

# Changed to text without URL

In [2]:
def getSimWordScore(comment_data, topics_data, simWordScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId'] #get submission id from comment table
        post = topics_data[topics_data['id'] == subID]
        art_doc = post['text']
        
        art_doc = art_doc.to_numpy()
        art_doc = art_doc[0]
        art_doc = nlp(str(art_doc))
        #art_doc = nlp(art_doc)
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
        #get comment content and ner
        comment_text = comment['text_without_url']
        comment_text = str(comment_text)
        doc = nlp(str(comment_text).lower())

        items = [x.text for x in doc.ents]

        #initialize list of scores
        score = 0
        #for each token, get a score
        for (item, count) in Counter(items).most_common(5):

            #get token
            token = nlp(item)#[0]

            wordScores = []

            #for each article item
            for art_word in art_tokens:

                #add similarity score to list of scores
                wordScores += [art_word.similarity(token)]
            #get average score
            if len(wordScores) != 0:
                score += sum(wordScores)/len(wordScores)
            else:
                score = 0
        simWordScore.append(score)
    return simWordScore

## Sim whole score. Gets a similarity score based on the whole article and whole comment

# Changed to text without url

In [3]:
def getSimWholeScore(comment_data, topics_data, simWholeScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId']
        #get article text
        post = topics_data[topics_data['id'] == subID]
        art_doc = str(post['text'])
        art_doc = nlp(art_doc)
        #get comment content and ner
        comment_text = comment['text_without_url']
        comment_text = str(comment).lower()
        doc = nlp(comment_text)
        #compare to get a score
        simWholeScore.append(art_doc.similarity(doc))
    return simWholeScore

In [6]:
comments = pd.read_csv('files/comments_no_stops.csv')
topics = pd.read_csv('files/topics_no_stops.csv')
comments

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,contains_url,text_without_url
0,,So implications here? Does affect involved Vis...,Cody_Fox23,,4op948,d4eictg,0.000000,0.773069,False,So implications here? Does affect involved Vis...
1,,Sadly isn't new. Police officers use faulty te...,DrFrenchman,,4sef35,d58ts90,0.000000,0.857654,False,Sadly isn't new. Police officers use faulty te...
2,,What's disturbing government destroying lives ...,bbakks,,4sef35,d58y081,-0.038865,0.833865,False,What's disturbing government destroying lives ...
3,,What I find concerning horrible response law e...,poliscijunki,,4sef35,d5919n8,0.000000,0.865826,True,What I find concerning horrible response law e...
4,,This subject legs article opinion piece editor...,interweb1,,64zsim,dg6l969,0.000000,0.826162,False,This subject legs article opinion piece editor...
...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, East Baghdad platoons mission check build...",CapitalCockroach,,bav0rl,ekggrgk,1.000532,0.827872,True,"Yes, East Baghdad platoons mission check build..."
10232,,The [definition FBI currently uses internation...,CQME,,bav0rl,ekyelps,0.606157,0.852373,True,The [definition FBI currently uses internation...
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.369440,0.782545,True,[Yes.] Have look allies with. Hezbollah fucks ...
10234,,Has ANY Shia committed act terrorism U.S.?\n\n...,bsmdphdjd,,bav0rl,ekfp4ls,1.788263,0.834425,False,Has ANY Shia committed act terrorism U.S.?\n\n...


In [7]:
word_scores = getSimWordScore(comments, topics, [])
comments['no_url_WordScore'] = word_scores

  wordScores += [art_word.similarity(token)]


In [8]:
whole_scores = getSimWholeScore(comments, topics, [])
comments['no_url_WholeScore'] = whole_scores

In [9]:
comments

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,contains_url,text_without_url,no_url_WordScore,no_url_WholeScore
0,,So implications here? Does affect involved Vis...,Cody_Fox23,,4op948,d4eictg,0.000000,0.773069,False,So implications here? Does affect involved Vis...,0.000000,0.736582
1,,Sadly isn't new. Police officers use faulty te...,DrFrenchman,,4sef35,d58ts90,0.000000,0.857654,False,Sadly isn't new. Police officers use faulty te...,0.000000,0.844658
2,,What's disturbing government destroying lives ...,bbakks,,4sef35,d58y081,-0.038865,0.833865,False,What's disturbing government destroying lives ...,-0.038865,0.785302
3,,What I find concerning horrible response law e...,poliscijunki,,4sef35,d5919n8,0.000000,0.865826,True,What I find concerning horrible response law e...,0.000000,0.852412
4,,This subject legs article opinion piece editor...,interweb1,,64zsim,dg6l969,0.000000,0.826162,False,This subject legs article opinion piece editor...,0.000000,0.804306
...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, East Baghdad platoons mission check build...",CapitalCockroach,,bav0rl,ekggrgk,1.000532,0.827872,True,"Yes, East Baghdad platoons mission check build...",1.000532,0.788655
10232,,The [definition FBI currently uses internation...,CQME,,bav0rl,ekyelps,0.606157,0.852373,True,The [definition FBI currently uses internation...,0.600762,0.843292
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.369440,0.782545,True,[Yes.] Have look allies with. Hezbollah fucks ...,0.369440,0.800717
10234,,Has ANY Shia committed act terrorism U.S.?\n\n...,bsmdphdjd,,bav0rl,ekfp4ls,1.788263,0.834425,False,Has ANY Shia committed act terrorism U.S.?\n\n...,1.788263,0.792615


In [11]:
updated_df = pd.read_csv('files/compiled_comments_2_24_2021.csv')
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,...,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity,contains_!,no_url_WordScore,no_url_WholeScore,WordScoreNoStop,WholeScoreNoStop
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,...,22,0.118280,1.000000,0.000000,False,False,0.000000,0.816813,0.000000,0.773069
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,...,7,0.067416,0.857143,0.000000,False,True,0.000000,0.884829,0.000000,0.857654
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,...,7,0.022472,0.285714,-0.038865,False,False,-0.038865,0.866455,-0.038865,0.833865
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,...,7,-0.011236,-0.142857,-0.015000,False,False,0.000000,0.884435,0.000000,0.865826
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,...,90,0.744186,0.711111,0.000000,False,False,0.000000,0.835723,0.000000,0.826162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,...,429,0.014675,0.016317,1.020477,False,False,1.070477,0.831097,1.000532,0.827872
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,...,429,0.004193,0.004662,0.885283,False,False,0.884132,0.870870,0.606157,0.852373
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,...,429,-0.006289,-0.006993,0.161433,True,False,0.217683,0.833056,0.369440,0.782545
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,...,429,-0.006289,-0.006993,1.143729,False,False,1.293729,0.847163,1.788263,0.834425


In [14]:
updated_df['no_url_or_stops_WordScore'] = comments['no_url_WordScore'] 
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,...,comment_comments_score_ratio,adjWordScore,profanity,contains_!,no_url_WordScore,no_url_WholeScore,WordScoreNoStop,WholeScoreNoStop,no_url_or_stops_WholeScore,no_url_or_stops_WordScore
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,...,1.000000,0.000000,False,False,0.000000,0.816813,0.000000,0.773069,0.736582,0.000000
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,...,0.857143,0.000000,False,True,0.000000,0.884829,0.000000,0.857654,0.844658,0.000000
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,...,0.285714,-0.038865,False,False,-0.038865,0.866455,-0.038865,0.833865,0.785302,-0.038865
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,...,-0.142857,-0.015000,False,False,0.000000,0.884435,0.000000,0.865826,0.852412,0.000000
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,...,0.711111,0.000000,False,False,0.000000,0.835723,0.000000,0.826162,0.804306,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,...,0.016317,1.020477,False,False,1.070477,0.831097,1.000532,0.827872,0.788655,1.000532
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,...,0.004662,0.885283,False,False,0.884132,0.870870,0.606157,0.852373,0.843292,0.600762
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,...,-0.006993,0.161433,True,False,0.217683,0.833056,0.369440,0.782545,0.800717,0.369440
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,...,-0.006993,1.143729,False,False,1.293729,0.847163,1.788263,0.834425,0.792615,1.788263


In [15]:
updated_df['no_url_or_stops_WholeScore'] = comments['no_url_WholeScore']
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,...,comment_comments_score_ratio,adjWordScore,profanity,contains_!,no_url_WordScore,no_url_WholeScore,WordScoreNoStop,WholeScoreNoStop,no_url_or_stops_WholeScore,no_url_or_stops_WordScore
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,...,1.000000,0.000000,False,False,0.000000,0.816813,0.000000,0.773069,0.736582,0.000000
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,...,0.857143,0.000000,False,True,0.000000,0.884829,0.000000,0.857654,0.844658,0.000000
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,...,0.285714,-0.038865,False,False,-0.038865,0.866455,-0.038865,0.833865,0.785302,-0.038865
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,...,-0.142857,-0.015000,False,False,0.000000,0.884435,0.000000,0.865826,0.852412,0.000000
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,...,0.711111,0.000000,False,False,0.000000,0.835723,0.000000,0.826162,0.804306,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,...,0.016317,1.020477,False,False,1.070477,0.831097,1.000532,0.827872,0.788655,1.000532
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,...,0.004662,0.885283,False,False,0.884132,0.870870,0.606157,0.852373,0.843292,0.600762
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,...,-0.006993,0.161433,True,False,0.217683,0.833056,0.369440,0.782545,0.800717,0.369440
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,...,-0.006993,1.143729,False,False,1.293729,0.847163,1.788263,0.834425,0.792615,1.788263


In [16]:
updated_df['no_url_or_stops_content'] = comments['text_without_url']

In [17]:
updated_df.to_csv('files/compiled_comments_2_25_2021.csv')