# Sim score feature. Gets a similiarity score based on NERed words from the comment and article

In [1]:
import spacy
from spacy import displacy
from collections import Counter
#!python -m spacy download en_core_web_lg
#!pip install spacy-wordnet

import en_core_web_lg
nlp = en_core_web_lg.load()

from spacy_wordnet.wordnet_annotator import WordnetAnnotator
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

import pandas as pd
import numpy as np

# Changed to text without URL

In [2]:
def getSimWordScore(comment_data, topics_data, simWordScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId'] #get submission id from comment table
        post = topics_data[topics_data['id'] == subID]
        art_doc = post['text']
        
        art_doc = art_doc.to_numpy()
        art_doc = art_doc[0]
        art_doc = nlp(str(art_doc))
        #art_doc = nlp(art_doc)
        art_items = [x.text for x in art_doc.ents]
        #get tokens
        art_tokens = []
        for (item, count) in Counter(art_items).most_common(5):
            token = nlp(item)[0]
            art_tokens += [token]
        #get comment content and ner
        comment_text = comment['text_without_url']
        comment_text = str(comment_text)
        doc = nlp(str(comment_text).lower())

        items = [x.text for x in doc.ents]

        #initialize list of scores
        score = 0
        #for each token, get a score
        for (item, count) in Counter(items).most_common(5):

            #get token
            token = nlp(item)#[0]

            wordScores = []

            #for each article item
            for art_word in art_tokens:

                #add similarity score to list of scores
                wordScores += [art_word.similarity(token)]
            #get average score
            if len(wordScores) != 0:
                score += sum(wordScores)/len(wordScores)
            else:
                score = 0
        simWordScore.append(score)
    return simWordScore

## Sim whole score. Gets a similarity score based on the whole article and whole comment

# Changed to text without url

In [3]:
def getSimWholeScore(comment_data, topics_data, simWholeScore):
    for index, comment in comment_data.iterrows():
        subID = comment['submissionId']
        #get article text
        post = topics_data[topics_data['id'] == subID]
        art_doc = str(post['text'])
        art_doc = nlp(art_doc)
        #get comment content and ner
        comment_text = comment['text_without_url']
        comment_text = str(comment).lower()
        doc = nlp(comment_text)
        #compare to get a score
        simWholeScore.append(art_doc.similarity(doc))
    return simWholeScore

In [4]:
comments = pd.read_csv('files/compiled_comments_2_22_2021.csv')
topics = pd.read_csv('files/compiled_topics.csv')
comments

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,text_without_url,article_score,comment_score,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,So what are the implications here? Does it onl...,186,22,22,0.118280,1.000000,0.000000,False
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,Sadly this isn't new. Police officers use many...,89,6,7,0.067416,0.857143,0.000000,False
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,What's disturbing about this is that our gover...,89,2,7,0.022472,0.285714,-0.038865,False
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,What I find really concerning is the horrible ...,89,-1,7,-0.011236,-0.142857,-0.015000,False
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,This subject might have legs but this article ...,86,64,90,0.744186,0.711111,0.000000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,"Yes, while in East Baghdad my platoons mission...",477,7,429,0.014675,0.016317,1.020477,False
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,The [definition the FBI currently uses for int...,477,2,429,0.004193,0.004662,0.885283,False
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,[Yes.] Have a look at who they are allies with...,477,-3,429,-0.006289,-0.006993,0.161433,True
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,Has ANY Shia ever committed an act of terroris...,477,-3,429,-0.006289,-0.006993,1.143729,False


In [5]:
word_scores = getSimWordScore(comments, topics, [])
comments['no_url_WordScore'] = word_scores

  wordScores += [art_word.similarity(token)]


In [6]:
whole_scores = getSimWholeScore(comments, topics, [])
comments['no_url_WholeScore'] = whole_scores

In [7]:
comments

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,text_without_url,article_score,comment_score,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity,no_url_WordScore,no_url_WholeScore
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,So what are the implications here? Does it onl...,186,22,22,0.118280,1.000000,0.000000,False,0.000000,0.816813
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,Sadly this isn't new. Police officers use many...,89,6,7,0.067416,0.857143,0.000000,False,0.000000,0.884829
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,What's disturbing about this is that our gover...,89,2,7,0.022472,0.285714,-0.038865,False,-0.038865,0.866455
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,What I find really concerning is the horrible ...,89,-1,7,-0.011236,-0.142857,-0.015000,False,0.000000,0.884435
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,This subject might have legs but this article ...,86,64,90,0.744186,0.711111,0.000000,False,0.000000,0.835723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,"Yes, while in East Baghdad my platoons mission...",477,7,429,0.014675,0.016317,1.020477,False,1.070477,0.831097
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,The [definition the FBI currently uses for int...,477,2,429,0.004193,0.004662,0.885283,False,0.884132,0.870870
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,[Yes.] Have a look at who they are allies with...,477,-3,429,-0.006289,-0.006993,0.161433,True,0.217683,0.833056
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,Has ANY Shia ever committed an act of terroris...,477,-3,429,-0.006289,-0.006993,1.143729,False,1.293729,0.847163


In [8]:
updated_df = pd.read_csv('files/compiled_comments_2_23_2021.csv')
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,text_without_url,article_score,comment_score,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity,contains_!
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,So what are the implications here? Does it onl...,186,22,22,0.118280,1.000000,0.000000,False,False
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,Sadly this isn't new. Police officers use many...,89,6,7,0.067416,0.857143,0.000000,False,True
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,What's disturbing about this is that our gover...,89,2,7,0.022472,0.285714,-0.038865,False,False
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,What I find really concerning is the horrible ...,89,-1,7,-0.011236,-0.142857,-0.015000,False,False
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,This subject might have legs but this article ...,86,64,90,0.744186,0.711111,0.000000,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,"Yes, while in East Baghdad my platoons mission...",477,7,429,0.014675,0.016317,1.020477,False,False
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,The [definition the FBI currently uses for int...,477,2,429,0.004193,0.004662,0.885283,False,False
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,[Yes.] Have a look at who they are allies with...,477,-3,429,-0.006289,-0.006993,0.161433,True,False
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,Has ANY Shia ever committed an act of terroris...,477,-3,429,-0.006289,-0.006993,1.143729,False,False


In [9]:
updated_df['no_url_WordScore'] = comments['no_url_WordScore'] 
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,text_without_url,article_score,comment_score,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity,contains_!,no_url_WordScore
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,So what are the implications here? Does it onl...,186,22,22,0.118280,1.000000,0.000000,False,False,0.000000
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,Sadly this isn't new. Police officers use many...,89,6,7,0.067416,0.857143,0.000000,False,True,0.000000
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,What's disturbing about this is that our gover...,89,2,7,0.022472,0.285714,-0.038865,False,False,-0.038865
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,What I find really concerning is the horrible ...,89,-1,7,-0.011236,-0.142857,-0.015000,False,False,0.000000
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,This subject might have legs but this article ...,86,64,90,0.744186,0.711111,0.000000,False,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,"Yes, while in East Baghdad my platoons mission...",477,7,429,0.014675,0.016317,1.020477,False,False,1.070477
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,The [definition the FBI currently uses for int...,477,2,429,0.004193,0.004662,0.885283,False,False,0.884132
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,[Yes.] Have a look at who they are allies with...,477,-3,429,-0.006289,-0.006993,0.161433,True,False,0.217683
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,Has ANY Shia ever committed an act of terroris...,477,-3,429,-0.006289,-0.006993,1.143729,False,False,1.293729


In [10]:
updated_df['no_url_WholeScore'] = comments['no_url_WholeScore']
updated_df

Unnamed: 0,action,content,author,details,submissionId,commentId,WordScore,WholeScore,tfidf,contains_url,...,article_score,comment_score,all_comments_scores,comment_article_score_ratio,comment_comments_score_ratio,adjWordScore,profanity,contains_!,no_url_WordScore,no_url_WholeScore
0,,So what are the implications here? Does it onl...,Cody_Fox23,,4op948,d4eictg,0.000000,0.849655,0.001573,False,...,186,22,22,0.118280,1.000000,0.000000,False,False,0.000000,0.816813
1,,Sadly this isn't new. Police officers use many...,DrFrenchman,,4sef35,d58ts90,0.000000,0.900283,0.255802,False,...,89,6,7,0.067416,0.857143,0.000000,False,True,0.000000,0.884829
2,,What's disturbing about this is that our gover...,bbakks,,4sef35,d58y081,-0.038865,0.869078,0.000000,False,...,89,2,7,0.022472,0.285714,-0.038865,False,False,-0.038865,0.866455
3,,What I find really concerning is the horrible ...,poliscijunki,,4sef35,d5919n8,0.000000,0.898426,0.000000,True,...,89,-1,7,-0.011236,-0.142857,-0.015000,False,False,0.000000,0.884435
4,,This subject might have legs but this article ...,interweb1,,64zsim,dg6l969,0.000000,0.850127,0.000000,False,...,86,64,90,0.744186,0.711111,0.000000,False,False,0.000000,0.835723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10231,,"Yes, while in East Baghdad my platoons mission...",CapitalCockroach,,bav0rl,ekggrgk,1.070477,0.840028,0.000000,True,...,477,7,429,0.014675,0.016317,1.020477,False,False,1.070477,0.831097
10232,,The [definition the FBI currently uses for int...,CQME,,bav0rl,ekyelps,0.941533,0.882768,0.217543,True,...,477,2,429,0.004193,0.004662,0.885283,False,False,0.884132,0.870870
10233,,[Yes.](https://en.m.wikipedia.org/wiki/Islamic...,Silent_As_The_Grave_,,bav0rl,ekehcqg,0.217683,0.779386,0.000000,True,...,477,-3,429,-0.006289,-0.006993,0.161433,True,False,0.217683,0.833056
10234,,Has ANY Shia ever committed an act of terroris...,bsmdphdjd,,bav0rl,ekfp4ls,1.293729,0.861529,0.000000,False,...,477,-3,429,-0.006289,-0.006993,1.143729,False,False,1.293729,0.847163


In [12]:
updated_df.to_csv('files/compiled_comments_2_24_2021.csv')