-----
### import libraries needed to analyses

In [73]:
from itertools import combinations

import en_core_web_sm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy

### Load in one of spacy english trained pipelines, en_core_web_sm: optimized for CPU:

* toke2vec
* tagger
* parser
* senter
* ner
* attribute_ruler
* lemmatizer

In [20]:
nlp_model = en_core_web_sm.load()

### Load in reddit vac data

In [9]:
processed_data = pd.read_csv("../data/filtered_data.csv")

In [12]:
processed_data.head(6)

Unnamed: 0,text,score,comms_num,year
0,Health Canada approves AstraZeneca COVID-19 va...,7,0,2021
1,COVID-19 in Canada: 'Vaccination passports' a ...,2,1,2021
2,Coronavirus variants could fuel Canada's third...,6,0,2021
3,Canadian government to extend COVID-19 emergen...,1,0,2021
4,Canada: Pfizer is 'extremely committed' to mee...,6,0,2021
5,Canada: Oxford-AstraZeneca vaccine approval ex...,5,0,2021


### test on one data in processed_data

In [31]:
for _, row in processed_data.head(6).iterrows():
    doc = nlp_model(row["text"])
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_,
              token.dep_, token.shape_, token.is_alpha, token.is_stop)
        print("---"*16)

Health Health PROPN NNP compound Xxxxx True False
------------------------------------------------
Canada Canada PROPN NNP nsubj Xxxxx True False
------------------------------------------------
approves approve VERB VBZ ROOT xxxx True False
------------------------------------------------
AstraZeneca AstraZeneca PROPN NNP compound XxxxxXxxxx True False
------------------------------------------------
COVID-19 covid-19 NOUN NN compound XXXX-dd False False
------------------------------------------------
vaccine vaccine NOUN NN dobj xxxx True False
------------------------------------------------
COVID-19 covid-19 NOUN NN dep XXXX-dd False False
------------------------------------------------
in in ADP IN prep xx True True
------------------------------------------------
Canada Canada PROPN NNP pobj Xxxxx True False
------------------------------------------------
: : PUNCT : punct : False False
------------------------------------------------
' ' PUNCT '' punct ' False False
---------

### check for named entities

In [32]:
for _, row in processed_data.head(6).iterrows():
    doc = nlp_model(row["text"])
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
        print("---"*16)

Health Canada 0 13 ORG
------------------------------------------------
AstraZeneca COVID-19 23 43 PRODUCT
------------------------------------------------
COVID-19 0 8 PERSON
------------------------------------------------
Canada 12 18 GPE
------------------------------------------------
Canada 32 38 GPE
------------------------------------------------
third 41 46 ORDINAL
------------------------------------------------
Canadian 0 8 NORP
------------------------------------------------
Canada 0 6 GPE
------------------------------------------------
Pfizer 8 14 PERSON
------------------------------------------------
Canada 0 6 GPE
------------------------------------------------
Oxford-AstraZeneca 8 26 ORG
------------------------------------------------
this week 53 62 DATE
------------------------------------------------


### Check for word similarity:

* word vectors or word embeddings using word2Vec not in sm so we will use lg 

In [33]:
import en_core_web_lg

In [35]:
nlp_model = en_core_web_lg.load()

In [38]:
for _, row in processed_data.head(6).iterrows():
    doc = nlp_model(row["text"])
    for token in doc:
        print(token.text, token.has_vector, token.vector_norm, token.is_oov, token.vector.shape)
        print("---"*16)

Health True 6.9867296 False (300,)
------------------------------------------------
Canada True 6.827161 False (300,)
------------------------------------------------
approves True 6.2131886 False (300,)
------------------------------------------------
AstraZeneca True 7.93687 False (300,)
------------------------------------------------
COVID-19 False 0.0 True (300,)
------------------------------------------------
vaccine True 8.14709 False (300,)
------------------------------------------------
COVID-19 False 0.0 True (300,)
------------------------------------------------
in True 5.0929856 False (300,)
------------------------------------------------
Canada True 6.827161 False (300,)
------------------------------------------------
: True 5.474056 False (300,)
------------------------------------------------
' True 6.137929 False (300,)
------------------------------------------------
Vaccination True 7.6967497 False (300,)
------------------------------------------------
passports

### Check highest rated comments are similar or lowest rated comments are similar

In [47]:
top_10_rated = processed_data.sort_values("score", ascending=False)[:10][["text"]]

In [49]:
bottom_10_rated = processed_data.sort_values("score")[:10][["text"]]

In [70]:
top_sim_scores = []
for pair in combinations(top_10_rated.text, 2):
    doc1, doc2 = nlp_model(pair[0]), nlp_model(pair[1])
    score = doc1.similarity(doc2)
    print(doc1, "<->", score)
    top_sim_scores.append(score)
    print("---"*16)

I would rage if this was handed to me... <-> 0.5837101095024435
------------------------------------------------
I would rage if this was handed to me... <-> 0.8765349747957699
------------------------------------------------
I would rage if this was handed to me... <-> 0.31949500986160057
------------------------------------------------
I would rage if this was handed to me... <-> 0.7502624636134892
------------------------------------------------
I would rage if this was handed to me... <-> 0.8992883353244511
------------------------------------------------
I would rage if this was handed to me... <-> 0.7135347295284674
------------------------------------------------
I would rage if this was handed to me... <-> 0.8314126433665695
------------------------------------------------
I would rage if this was handed to me... <-> 0.5741413619465091
------------------------------------------------
I would rage if this was handed to me... <-> 0.7802127625303276
-------------------------------

Vaccines exposed <-> 0.3828997484841429
------------------------------------------------
Vaccines exposed <-> 0.28787379814897146
------------------------------------------------
Vaccines exposed <-> 0.6030806313231314
------------------------------------------------
Vaccines exposed <-> 0.5413876491795431
------------------------------------------------
Vaccines exposed <-> 0.3403616025504241
------------------------------------------------
Meet my friend's anti-vax wife <-> 0.7296541712681599
------------------------------------------------
Meet my friend's anti-vax wife <-> 0.5164535915120289
------------------------------------------------
Meet my friend's anti-vax wife <-> 0.6587806225488709
------------------------------------------------
Meet my friend's anti-vax wife <-> 0.5248369958040966
------------------------------------------------
Meet my friend's anti-vax wife <-> 0.712503173936041
------------------------------------------------
Oh no! I got vaccinated! And I'm complet

In [71]:
bottom_sim_scores = []
for pair in combinations(bottom_10_rated.text, 2):
    doc1, doc2 = nlp_model(pair[0]), nlp_model(pair[1])
    score = doc1.similarity(doc2)
    print(doc1, "<->", score)
    bottom_sim_scores.append(score)
    print("---"*16)

well i DO. you disrespectful sheep. the natural scent lures out impurities in the mind and it’s our diseases! the crystals then kill these diseases on the outside <-> 0.9113625512247414
------------------------------------------------
well i DO. you disrespectful sheep. the natural scent lures out impurities in the mind and it’s our diseases! the crystals then kill these diseases on the outside <-> 0.8556950758994013
------------------------------------------------
well i DO. you disrespectful sheep. the natural scent lures out impurities in the mind and it’s our diseases! the crystals then kill these diseases on the outside <-> 0.9307129956274334
------------------------------------------------
well i DO. you disrespectful sheep. the natural scent lures out impurities in the mind and it’s our diseases! the crystals then kill these diseases on the outside <-> 0.8247784080448529
------------------------------------------------
well i DO. you disrespectful sheep. the natural scent lures 

>the CONCEPT that molecules are NOT the sum of their atomic parts ALWAYS shines throughOk, this is true, but thimerosal is still toxic and compounds of toxic heavy metals are still toxic in the VAST majority of cases>Do you know what mercuric, gold, or palladium catalysts are?Irrelevant but yes, I do>Your ignorance shines through your words.I am not attacking you personally, just pointing out errors in your statment, so please don't attack me <-> 0.9623364679697192
------------------------------------------------
Physicians for Informed Consent is a group of board certified medical doctors, neurobiologists, immunologists, nurses & lawyers. Members have also testified in front of Congress regarding vaccine safety. I don't see how that's a conspiracy <-> 0.903295731801224
------------------------------------------------
Physicians for Informed Consent is a group of board certified medical doctors, neurobiologists, immunologists, nurses & lawyers. Members have also testified in front of C

In [76]:
(np.mean(top_sim_scores),
np.std(top_sim_scores),
np.median(top_sim_scores),
np.max(top_sim_scores),
np.min(top_sim_scores))

(0.6012957775060893,
 0.17972013770551792,
 0.6036253414137118,
 0.9264170738385475,
 0.19991686161043892)

In [77]:
(np.mean(bottom_sim_scores),
np.std(bottom_sim_scores),
np.median(bottom_sim_scores),
np.max(bottom_sim_scores),
np.min(bottom_sim_scores))

(0.8510925100785558,
 0.11249667136610927,
 0.8862795497632656,
 0.9721250974083573,
 0.5941047970364449)

### It appears the the bottom scores are a lot more similar, mostly has to do with anti vax