In [95]:
import spacy
from spacy.lang.en import stop_words
import pandas as pd
import numpy as np
from collections import Counter
from glob import glob
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Plot configuration: set the default figure size for this notebook; the
# `%matplotlib inline` magic that follows displays plots in this notebook,
# instead of externally.
from pylab import rcParams
rcParams['figure.figsize'] = 16, 8
%matplotlib inline

# The following are optional dependencies. 
# Feel free to comment these out. Otherwise you need to pip install them
# Sent2tree uses the sent2tree.py module in this repository. 
from sent2tree import sentenceTree
import ete3 # data visualisation library
import seaborn # data visualisation library

from lmpylib.core import *

In [2]:
# Load the spaCy pipeline that every later cell uses to parse text.
nlp = spacy.load(name='en_core_web_lg')

In [6]:
# Read the opinion article from disk and parse it with the spaCy pipeline.
# The encoding is passed explicitly because the article contains non-ASCII
# punctuation (curly apostrophes); without it, decoding depends on the
# platform's default encoding. Assumes the file is saved as UTF-8.
with open("../pratical/data/coronavirus-masks.txt", encoding="utf-8") as f:
    opinion_text = f.read()
opinion = nlp(opinion_text)

In [7]:
# Load the reader comments and parse each one with spaCy.
comments_df = pd.read_csv("../pratical/data/coronavirus-masks-comments.csv")
# nlp.pipe is spaCy's batch interface: it streams the texts through the
# pipeline and yields one Doc per text, in input order, which is more
# efficient than calling nlp() on each comment separately.
comments = list(nlp.pipe(comments_df["comment"]))
# "length" is len() of each parsed Doc (its token count, per spaCy's Doc API).
comments_df["length"] = [len(c) for c in comments]
comments_df

[106,
 115,
 31,
 275,
 161,
 83,
 15,
 8,
 116,
 67,
 29,
 15,
 53,
 35,
 17,
 223,
 19,
 55,
 133,
 84,
 20,
 28,
 147,
 345,
 72,
 74,
 190,
 109,
 283,
 166,
 113,
 107,
 21,
 75,
 36,
 76,
 50,
 13,
 91,
 73,
 110,
 96,
 299,
 29,
 10,
 54,
 83,
 302,
 4,
 113,
 58,
 149,
 39,
 74,
 46,
 37,
 49,
 55,
 34,
 29,
 55,
 101,
 134,
 86,
 178,
 276,
 206,
 67,
 136,
 67,
 313,
 104,
 174,
 217,
 144,
 201,
 56,
 130,
 187,
 170,
 105,
 92,
 68,
 72,
 94,
 170,
 216,
 17,
 66,
 233,
 145,
 117,
 198,
 213,
 122,
 9,
 130,
 98,
 28,
 206,
 32,
 196,
 35,
 131,
 21,
 80,
 81,
 61,
 28,
 39,
 18,
 4,
 10,
 57,
 101,
 21,
 15,
 236,
 27,
 26,
 23,
 39,
 9,
 58,
 6,
 30,
 33,
 8,
 179,
 26,
 28,
 30,
 32,
 41,
 22,
 29,
 65,
 88,
 60,
 269,
 41,
 275,
 118,
 197,
 11,
 155,
 21,
 29,
 66,
 23,
 4,
 6,
 123,
 32,
 20,
 55,
 106,
 271,
 68,
 92,
 301,
 82,
 21,
 16,
 55,
 74,
 127,
 21,
 117,
 69,
 19,
 43,
 315,
 30,
 18,
 34,
 29,
 24,
 159,
 90,
 41,
 92,
 45,
 23,
 124,
 144,
 67,
 208,
 1

In [17]:
# Keep the first parsed comment in its own variable and display it.
comment0 = comments[0]
comment0

On social media I have seen people give bizarre reasons against wearing masks. These make me to wonder what has happened to American critical thinking skills. 

At a Ventura CA county government entity, a stable genius follower said that wearing a mask "is an act of submission, subjugation, of slavery, of shame." SEE: 
twitter.com/nowthisnews/status/1278909582991953920

In Florida stable genius followers spoke out against a mask mandate. Some of the reasons given involved satanism, death, and pedophilia. SEE:
twitter.com/nowthisnews/status/1277814869840953345

Scary stuff! So much for American exceptionalism!

In [21]:
# Materialise the sentence spans of the opinion article so they are displayed.
[sentence for sentence in opinion.sents]

[Wear a mask.
 ,
 Seriously, just wear one.,
 Almost any mask will do, really.,
 N95, surgical, spandex, homespun cotton.,
 For people who aren’t front-line health care workers, what matters is whatever you can get your hands on that fits over your nose and mouth.
 ,
 As the nation plunges for a second time into the depths of this brutal pandemic, officials worry we’ll soon have as many as 100,000 new cases every day.,
 Summer won’t save us.,
 Neither will bluster or bleach.
 ,
 It’s easy to want to give up, but it would be wrong.,
 Wearing a mask is not only simple and cheap, it’s also proved to be effective in slowing the virus’s spread.,
 It will protect the health and even save the lives of your loved ones, your neighbors and people you don’t know.
 ,
 This isn’t hard.,
 If the lower half of your face is not covered when you go out in public, stop searching for excuses and go mask up.
 ,
 Think about it as good hygiene and common courtesy.,
 Would you sneeze into your hand and then

In [22]:
# Distinct entity labels that occur in the opinion article.
{entity.label_ for entity in opinion.ents}

{'CARDINAL',
 'DATE',
 'EVENT',
 'GPE',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'WORK_OF_ART'}

In [41]:
# Every entity found in the opinion article, paired with its label,
# in document order.
[
    (entity, entity.label_)
    for entity in opinion.ents
]

[(N95, 'ORG'),
 (second, 'ORDINAL'),
 (as many as 100,000, 'CARDINAL'),
 (every day, 'DATE'),
 (Summer, 'DATE'),
 (half, 'CARDINAL'),
 (Americans, 'NORP'),
 (daily, 'DATE'),
 (nearly 5 percent, 'PERCENT'),
 ($1 trillion, 'MONEY'),
 (this week, 'DATE'),
 (Goldman Sachs, 'ORG'),
 (Independence Day, 'EVENT'),
 (Republicans, 'NORP'),
 (Alex Castellanos, 'PERSON'),
 (Republican, 'NORP'),
 (The Washington Post, 'ORG'),
 (Christians, 'NORP'),
 (Muslims, 'NORP'),
 (the Church of Secular Science, 'ORG'),
 (Americans, 'NORP'),
 (Republican, 'NORP'),
 (Democrat, 'NORP'),
 (Dozens, 'CARDINAL'),
 (the United States, 'GPE'),
 (Republicans, 'NORP'),
 (Thursday, 'DATE'),
 (Greg Abbott, 'PERSON'),
 (Texas, 'GPE'),
 (Senate, 'ORG'),
 (Mitch McConnell, 'PERSON'),
 (Mike Pence, 'PERSON'),
 (Sean Hannity, 'PERSON'),
 (Earlier this week, 'DATE'),
 (Steve Doocy, 'PERSON'),
 (Fox & Friends, 'ORG'),
 (Trump, 'PERSON'),
 (White House, 'ORG'),
 (MAGA, 'ORG'),
 (Masks Are Great Again, 'WORK_OF_ART'),
 (daily, 'DA

In [40]:
# Exploratory introspection: list the attribute names available on the
# first entity of the opinion article.
dir(opinion.ents[0])

['_',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_fix_dep_copy',
 '_recalculate_indices',
 '_vector',
 '_vector_norm',
 'as_doc',
 'char_span',
 'conjuncts',
 'doc',
 'end',
 'end_char',
 'ent_id',
 'ent_id_',
 'ents',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'kb_id',
 'kb_id_',
 'label',
 'label_',
 'lefts',
 'lemma_',
 'lower_',
 'merge',
 'n_lefts',
 'n_rights',
 'noun_chunks',
 'orth_',
 'remove_extension',
 'rights',
 'root',
 'sent',
 'sentiment',
 'set_extension',
 'similarity',
 'start',
 'start_char',
 'string',
 'subtree',
 'tensor',
 'text',
 'text_with_ws',
 'to_array',
 'upper_',
 'vector',
 'vector_norm'

In [110]:
# Build a token-level table for the opinion article: one row per token with
# its text, lemma, tag and part-of-speech, plus a stop-word-aware counter.
words, word_strings, lemma, tags, pos = [], [], [], [], []

for sent in opinion.sents:
    for word in sent:
        words.append(word)
        # `Token.string` is a deprecated alias that was removed in spaCy v3;
        # `Token.text_with_ws` returns the same value (the token text plus
        # any trailing whitespace) and works on both spaCy v2 and v3.
        word_strings.append(word.text_with_ws)
        lemma.append(word.lemma_)
        tags.append(word.tag_)
        pos.append(word.pos_)

words_df = pd.DataFrame({
    "word": word_strings,
    "lemma": lemma,
    "tag": tags,
    "pos": pos,
    # 1 when the lemma is NOT in spaCy's English stop-word set, 0 when it is,
    # so summing "count" tallies only non-stop-word occurrences.
    "count": [int(w not in stop_words.STOP_WORDS) for w in lemma],
})

words_df

Unnamed: 0,word,lemma,tag,pos,count
0,Wear,wear,VB,VERB,1
1,a,a,DT,DET,0
2,mask,mask,NN,NOUN,1
3,.,.,.,PUNCT,1
4,\n\n,\n\n,_SP,SPACE,1
...,...,...,...,...,...
1768,Twitter,Twitter,NNP,PROPN,1
1769,@NYTopinion,@nytopinion,NN,NOUN,1
1770,and,and,CC,CCONJ,0
1771,Instagram,Instagram,NNP,PROPN,1


In [117]:
# Sum the "count" column of the noun and verb rows per (lemma, tag) pair,
# ordered from the largest total to the smallest.
k = (
    words_df
    .loc[words_df["pos"].isin(["NOUN", "VERB"]), ["lemma", "tag", "count"]]
    .groupby(["lemma", "tag"], as_index=False)
    .sum()
    .sort_values("count", ascending=False)
)
# Display only the pairs whose total is greater than 5.
k[k["count"] > 5]

Unnamed: 0,lemma,tag,count
349,’,VBZ,18
170,mask,NN,18
171,mask,NNS,14
205,people,NNS,10
326,virus,NN,8
128,health,NN,7
335,wear,VBG,7
334,wear,VB,7
230,public,NN,6


In [130]:
# Display the parsed comment at index 12.
comments[12]

What really, really annoys me is that those exercising their "rights" have revived the image of the "ugly American."  Quite apart from stunningly poor hygiene, those traveling on US passports will be presumed to have no consideration or even manners in host foreign countries.