In [5]:
import requests
from bs4 import BeautifulSoup
from gensim.summarization import summarize

In [6]:
url = 'https://www.npr.org/2021/04/28/991683886/frightened-to-death-cheerleader-speech-case-gives-supreme-court-pause'

In [7]:
page = requests.get(url).text

In [8]:
soup = BeautifulSoup(page)

In [9]:
header = soup.find('h1').text

In [10]:
date = soup.find('span', attrs={'class':'date'}).text

In [11]:
p_tags = soup.find_all('p')
p_tags

[<p class="byline__name byline__name--block">
 <a data-metrics='{"action":"Click Byline","category":"Story Metadata"}' href="https://www.npr.org/people/2101289/nina-totenberg" rel="author">
       Nina Totenberg
     </a>
 </p>,
 <p>
                 Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.
                 
                 <b aria-label="Image credit" class="credit">
                     
                     Danna Singer/ACLU/AP
                     
                 </b>
 <b class="hide-caption"><b>hide caption</b></b>
 </p>,
 <p class="caption">Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.</p>,
 <p>Facing its biggest student speech case in a half-century, the Supreme Court seemed to be looking for a narrow exit door on Wednesday.</p>,
 <p>At issu

In [12]:
text = [tags.text.strip() for tags in p_tags]
text

['Nina Totenberg',
 'Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.\n                \n                \n                    \n                    Danna Singer/ACLU/AP\n                    \n                \nhide caption',
 'Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.',
 'Facing its biggest student speech case in a half-century, the Supreme Court seemed to be looking for a narrow exit door on Wednesday.',
 'At issue was whether schools may punish students for speech that occurs online and off-campus but that may affect school order.',
 'The case has been billed as the most important student speech case since 1969. That landmark ruling came at the height of the Vietnam War. Mary Beth Tinker and four other students went to court after they were suspended f

In [13]:
sentence_list = [sentence for sentence in text if not '\n' in sentence]

In [14]:
sentence_list = [sentence for sentence in sentence_list if '.' in sentence]

In [15]:
article = ''.join(sentence_list)

In [16]:
summary = summarize(article)

In [17]:
summary

'Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.Facing its biggest student speech case in a half-century, the Supreme Court seemed to be looking for a narrow exit door on Wednesday.At issue was whether schools may punish students for speech that occurs online and off-campus but that may affect school order.The case has been billed as the most important student speech case since 1969.\nThat landmark ruling came at the height of the Vietnam War. Mary Beth Tinker and four other students went to court after they were suspended for wearing black armbands to school to protest the war.By a vote of 7 to 2, the high court ruled at the time for the first time that kids do have First Amendment free speech rights at school, unless school officials reasonably forecast it will cause disruptions.Wednesday\'s case did not involve such serious speech.\nAnd on Wednesday, the Supreme Court f

# Tokenization

In [2]:
import spacy

### Load the language model

In [3]:
nlp = spacy.load('en_core_web_sm')

<img src = 'nlp.png'>

In [18]:
doc = nlp(summary)

In [44]:
spacy.explain?

In [19]:
for token in doc:
    print(token.text, '--->', spacy.explain(token.text))

Brandi ---> None
Levy ---> None
wears ---> None
her ---> None
former ---> None
cheerleading ---> None
outfit ---> None
as ---> None
she ---> None
looks ---> None
at ---> None
her ---> None
mobile ---> None
phone ---> None
while ---> None
sitting ---> None
outside ---> None
Mahanoy ---> None
Area ---> None
High ---> None
School ---> None
in ---> None
Mahanoy ---> None
City ---> None
, ---> punctuation mark, comma
Pa. ---> None
, ---> punctuation mark, comma
on ---> None
April ---> None
4.Facing ---> None
its ---> None
biggest ---> None
student ---> None
speech ---> None
case ---> case marking
in ---> None
a ---> None
half ---> None
- ---> None
century ---> None
, ---> punctuation mark, comma
the ---> None
Supreme ---> None
Court ---> None
seemed ---> None
to ---> None
be ---> None
looking ---> None
for ---> None
a ---> None
narrow ---> None
exit ---> None
door ---> None
on ---> None
Wednesday ---> None
. ---> punctuation mark, sentence closer
At ---> None
issue ---> None
was ---> None
w

' not found in glossary. It may however be explained in documentation for the corpora used to train the language. Please check `nlp.meta["sources"]` for any relevant links.


# POS tagging

In [20]:
for token in doc:
    print(token.text, '--->', token.pos_,'--->', spacy.explain(token.pos_))

Brandi ---> PROPN ---> proper noun
Levy ---> PROPN ---> proper noun
wears ---> VERB ---> verb
her ---> PRON ---> pronoun
former ---> ADJ ---> adjective
cheerleading ---> NOUN ---> noun
outfit ---> NOUN ---> noun
as ---> SCONJ ---> subordinating conjunction
she ---> PRON ---> pronoun
looks ---> VERB ---> verb
at ---> ADP ---> adposition
her ---> PRON ---> pronoun
mobile ---> ADJ ---> adjective
phone ---> NOUN ---> noun
while ---> SCONJ ---> subordinating conjunction
sitting ---> VERB ---> verb
outside ---> ADP ---> adposition
Mahanoy ---> PROPN ---> proper noun
Area ---> PROPN ---> proper noun
High ---> PROPN ---> proper noun
School ---> PROPN ---> proper noun
in ---> ADP ---> adposition
Mahanoy ---> PROPN ---> proper noun
City ---> PROPN ---> proper noun
, ---> PUNCT ---> punctuation
Pa. ---> PROPN ---> proper noun
, ---> PUNCT ---> punctuation
on ---> ADP ---> adposition
April ---> PROPN ---> proper noun
4.Facing ---> VERB ---> verb
its ---> PRON ---> pronoun
biggest ---> ADJ ---> adj

effectively ---> ADV ---> adverb
carry ---> VERB ---> verb
the ---> DET ---> determiner
schoolhouse ---> NOUN ---> noun
on ---> ADP ---> adposition
their ---> PRON ---> pronoun
backs ---> NOUN ---> noun
in ---> ADP ---> adposition
terms ---> NOUN ---> noun
of ---> ADP ---> adposition
speech ---> NOUN ---> noun
rights ---> NOUN ---> noun
everywhere ---> ADV ---> adverb
they ---> PRON ---> pronoun
go ---> VERB ---> verb
. ---> PUNCT ---> punctuation
"But ---> NOUN ---> noun
, ---> PUNCT ---> punctuation
the ---> DET ---> determiner
justices ---> NOUN ---> noun
wanted ---> VERB ---> verb
to ---> PART ---> particle
know ---> VERB ---> verb
, ---> PUNCT ---> punctuation
what ---> PRON ---> pronoun
about ---> ADP ---> adposition
cases ---> NOUN ---> noun
of ---> ADP ---> adposition
harassment ---> NOUN ---> noun
and ---> CCONJ ---> coordinating conjunction
bullying ---> NOUN ---> noun
? ---> PUNCT ---> punctuation


# Dependency parsing

In [21]:
for token in doc:
    print(token.text, '--->', token.dep_, '--->', spacy.explain(token.dep_))

Brandi ---> compound ---> compound
Levy ---> nsubj ---> nominal subject
wears ---> ROOT ---> None
her ---> poss ---> possession modifier
former ---> amod ---> adjectival modifier
cheerleading ---> compound ---> compound
outfit ---> dobj ---> direct object
as ---> mark ---> marker
she ---> nsubj ---> nominal subject
looks ---> advcl ---> adverbial clause modifier
at ---> prep ---> prepositional modifier
her ---> poss ---> possession modifier
mobile ---> amod ---> adjectival modifier
phone ---> pobj ---> object of preposition
while ---> mark ---> marker
sitting ---> advcl ---> adverbial clause modifier
outside ---> prep ---> prepositional modifier
Mahanoy ---> compound ---> compound
Area ---> compound ---> compound
High ---> compound ---> compound
School ---> pobj ---> object of preposition
in ---> prep ---> prepositional modifier
Mahanoy ---> compound ---> compound
City ---> pobj ---> object of preposition
, ---> punct ---> punctuation
Pa. ---> appos ---> appositional modifier
, ---> pu



# Lemmatization

In [22]:
for token in doc:
    print(token.text, '--->', token.lemma_)

Brandi ---> Brandi
Levy ---> Levy
wears ---> wear
her ---> her
former ---> former
cheerleading ---> cheerleading
outfit ---> outfit
as ---> as
she ---> she
looks ---> look
at ---> at
her ---> her
mobile ---> mobile
phone ---> phone
while ---> while
sitting ---> sit
outside ---> outside
Mahanoy ---> Mahanoy
Area ---> Area
High ---> High
School ---> School
in ---> in
Mahanoy ---> Mahanoy
City ---> City
, ---> ,
Pa. ---> Pa.
, ---> ,
on ---> on
April ---> April
4.Facing ---> 4.face
its ---> its
biggest ---> big
student ---> student
speech ---> speech
case ---> case
in ---> in
a ---> a
half ---> half
- ---> -
century ---> century
, ---> ,
the ---> the
Supreme ---> Supreme
Court ---> Court
seemed ---> seem
to ---> to
be ---> be
looking ---> look
for ---> for
a ---> a
narrow ---> narrow
exit ---> exit
door ---> door
on ---> on
Wednesday ---> Wednesday
. ---> .
At ---> at
issue ---> issue
was ---> be
whether ---> whether
schools ---> school
may ---> may
punish ---> punish
students ---> studen

# Sentence boundary detection

In [23]:
sentences = list(doc.sents)

In [24]:
for sentence in sentences:
    print(sentence)

Brandi Levy wears her former cheerleading outfit as she looks at her mobile phone while sitting outside Mahanoy Area High School in Mahanoy City, Pa., on April 4.Facing its biggest student speech case in a half-century, the Supreme Court seemed to be looking for a narrow exit door on Wednesday.
At issue was whether schools may punish students for speech that occurs online and off-campus but that may affect school order.
The case has been billed as the most important student speech case since 1969.

That landmark ruling came at the height of the Vietnam War.
Mary Beth Tinker and four other students went to court after they were suspended for wearing black armbands to school to protest the war.
By a vote of 7 to 2, the high court ruled at the time for the first time that kids do have First Amendment free speech rights at school, unless school officials reasonably forecast it will cause disruptions.
Wednesday's case did not involve such serious speech.

And on Wednesday, the Supreme Court

# Named entity 

In [25]:
for ent in doc.ents:
    print(ent.text, '---->', ent.label_, '---->', spacy.explain(ent.label_))

Brandi Levy ----> PERSON ----> People, including fictional
Mahanoy Area High School in Mahanoy City ----> ORG ----> Companies, agencies, institutions, etc.
Pa. ----> GPE ----> Countries, cities, states
April 4.Facing ----> DATE ----> Absolute or relative dates or periods
half-century ----> DATE ----> Absolute or relative dates or periods
the Supreme Court ----> ORG ----> Companies, agencies, institutions, etc.
Wednesday ----> DATE ----> Absolute or relative dates or periods
1969 ----> DATE ----> Absolute or relative dates or periods
the Vietnam War ----> EVENT ----> Named hurricanes, battles, wars, sports events, etc.
Mary Beth Tinker ----> PERSON ----> People, including fictional
four ----> CARDINAL ----> Numerals that do not fall under another type
7 ----> CARDINAL ----> Numerals that do not fall under another type
2 ----> CARDINAL ----> Numerals that do not fall under another type
first ----> ORDINAL ----> "first", "second", etc.
First Amendment ----> LAW ----> Named documents made 

# Visualization

In [26]:
from spacy import displacy

In [27]:
displacy.render(doc, style='ent', jupyter=True)

In [43]:
displacy.render(doc, style='dep', jupyter=True)

# Putting it all together

In [29]:
lis = []

In [30]:
for token in doc:
    dic = {}
    dic['Token'] = token.text
    dic['POS'] = token.pos_
    dic['Tags'] = token.tag_
    dic['Dep'] = token.dep_
    dic['Explanation'] = spacy.explain(token.tag_)
    lis.append(dic)

In [31]:
import pandas as pd

In [32]:
data = pd.DataFrame(lis)

In [36]:
data

Unnamed: 0,Token,POS,Tags,Dep,Explanation
0,Brandi,PROPN,NNP,compound,"noun, proper singular"
1,Levy,PROPN,NNP,nsubj,"noun, proper singular"
2,wears,VERB,VBZ,ROOT,"verb, 3rd person singular present"
3,her,PRON,PRP$,poss,"pronoun, possessive"
4,former,ADJ,JJ,amod,"adjective (English), other noun-modifier (Chin..."
...,...,...,...,...,...
359,of,ADP,IN,prep,"conjunction, subordinating or preposition"
360,harassment,NOUN,NN,pobj,"noun, singular or mass"
361,and,CCONJ,CC,cc,"conjunction, coordinating"
362,bullying,NOUN,NN,conj,"noun, singular or mass"


In [33]:
l = []
for ent in doc.ents:
    d = {}
    d['entities'] = ent.text
    d['labels'] = ent.label_
    d['Explanation'] = spacy.explain(ent.label_)
    l.append(d)

In [34]:
entities = pd.DataFrame(l)

In [35]:
entities

Unnamed: 0,entities,labels,Explanation
0,Brandi Levy,PERSON,"People, including fictional"
1,Mahanoy Area High School in Mahanoy City,ORG,"Companies, agencies, institutions, etc."
2,Pa.,GPE,"Countries, cities, states"
3,April 4.Facing,DATE,Absolute or relative dates or periods
4,half-century,DATE,Absolute or relative dates or periods
5,the Supreme Court,ORG,"Companies, agencies, institutions, etc."
6,Wednesday,DATE,Absolute or relative dates or periods
7,1969,DATE,Absolute or relative dates or periods
8,the Vietnam War,EVENT,"Named hurricanes, battles, wars, sports events..."
9,Mary Beth Tinker,PERSON,"People, including fictional"


In [37]:
import docx

In [38]:
doc = docx.Document()

In [39]:
doc.add_heading(header)

<docx.text.paragraph.Paragraph at 0x25af08e0470>

In [41]:
doc.add_picture('img.jpg')

<docx.shape.InlineShape at 0x25af08e0048>

In [44]:
doc.add_paragraph(summary)

<docx.text.paragraph.Paragraph at 0x25af09124e0>

In [50]:
doc.save('last1.docx')

In [49]:
doc.add_paragraph(date)

<docx.text.paragraph.Paragraph at 0x25af08de048>

In [None]:
doc.a