In [None]:
import markdown

# My Jupyter Notebook

## Installing necessary packages

In [None]:
# Installing packages
#!pip install spacy
#!pip install markdown
#!python -m spacy download en_core_web_sm

## Importing spacy and pandas

In [1]:
import spacy
import pandas as pd

## Reading wiki_sentences dataset 
<a href="https://github.com/phgunawan/Latihan-ML/blob/master/wiki_sentences_v2.csv">Visit source</a>


In [24]:
candidate_sentences = pd.read_csv("wiki_sentences_v2.csv")

In [3]:
# Getting the size of pandas dataframe and looking at the first few rows for sanity check
candidate_sentences.shape
candidate_sentences['sentence'].sample(5)

1749    some bollywood films have been widely apprecia...
2541    m. night shyamalan has been asked numerous tim...
623     a month later, zombie officially disbanded whi...
1854    double super 8 film  is a 16 mm wide film but ...
843     each year the afi awards honor the ten outstan...
Name: sentence, dtype: object

### Loading en_core_web_sm, which is a small English pipeline trained on written web text (blogs, news, comments), that includes vocabulary, syntax and entities.

In [4]:
nlp = spacy.load('en_core_web_sm')
doc = nlp("the drawdown process is governed by astm standard d823")
for tok in doc:
  print(tok.text, "...", tok.dep_)

the ... det
drawdown ... compound
process ... nsubjpass
is ... auxpass
governed ... ROOT
by ... agent
astm ... compound
standard ... compound
d823 ... pobj


## Spacy.explain

In [8]:
spacy.explain("auxpass")

'auxiliary (passive)'

## Accessing token attributes

In [81]:
doc = nlp("Airstrikes continued into the early hours of Monday morning in Gaza.")
# Token texts
[token.text for token in doc]

['Airstrikes',
 'continued',
 'into',
 'the',
 'early',
 'hours',
 'of',
 'Monday',
 'morning',
 'in',
 'Gaza',
 '.']

## Accessing spans

In [76]:
doc = nlp("I argue that states’ built environments of conflict are material manifestations of state power that certain, targeted citizens experience as violence. ")
span = doc[2:4]
span.text

'that states'

## Creating a span manually


In [71]:
from spacy.tokens import Span
# Create a Doc object
doc = nlp("Airstrikes continued into the early hours of Monday morning in Gaza.")
span = Span(doc, 4, 11, label="NORP")
span.text

'early hours of'

## Part-of-speech tags

In [75]:
doc = nlp("Airstrikes continued into the early hours of Monday morning in Gaza.")
# Coarse-grained part-of-speech tags
[token.pos_ for token in doc]
# Fine-grained part-of-speech tags
[token.tag_ for token in doc]

['NNS', 'VBD', 'IN', 'DT', 'JJ', 'NNS', 'IN', 'NNP', 'NN', 'IN', 'NNP', '.']

In [72]:
spacy.explain("NNP")

'noun, proper singular'

## Syntactic dependencies

In [53]:
doc = nlp("Airstrikes continued into the early hours of Monday morning in Gaza.")
# Dependency labels
[token.dep_ for token in doc]
[token.head.text for token in doc]

['continued',
 'continued',
 'continued',
 'hours',
 'hours',
 'into',
 'hours',
 'morning',
 'of',
 'continued',
 'in',
 'continued']

## Named entities
#### <font color='red'> Bias within NLP labelling </font> 
##### Python library recognizing Israel's legitimacy as a state without doing the same for Palestine 

In [51]:
doc = nlp("Israel's new plan is to 'shrink,' not solve, the Palestinian conflict.")
# Text and label of named entity span
[(ent.text, ent.label_) for ent in doc.ents]

[('Israel', 'GPE'), ('Palestinian', 'NORP')]

In [None]:
#spacy.explain("NORP") -->'Nationalities or religious or political groups'
#spacy.explain("GPE") --> 'Countries, cities, states'

## Syntax iterators -- Sentences

In [49]:
doc = nlp("This is more than just eavesdropping, it’s terrifying. The spyware takes complete control over the phone. It can make calls to anybody, send messages and it can download content,” Aboudi told Al Jazeera.")
# doc.sents is a generator that yields sentence spans
[sent.text for sent in doc.sents]

['This is more than just eavesdropping, it’s terrifying.',
 'The spyware takes complete control over the phone.',
 'It can make calls to anybody, send messages and it can download content,” Aboudi told Al Jazeera.']

## Base noun phrases

In [48]:
doc = nlp("Ghassan Halaika, a Jerusalem-based researcher with Al Haq, recently noticed strange things happening with his phone.")
# doc.noun_chunks is a generator that yields spans
[chunk.text for chunk in doc.noun_chunks]

['Ghassan Halaika',
 'a Jerusalem-based researcher',
 'Al Haq',
 'strange things',
 'his phone']

## Label Explanations

In [52]:
#Exploring label explanations in spacy
spacy.explain("NORP")
# 'Nationalities or religious or political groups'
#  spacy.explain("GPE")
# 'Countries, cities, states'

'Nationalities or religious or political groups'

# Visualizing dependencies from various news sources


## Importing displacy for visualizing sentence structure

In [38]:
from spacy import displacy

## Sentence from Al Jazeera article
<a href="https://www.aljazeera.com/news/2021/11/14/palestinian-rights-activists-defiant-over-israeli-spyware-hacks">Palestinian rights activists defiant over Israeli spyware hacks</a>


In [44]:
doc = nlp("The rights groups deny any links to the PFLP and Israel has failed to publicly release any evidence to substantiate its claims.")
displacy.render(doc, style="dep")

## Sentence from NYT article
<a href="https://www.nytimes.com/2021/05/15/world/middleeast/israel-palestinian-conflict-gaza.html">Conflict Spirals Across Israel and the Palestinian Territories</a>


In [41]:
doc = nlp("An American envoy landed in Israel for cease-fire talks with Palestinians and Israelis. But as of Saturday evening, there was no sign of a letup in fighting and a media center in Gaza was destroyed.")
displacy.render(doc, style="dep")

## Visualize named entities

In [80]:
doc = nlp("Internment, Torture and Pro-government Militia in Northern Ireland (with Sarah K. Dreier and Dani Villa, and supported by Michael McCann and Noah Smith (co-PIs))")
displacy.render(doc, style="ent")