
# Task 2: spaCy Hello World

In [1]:
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab


In [2]:
doc = Doc(Vocab(), words = [u'Hello', u'World!'])
print(doc)

Hello World! 


In [3]:
print(type(doc))


<class 'spacy.tokens.doc.Doc'>


In [4]:
print(doc.vocab)

<spacy.vocab.Vocab object at 0x000002423171B310>


In [5]:
for token in doc:
    lexeme = doc.vocab[token.text]
    print(lexeme.text)

Hello
World!


1.The Vocab() object belongs to which class?

The Vocab() object belongs to the spacy.vocab.Vocab class. In the code provided, Vocab() is instantiated to create a vocabulary for the Doc object.

2.What is a Lexeme object?

In spaCy, a Lexeme object represents a single entry in the vocabulary. It contains information about a specific word, including its text, unique hash, and attributes like part-of-speech tags, syntactic dependencies, etc.

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'I want to learn spaCy.')
token_text1 = [token.text for token in doc]
token_text2 = [doc[i].text for i in range(len(doc))]

In [7]:
print(token_text1)

['I', 'want', 'to', 'learn', 'spaCy', '.']


In [8]:
print(token_text2)

['I', 'want', 'to', 'learn', 'spaCy', '.']


# 1.What is en_core_web_sm?

en_core_web_sm is a small English language model provided by spaCy. It's a pre-trained statistical model that includes vocabulary, syntax, named entities, word vectors, and various other linguistic annotations. This model is designed for general-purpose use, offering a balance between size and performance.

# 2.What is the size of en_core_web_sm?

The en_core_web_sm model is one of the smaller models provided by spaCy. As of my last update, its size is approximately 11-12 MB when downloaded. This size may vary slightly depending on updates or modifications made by spaCy developers.

# 3.What other variations can be used?

SpaCy offers different variations of language models based on size and features. Some other variations apart from en_core_web_sm for English include:
en_core_web_md: A medium-sized English model with more word vectors and larger vocabulary.
en_core_web_lg: A large English model with even more word vectors and a larger vocabulary, providing better performance but at a larger file size.
Additionally, there might be domain-specific or experimental models that focus on particular types of texts or tasks

In [9]:
doc = nlp(u'I want to learn spaCy.')
for i in range(len(doc)):
    print([t for t in doc[i].lefts])

[]
[I]
[]
[to]
[]
[]


In [10]:
doc = nlp(u'I want to learn spaCy.')
for i in range(len(doc)):
    print([t for t in doc[i].rights])
    print([t for t in doc[i].children])

[]
[]
[learn, .]
[I, learn, .]
[]
[]
[spaCy]
[to, spaCy]
[]
[]
[]
[]


In [11]:
from spacy import displacy

In [12]:
displacy.render(doc, style='dep')

# Draw the left and right dependencies for the sentence: I want to learn spaCy.

In [13]:
import spacy
from spacy import displacy

# Load the English model in spaCy
nlp = spacy.load('en_core_web_sm')

# Sentence
sentence = "I want to learn spaCy."

# Process the sentence
doc = nlp(sentence)

# Visualize left and right dependencies
displacy.render(doc, style='dep', options={'compact': True, 'distance': 120}, jupyter=True)


# Draw the children for the sentence: I want to learn spaCy.

In [14]:
sentence = "I want to learn spaCy."
# Process the sentence
doc = nlp(sentence)

# Retrieve children for each token in the sentence
for token in doc:
    children = [child.text for child in token.children]
    print(f"Token: {token.text} | Children: {children}")

Token: I | Children: []
Token: want | Children: ['I', 'learn', '.']
Token: to | Children: []
Token: learn | Children: ['to', 'spaCy']
Token: spaCy | Children: []
Token: . | Children: []


# Draw the left and right dependencies for the sentence: I would very much want to eat a hot dinner.

In [15]:

nlp = spacy.load('en_core_web_sm')

# Sentence
sentence = "I would very much want to eat a hot dinner."

# Process the sentence
doc = nlp(sentence)

# Visualize left and right dependencies
displacy.render(doc, style='dep', options={'compact': True, 'distance': 120}, jupyter=True)




# Present a list of all dependency grammars of your sentences above.

In [16]:
dependency_grammars = [token.dep_ for token in doc]
print(f"Dependency Grammars: {dependency_grammars}")

Dependency Grammars: ['nsubj', 'aux', 'advmod', 'advmod', 'ROOT', 'aux', 'xcomp', 'det', 'amod', 'dobj', 'punct']


# Task 3: NLTK vs spaCy Pipelines

In [23]:
nltk.download('punkt') # Sentence Tokenize
nltk.download('averaged_perceptron_tagger') # POS Tagging
nltk.download('maxent_ne_chunker') # Named Entity Chunking
nltk.download('words') # Word Tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\UMAIR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\UMAIR\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\UMAIR\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\UMAIR\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [24]:
texts = [u"We are nearing the end of the semester at Peshawar. Final exams of the Fall 2023 semester will start soon."]

In [25]:
import nltk
nltk.download('punkt')
for text in texts:
    sentences = nltk.sent_tokenize(text)
    print(sentences)

['We are nearing the end of the semester at Peshawar.', 'Final exams of the Fall 2023 semester will start soon.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\UMAIR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)

['We', 'are', 'nearing', 'the', 'end', 'of', 'the', 'semester', 'at', 'Peshawar', '.']
['Final', 'exams', 'of', 'the', 'Fall', '2023', 'semester', 'will', 'start', 'soon', '.']


In [27]:
tagged_words = nltk.pos_tag(words)
print(tagged_words)

[('Final', 'JJ'), ('exams', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Fall', 'NN'), ('2023', 'CD'), ('semester', 'NN'), ('will', 'MD'), ('start', 'VB'), ('soon', 'RB'), ('.', '.')]


In [28]:
ne_tagged_words = nltk.ne_chunk(tagged_words)
print(ne_tagged_words)

(S
  Final/JJ
  exams/NN
  of/IN
  the/DT
  Fall/NN
  2023/CD
  semester/NN
  will/MD
  start/VB
  soon/RB
  ./.)


In [30]:
from spacy import displacy
doc = nlp(u'We are nearing the end of the semester at Peshawar. Final exams of the Fall 2023 semester will start soon.')
displacy.render(doc, style='ent')

In [31]:
for ent in doc.ents:
    print(ent.text, ent.label_)

the end of the semester DATE
Peshawar GPE


# Attempt the following questions:
1. How did the Named Entity Output of the NLTK pipeline look like? Present its output.

Certainly! The Named Entity Recognition (NER) output from NLTK typically appears as a nested tree structure indicating recognized entities and their types. In the output, entities like organizations, locations, dates, etc., are tagged within a nested format, showing their positions in the text along with their respective types.

In [33]:
text = "We are nearing the end of the semester at Peshawar. Final exams of the Fall 2023 semester will start soon."
# Tokenize sentences
sentences = nltk.sent_tokenize(text)
# Tokenize words and perform POS tagging
tagged_words = []
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    tagged_words.extend(nltk.pos_tag(words))

# Perform Named Entity Recognition
ne_tagged_words = nltk.ne_chunk(tagged_words)

print(ne_tagged_words)


(S
  We/PRP
  are/VBP
  nearing/VBG
  the/DT
  end/NN
  of/IN
  the/DT
  semester/NN
  at/IN
  (ORGANIZATION Peshawar/NNP)
  ./.
  Final/JJ
  exams/NN
  of/IN
  the/DT
  Fall/NN
  2023/CD
  semester/NN
  will/MD
  start/VB
  soon/RB
  ./.)


2. How did the Named Entity Output of the spaCy pipeline look like? Present its output.

The Named Entity Recognition (NER) output of spaCy appears as identified entities with their respective labels directly tagged in the text. Each recognized entity is displayed along with its label (e.g., PERSON, ORG, DATE) in a clear, human-readable format, making it easy to understand and utilize for downstream tasks.

In [34]:
import spacy
from spacy import displacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

text = "We are nearing the end of the semester at Peshawar. Final exams of the Fall 2023 semester will start soon."
doc = nlp(text)

# Display the Named Entities using displacy
displacy.render(doc, style='ent')

# Extract and print the entities and their labels
for ent in doc.ents:
    print(ent.text, ent.label_)


the end of the semester DATE
Peshawar GPE


In [35]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# Attempt the following questions:
1. What is the default pipeline structure of spaCy?

The default pipeline structure of spaCy can be seen as a set of steps a language expert follows to understand a text. It starts by breaking the text into smaller parts (tokens), figures out what each word means (part-of-speech tagging), understands how words relate to each other (dependency parsing), identifies important named things like people or places (named entity recognition), and sometimes assigns categories to whole pieces of text (text categorization).

In [36]:
nlp = spacy.load('en_core_web_sm',disable=['parser'])

In [39]:
from spacy.language import Language

@Language.component("my_component")
def my_component(doc):
    # Do something to the doc here
    return doc


In [40]:
nlp.add_pipe("my_component")

<function __main__.my_component(doc)>

# Task 4: Finding Patterns in Sentences

In [42]:
doc = nlp(u'I want to learn spaCy.')
displacy.render(doc, style='dep')

https://spacy.io/usage/models


In [43]:
doc = nlp(u'How do I learn spaCy.')
displacy.render(doc, style='dep')

In [47]:
import spacy
nlp = spacy.load('en_core_web_sm')
def dep_pattern(doc):
    for i in range(len(doc)-1):
        print(doc[i].dep_)
        if doc[i].dep_ == 'nsubj' and doc[i+1].dep_ == 'ROOT' and doc[i+2].dep_ =='acomp':
            return True
    return False
doc = nlp(u'How do I learn spaCy.')
if dep_pattern(doc):
    print('Found')
else:
    print('Not found')

advmod
aux
nsubj
Found
