A notebook experimenting with Natural Language Processing in Python

For now I am using spaCy and following this online resource: https://course.spacy.io/en/chapter1

In [1]:
import spacy

In [2]:
# Start from a blank "en" pipeline; the cells below only rely on it for
# tokenization and per-token lexical flags (is_alpha, is_punct, like_num).
nlp = spacy.blank("en")

In [3]:
# Run the text through the pipeline, then show every token on its own line.
doc = nlp("Hello world!")
print("\n".join(token.text for token in doc))

Hello
world
!


In [4]:
# Tokenize a longer sentence and print one token per line.
# NOTE(review): "takens" looks like a typo for "tokens"; it is kept as typed
# so the recorded output below stays accurate.
doc2 = nlp("how many takens are in this sentence")
for position in range(len(doc2)):
    print(doc2[position].text)

how
many
takens
are
in
this
sentence


In [5]:
doc = nlp("Hello World!")

# A Doc can be indexed like a list; position 1 is the second token.
token = doc[1]

print(token.text)

World


In [7]:
# Slice the Doc from the previous cell into a Span covering tokens 0 and 1
# (the end index is exclusive). The earlier `doc[1:3]` assignment was dead
# code: it was overwritten before ever being used.
span = doc[0:2]
print(span)

Hello World


In [8]:
doc = nlp("It costs $5.")

# Position and surface text of every token.
print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])

# Lexical flags, one labelled row each; the attribute is looked up by name.
flag_rows = (
    ("is_alpha:", "is_alpha"),
    ("is_punct:", "is_punct"),
    ("like_num:", "like_num"),
)
for label, attribute in flag_rows:
    print(label, [getattr(token, attribute) for token in doc])

Index:    [0, 1, 2, 3, 4]
Text:     ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


In [9]:
doc = nlp("I like tree kangaroos and narwhals.")

# Index 0 selects the first token of the Doc.
first_token = doc[0]

print(first_token.text)

I


In [10]:
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by a "%" token.
for token in doc:
    # Only number-like tokens can start a percentage
    if not token.like_num:
        continue
    # A number-like token at the very end of the doc has no successor;
    # indexing doc[token.i + 1] there would raise IndexError, so skip it.
    if token.i + 1 >= len(doc):
        continue
    # Get the next token in the document
    next_token = doc[token.i + 1]
    # Check if the next token's text equals "%"
    if next_token.text == "%":
        print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


Note: the en_core_web_sm model has to be downloaded before the next cells will work. From the command line, run: python -m spacy download en_core_web_sm

In [11]:
# Rebind `nlp` from the blank pipeline to the loaded "en_core_web_sm" pipeline;
# the cells below read its predictions (pos_, dep_, head, ents).
nlp = spacy.load("en_core_web_sm")

In [12]:
# Process a short sentence and list each token next to its part-of-speech label.
doc = nlp("She ate a pizza")
for word, tag in ((token.text, token.pos_) for token in doc):
    print(word, tag)



She PRON
ate VERB
a DET
pizza NOUN


In [19]:
# One row per token of the doc from the previous cell: text, part of speech,
# dependency label, and the text of the token's head.
rows = [(token.text, token.pos_, token.dep_, token.head.text) for token in doc]
for row in rows:
    print(*row)

She PRON nsubj ate
ate VERB ROOT ate
a DET det pizza
pizza NOUN dobj ate


In [25]:
# Run the pipeline over the sentence.
doc = nlp("Apple is in the United States and valued over 1 trillion")

# Report each predicted entity as "<text> <label>", one per line.
for ent_text, ent_label in [(ent.text, ent.label_) for ent in doc.ents]:
    print(ent_text, ent_label)

Apple ORG
the United States GPE
over 1 trillion MONEY


In [22]:
# Look up the human-readable description of the "GPE" entity label seen in the output above.
spacy.explain("GPE")

'Countries, cities, states'

In [33]:
# Same lookup for the "DET" part-of-speech tag printed by the tagging cell.
spacy.explain("DET")

'determiner'

In [26]:
# Headline processed here; the resulting `doc` is reused by the entity cell that follows.
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Print the document text (doc.text reproduces the input string, per the output below)
print(doc.text)

It’s official: Apple is the first U.S. public company to reach a $1 trillion market value


In [27]:
# Entities predicted for the headline processed in the previous cell,
# rendered as "<text> <label>" lines.
for line in (f"{ent.text} {ent.label_}" for ent in doc.ents):
    print(line)

Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [31]:
# Headline where the recorded output shows only "Apple" being tagged.
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)

# Report whatever entities the model did predict.
for label, surface in [(ent.label_, ent.text) for ent in doc.ents]:
    print(surface, label)

# "iPhone X" sits at token positions 1 and 2, so slice it out by hand.
iphone_x = doc[1:3]
print("Missing entity:", iphone_x.text)

Apple ORG
Missing entity: iPhone X


In [32]:
# Example Matcher token patterns; each list holds one dict per token to match.
# The cell only evaluates these literals and does not bind them to names.
# Two tokens whose exact text is "iPhone" and "X".
[{"TEXT": "iPhone"}, {"TEXT": "X"}]
# Two tokens whose lowercase form is "iphone" and "x".
[{"LOWER": "iphone"}, {"LOWER": "x"}]
# A token with lemma "buy" followed by a token tagged as a noun.
[{"LEMMA": "buy"}, {"POS": "NOUN"}]

In [36]:
# Import the Matcher
from spacy.matcher import Matcher

# One dict per token to match: exact text "iPhone" followed by exact text "X".
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# The matcher is built on the pipeline's vocab; add() takes a key and a list of patterns.
matcher = Matcher(nlp.vocab)
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text and run the matcher over it
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

# `matches` holds (match_id, start, end) tuples. Looping handles any number
# of matches, including none, rather than assuming there is exactly one.
for match_id, start, end in matches:
    print(doc[start:end].text)

# The same span sliced by hand, for comparison with the matcher's result.
print(doc[1:3])

iPhone X
iPhone X


In [39]:
# Tokens to match, in order: a digit token, then "fifa", "world", "cup"
# compared on their lowercase form, then a punctuation token.
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]

doc = nlp("2018 FIFA World Cup: France won!")

# Use a fresh Matcher for this cell so the result does not depend on patterns
# registered by earlier cells, and re-running the cell does not keep
# extending a shared matcher.
matcher = Matcher(nlp.vocab)
matcher.add("FIFA_PATTERN", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

# Each match is a (match_id, start, end) tuple; iterate so every match is shown.
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

2018 FIFA World Cup:


In [43]:
# A verb whose lemma is "love" (so "loved" and "love" both qualify),
# immediately followed by a noun.
pattern = [
    {"LEMMA": "love", "POS": "VERB"},
    {"POS": "NOUN"}
]

doc = nlp("I loved dogs but now I love cats more.")

# Use a fresh Matcher for this cell so the result does not depend on patterns
# registered by earlier cells, and re-running the cell does not keep
# extending a shared matcher.
matcher = Matcher(nlp.vocab)
matcher.add("PET_PATTERN", [pattern])

# Call the matcher on the doc
matches = matcher(doc)

# Each match is a (match_id, start, end) tuple; print the text of every matched span.
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

loved dogs
love cats


In [44]:
# A token with lemma "buy", an optional determiner, then a noun.
pattern = [
    {"LEMMA": "buy"},
    {"POS": "DET", "OP": "?"},  # optional: match 0 or 1 times
    {"POS": "NOUN"}
]

doc = nlp("I bought a smartphone. Now I'm buying apps.")

# Use a fresh Matcher for this cell so the result does not depend on patterns
# registered by earlier cells, and re-running the cell does not keep
# extending a shared matcher.
matcher = Matcher(nlp.vocab)
matcher.add("PHONE_PATTERN", [pattern])

matches = matcher(doc)

# Each match is a (match_id, start, end) tuple; print the text of every matched span.
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

bought a smartphone
buying apps


In [None]:
# Exercise: find "iPhone X" with the rule-based Matcher.
# The template's `____` placeholders are filled in so the cell runs instead of
# failing at the import.
import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher (add() takes a key and a list of patterns)
matcher.add("IPHONE_X_PATTERN", [pattern])

# Use the matcher on the doc; each match is a (match_id, start, end) tuple
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])