# Chapter 1: Finding Words, Phrases, Names, and Concepts

## Intro to spaCy

In [1]:
# Import spaCy
import spacy

# Create a blank English nlp object. Later cells call `nlp(text)` to turn a
# string into a Doc, so this cell has to run before them.
nlp = spacy.blank("en")

The `nlp` object:
- contains the processing pipeline
- includes language-specific rules for tokenization etc.

### Doc object

In [2]:
# A Doc is created by processing a string of text with the nlp object
# (`nlp` is the blank English pipeline created in the first cell).
doc = nlp("Hello world!")

# Iterating over a Doc yields its tokens in order; punctuation such as "!"
# comes out as a token of its own.
for token in doc:
    print(token.text)

Hello
world
!


### Token object

In [3]:
# Re-create the Doc so this cell does not depend on `doc` from an earlier cell
doc = nlp("Hello world!")

# Index into the Doc to get a single Token.
# Indices are zero-based, so doc[1] is the second token ("world").
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

world


### Span object

In [4]:
# Re-create the Doc so this cell does not depend on `doc` from an earlier cell
doc = nlp("Hello world!")

# A slice from the Doc is a Span object.
# The end index is exclusive: doc[1:3] covers tokens 1 and 2 ("world", "!").
span = doc[1:3]

# Get the span text via the .text attribute (prints "world!")
print(span.text)

world!


### Lexical attributes

In [5]:
doc = nlp("It costs $5.")

# (label, Token attribute name) pairs. The labels are padded to the same
# width so the printed lists line up underneath each other.
columns = (
    ("Index:   ", "i"),
    ("Text:    ", "text"),
    ("is_alpha:", "is_alpha"),
    ("is_punct:", "is_punct"),
    ("like_num:", "like_num"),
)

# Print one row per attribute, holding that attribute's value for every token
for label, attr in columns:
    print(label, [getattr(tok, attr) for tok in doc])

Index:    [0, 1, 2, 3, 4]
Text:     ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


## Getting Started

In [8]:
import spacy

# (language code, sample sentence) pairs: English, German, Spanish.
# NOTE(review): the German and Spanish samples look off ("Sie aß die Pizza."
# and "Ella comió la pizza." would be the usual forms) -- left unchanged here
# so the recorded output below still matches; confirm before correcting.
samples = [
    ("en", "She ate the pizza"),
    ("de", "Sie zog das Pizza."),
    ("es", "Ella comio la pizza."),
]

for lang_code, text in samples:
    # Load a blank pipeline for this language under a cell-local name.
    # Rebinding the notebook-wide `nlp` here would leave it pointing at the
    # Spanish pipeline, and English cells run afterwards would silently use it.
    blank_nlp = spacy.blank(lang_code)
    doc = blank_nlp(text)
    print(doc.text)

She ate the pizza
Sie zog das Pizza.
Ella comio la pizza.


## Documents, spans and tokens  

In [11]:
import spacy

nlp = spacy.blank("en")
doc = nlp("I like tree kangaroos and narwhals.")

# An integer index returns a single Token
first_token = doc[0]

# Slices return Span objects; the end index is exclusive
tree_kangaroos = doc[2:4]               # "tree kangaroos"
tree_kangaroos_and_narwhals = doc[2:6]  # "tree kangaroos and narwhals"

# Show the text of each piece on its own line
print(
    first_token.text,
    tree_kangaroos.text,
    tree_kangaroos_and_narwhals.text,
    sep="\n",
)

I
tree kangaroos
tree kangaroos and narwhals


## Lexical attributes

In [12]:
# Process the text (`nlp` is the blank English pipeline created in an earlier cell)
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens
for token in doc:
    # Only a number-like token that has a following token can be a percentage.
    # The bounds check avoids an IndexError when a text ends with a number.
    if token.like_num and token.i + 1 < len(doc):
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


## Trained pipelines
What are trained pipelines?
- Models that enable spaCy to predict linguistic attributes in context
    - Part-of-speech tags
    - Syntactic dependencies
    - Named entities
- Trained on labeled example texts
- Can be updated with more examples to fine-tune predictions