In [3]:
import spacy

# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Sample text to tokenize
string = "Hello! I don't know what I'm doing here."

# Run the pipeline over the text to obtain a Doc
doc = nlp(string)

# Collect and print the text of every token in the Doc
tokens = [tok.text for tok in doc]
print(tokens)


['Hello', '!', 'I', 'do', "n't", 'know', 'what', 'I', "'m", 'doing', 'here', '.']


### Lemmatization

- Convert each word into its base form (its lemma)
    - `reducing`, `reduces`, `reduced`, `reduction` -> `reduce`
    - `am`, `are`, `is` -> `be`
    - `n't` -> `not`
    - `'ve` -> `have`


In [4]:
# Lemmatize with spaCy: take the base form of each token in the Doc
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)

['hello', '!', 'I', 'do', 'not', 'know', 'what', 'I', 'be', 'do', 'here', '.']


In [6]:
import spacy

# Reload the small English pipeline
nlp = spacy.load('en_core_web_sm')

# Parse the opening of the Gettysburg Address into a Doc
# (adjacent literals are concatenated into one string)
doc = nlp(
    "Four score and seven years ago our fathers brought forth on this continent, "
    "a new nation, conceived in Liberty, and dedicated to the proposition that"
)

# Base form of every token
lemmas = [tok.lemma_ for tok in doc]

# Print the lemmas as one space-separated string
print(' '.join(lemmas))

four score and seven year ago our father bring forth on this continent , a new nation , conceive in Liberty , and dedicate to the proposition that


### Text cleaning techniques
- Unnecessary whitespace and escape sequences
- Punctuation
- Special characters (numbers, emojis, etc.)
- Stopwords

In [None]:
# str.isalpha() - True only when the string is non-empty and every character
# is alphabetic; any digit or punctuation mark makes it False.
# Only the last expression's value is displayed as the cell output.

"dog".isalpha() # True: letters only

"3dogs".isalpha() # False: contains a digit

"123".isalpha() # False: digits only

"!".isalpha() # False: punctuation

"?".isalpha() # False: punctuation

False

In [8]:
# stopwords
# Import the set explicitly: the attribute chain `spacy.lang.en.stop_words`
# only resolves when the `spacy.lang.en` submodule has already been imported
# as a side effect of something else (e.g. loading the English model).
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = STOP_WORDS

# remove stopwords and non-alphabetical tokens
# STOP_WORDS entries are lower-case, so lower-case each lemma before the lookup;
# otherwise capitalised lemmas such as "I" slip through the filter.
a_lemmas = [lemma for lemma in lemmas
            if lemma.isalpha() and lemma.lower() not in stopwords]

# print string after text cleaning
print(' '.join(a_lemmas))

score seven year ago father bring forth continent new nation conceive Liberty dedicate proposition


In [None]:
# Load model and create Doc object
# NOTE(review): `blog` is not defined in the visible cells - presumably it holds
# the raw blog text and is provided elsewhere; confirm before running.
nlp = spacy.load('en_core_web_sm')
doc = nlp(blog)

# Generate lemmatized tokens
lemmas = [token.lemma_ for token in doc]

# Remove stopwords and non-alphabetic tokens
# The stopword collection is lower-case, so compare the lower-cased lemma;
# a case-sensitive lookup lets capitalised lemmas such as "I" through.
a_lemmas = [lemma for lemma in lemmas
            if lemma.isalpha() and lemma.lower() not in stopwords]

# Print string after text cleaning
print(' '.join(a_lemmas))

In [None]:
# Function to preprocess text
def preprocess(text):
    """Lemmatize ``text`` and drop stopwords and non-alphabetic lemmas.

    Uses the module-level ``nlp`` pipeline and ``stopwords`` collection.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # Create Doc object; NER and the parser are skipped because only lemmas are used
    doc = nlp(text, disable=['ner', 'parser'])
    # Generate lemmas
    lemmas = [token.lemma_ for token in doc]
    # Remove stopwords and non-alphabetic lemmas. The stopword collection is
    # lower-case, so compare the lower-cased lemma; a case-sensitive lookup
    # lets capitalised lemmas such as "I" through.
    a_lemmas = [lemma for lemma in lemmas
                if lemma.isalpha() and lemma.lower() not in stopwords]

    return ' '.join(a_lemmas)

# Apply preprocess to ted['transcript']
# NOTE: this overwrites the raw transcripts in place, so re-running the cell
# would preprocess text that has already been cleaned.
ted['transcript'] = ted['transcript'].apply(preprocess)
print(ted['transcript'])

## Part-of-speech tagging

**POS tagging**
- Assigning each word its corresponding part of speech

`"Jane is an amazing guitarist"`
- POS Tagging:
    - `Jane` -> proper noun
    - `is` -> verb
    - `an` -> determiner
    - `amazing` -> adjective
    - `guitarist` -> noun

In [9]:
# POS tagging with spaCy: pair each token's text with its part-of-speech tag
pos = [(tok.text, tok.pos_) for tok in doc]
print(pos)

[('Four', 'NUM'), ('score', 'NOUN'), ('and', 'CCONJ'), ('seven', 'NUM'), ('years', 'NOUN'), ('ago', 'ADV'), ('our', 'PRON'), ('fathers', 'NOUN'), ('brought', 'VERB'), ('forth', 'ADP'), ('on', 'ADP'), ('this', 'DET'), ('continent', 'NOUN'), (',', 'PUNCT'), ('a', 'DET'), ('new', 'ADJ'), ('nation', 'NOUN'), (',', 'PUNCT'), ('conceived', 'VERB'), ('in', 'ADP'), ('Liberty', 'PROPN'), (',', 'PUNCT'), ('and', 'CCONJ'), ('dedicated', 'VERB'), ('to', 'ADP'), ('the', 'DET'), ('proposition', 'NOUN'), ('that', 'PRON')]


In [11]:
# Sample sentence to tag, stored as `lotf`
lotf = "He found himself understanding the wearisomeness of this life, where every path was an improvisation and a considerable part of one’s waking life was spent watching one’s feet."

# Load the small English pipeline and parse the sentence
nlp = spacy.load('en_core_web_sm')

doc = nlp(lotf)

# Pair each token's text with its part-of-speech tag
pos = [(tok.text, tok.pos_) for tok in doc]
print(pos)

[('He', 'PRON'), ('found', 'VERB'), ('himself', 'PRON'), ('understanding', 'VERB'), ('the', 'DET'), ('wearisomeness', 'NOUN'), ('of', 'ADP'), ('this', 'DET'), ('life', 'NOUN'), (',', 'PUNCT'), ('where', 'SCONJ'), ('every', 'DET'), ('path', 'NOUN'), ('was', 'AUX'), ('an', 'DET'), ('improvisation', 'NOUN'), ('and', 'CCONJ'), ('a', 'DET'), ('considerable', 'ADJ'), ('part', 'NOUN'), ('of', 'ADP'), ('one', 'NUM'), ('’s', 'PART'), ('waking', 'VERB'), ('life', 'NOUN'), ('was', 'AUX'), ('spent', 'VERB'), ('watching', 'VERB'), ('one', 'NUM'), ('’s', 'PART'), ('feet', 'NOUN'), ('.', 'PUNCT')]


In [12]:
# Count the proper nouns in a piece of text

nlp = spacy.load('en_core_web_sm')

def proper_nouns(text, model=nlp):
    """Return how many tokens of ``text`` are tagged 'PROPN'.

    ``model`` is called on ``text`` to produce the tokens; it defaults to the
    ``nlp`` pipeline loaded just above.
    """
    # Each matching token contributes 1 to the total
    return sum(tok.pos_ == 'PROPN' for tok in model(text))

print(proper_nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

3


In [13]:
# Count the common (non-proper) nouns in a piece of text

nlp = spacy.load('en_core_web_sm')

def nouns(text, model=nlp):
    """Return how many tokens of ``text`` are tagged 'NOUN'.

    ``model`` is called on ``text`` to produce the tokens; it defaults to the
    ``nlp`` pipeline loaded just above.
    """
    # Each matching token contributes 1 to the total
    return sum(tok.pos_ == 'NOUN' for tok in model(text))

print(nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

2


In [None]:
# Proper-noun usage in real vs. fake news headlines

# Count the proper nouns in every headline title
headlines['num_propn'] = headlines['title'].apply(proper_nouns)

# Average the per-headline counts separately for each label
real_propn = headlines.loc[headlines['label'] == 'REAL', 'num_propn'].mean()
fake_propn = headlines.loc[headlines['label'] == 'FAKE', 'num_propn'].mean()

# Report both means to two decimal places
print(f"Mean no. of proper nouns in real and fake headlines are {real_propn:.2f} and {fake_propn:.2f} respectively")

### Named entity recognition

**Applications**
- Efficient search algorithms
- Question answering
- News article classification
- Customer service

**Named entity recognition (NER)**
- Identifying and classifying named entities into predefined categories.
- Categories include person, organization, country, etc.

`"John Doe is a software engineer working at Google. He lives in France."`

- **Named Entities**
- `John Doe` -> person
- `Google` -> organization
- `France` -> country (geopolitical entity)


In [14]:
# Named entity recognition with spaCy
string = "John Doe is a software engineer working at Google. He lives in France."

# Load the small English pipeline and parse the text
nlp = spacy.load('en_core_web_sm')
doc = nlp(string)

# Pair each recognised entity's text with its label
ne = [(entity.text, entity.label_) for entity in doc.ents]
print(ne)

[('John Doe', 'PERSON'), ('Google', 'ORG'), ('France', 'GPE')]


In [15]:
# Find the people mentioned in a news article
string = "It’s' been a busy day for Facebook  exec op-eds. Earlier this morning, Sheryl Sandberg broke the site’s silence around the Christchurch massacre, and now Mark Zuckerberg is calling on governments and other bodies to increase regulation around the sorts of data Facebook traffics in. He’s hoping to get out in front of heavy-handed regulation and get a seat at the table shaping it."

def find_persons(text):
    """Return the text of every entity in ``text`` labelled 'PERSON'.

    The text is parsed with the module-level ``nlp`` pipeline; entities are
    returned as a list in the order they appear in ``doc.ents``.
    """
    parsed = nlp(text)
    return [entity.text for entity in parsed.ents if entity.label_ == 'PERSON']

print(find_persons(string))


['Sheryl Sandberg', 'Mark Zuckerberg']
