# One Hot Encoding

In [1]:
import numpy as np
docs = 'Can I eat the Pizza'.lower().split()
doc1 = set(docs)
doc1 = sorted(doc1)
print('\nvalues: ',doc1)

integer_encoded = []
for i in docs:
  v = np.where(np.array(doc1)==i)[0][0]
  integer_encoded.append(v)
print('\ninteger encoded: ',integer_encoded)

def get_vec(len_doc,word):
  empty_vector = [0] * len_doc
  vect = 0
  find = np.where(np.array(doc1)==word)[0][0]
  empty_vector[find] = 1
  return empty_vector

def get_matrix(doc1):
  mat = []
  len_doc = len(doc1)
  for i in docs:
    vec = get_vec(len_doc,i)
    mat.append(vec)
  return np.asarray(mat)
print('\nOne Hot Encoding:')
print(get_matrix(doc1))



values:  ['can', 'eat', 'i', 'pizza', 'the']

integer encoded:  [0, 2, 1, 4, 3]

One Hot Encoding:
[[1 0 0 0 0]
 [0 0 1 0 0]
 [0 1 0 0 0]
 [0 0 0 0 1]
 [0 0 0 1 0]]


# Bag Of Words

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer


text = [
    'There was a man',
    'The man had a dog',
    'The dog and the man walked',
]

#using tokenizer
model = Tokenizer()
model.fit_on_texts(text)

#print keys
print(f'Key : {list(model.word_index.keys())}')

#Create bag of words representation
#ignore the first column
rep = model.texts_to_matrix(text,mode='count')
print(rep)

Key : ['man', 'the', 'a', 'dog', 'there', 'was', 'had', 'and', 'walked']
[[0. 1. 0. 1. 0. 1. 1. 0. 0. 0.]
 [0. 1. 1. 1. 1. 0. 0. 1. 0. 0.]
 [0. 1. 2. 0. 1. 0. 0. 0. 1. 1.]]


# N-Grams

In [20]:
import re
from nltk.util import ngrams
s = 'natural language nlp is science and artificial intelligence concerned'
s = s.lower()
s = re.sub(r'[^a-zA-Z0-9\s]',' ',s)
tokens = [token for token in s.split(' ') if token!='']

#Generate Bigrams
output = list(ngrams(tokens,2))
print(output)

#Generate Trigrams
output = list(ngrams(tokens, 3))
print(output)

[('natural', 'language'), ('language', 'nlp'), ('nlp', 'is'), ('is', 'science'), ('science', 'and'), ('and', 'artificial'), ('artificial', 'intelligence'), ('intelligence', 'concerned')]
[('natural', 'language', 'nlp'), ('language', 'nlp', 'is'), ('nlp', 'is', 'science'), ('is', 'science', 'and'), ('science', 'and', 'artificial'), ('and', 'artificial', 'intelligence'), ('artificial', 'intelligence', 'concerned')]


In [23]:
#N-grams hashing
import pandas as pd
import requests
from collections import Counter

seuss_dir = 'https://dlsun.github.io/pods/data/drseuss/'
seuss_files = [
    'green_eggs_and_ham.txt', 'cat_in_the_hat.txt']

docs_seuss = pd.Series()
for file in seuss_files:
  response = requests.get(seuss_dir + file, 'r')
  docs_seuss[file[:-4]] = response.text
  docs_seuss
  docs_seuss.str.split()
  words = (
      docs_seuss.str.lower().str.replace("[^\w\s]", " ").str.split())
words


Unnamed: 0,0
green_eggs_and_ham,"[i, am, sam, i, am, sam, sam, i, am, that, sam..."
cat_in_the_hat,"[the, sun, did, not, shine., it, was, too, wet..."


In [None]:
def get_bigrams(words):
  return zip(words, words[1:])

words.apply(get_bigrams).apply(Counter)

Unnamed: 0,0
green_eggs_and_ham,"{('i', 'am'): 3, ('am', 'sam'): 2, ('sam', 'i'..."
cat_in_the_hat,"{('the', 'sun'): 2, ('sun', 'did'): 1, ('did',..."


# TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
#assign documents
d0 = 'I like to walk'
d1 = 'I like to swim'
d2 = 'I dont wish to walk behind you'

#merge documents into a single corpus
string = [d0, d1, d2]
#create object
tfidf = TfidfVectorizer()

#get tf-df values
result = tfidf.fit_transform(string)
#get idf values
print('\nidf values:')
for ele1, ele2,in zip(tfidf.get_feature_names_out(), tfidf.idf_):
  print(ele1, ':', ele2)

# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

#display tf-idf values
print('\ntf-idf value:')
print(result)

# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())



idf values:
behind : 1.6931471805599454
dont : 1.6931471805599454
like : 1.2876820724517808
swim : 1.6931471805599454
to : 1.0
walk : 1.2876820724517808
wish : 1.6931471805599454
you : 1.6931471805599454

Word indexes:
{'like': 2, 'to': 4, 'walk': 5, 'swim': 3, 'dont': 1, 'wish': 6, 'behind': 0, 'you': 7}

tf-idf value:
  (0, 2)	0.6198053799406072
  (0, 4)	0.48133416873660545
  (0, 5)	0.6198053799406072
  (1, 2)	0.5478321549274363
  (1, 4)	0.4254405389711991
  (1, 3)	0.7203334490549893
  (2, 4)	0.2660749625405929
  (2, 5)	0.3426199591918006
  (2, 1)	0.450504072643198
  (2, 6)	0.450504072643198
  (2, 0)	0.450504072643198
  (2, 7)	0.450504072643198

tf-idf values in matrix form:
[[0.         0.         0.61980538 0.         0.48133417 0.61980538
  0.         0.        ]
 [0.         0.         0.54783215 0.72033345 0.42544054 0.
  0.         0.        ]
 [0.45050407 0.45050407 0.         0.         0.26607496 0.34261996
  0.45050407 0.45050407]]


In [None]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer
#assign documents
d0 = 'I like machine learning'
d1 = 'machine learning is fun'
d2 = 'I teach machine learning'

#merge documents into a single corpus
string = [d0,d1,d2]
#create object
tfidf = TfidfVectorizer()

#get tf-idf values
result = tfidf.fit_transform(string)
#get idf values
print('\nidf values:')
for ele1,ele2 in zip(tfidf.get_feature_names_out(),tfidf.idf_):
  print(ele1,':',ele2)

#get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

#display tf-idf values
print('\ntf-idf value:')
print(result)

#in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())


idf values:
fun : 1.6931471805599454
is : 1.6931471805599454
learning : 1.0
like : 1.6931471805599454
machine : 1.0
teach : 1.6931471805599454

Word indexes:
{'like': 3, 'machine': 4, 'learning': 2, 'is': 1, 'fun': 0, 'teach': 5}

tf-idf value:
  (0, 2)	0.4532946552278861
  (0, 4)	0.4532946552278861
  (0, 3)	0.7674945674619879
  (1, 0)	0.6088450986844796
  (1, 1)	0.6088450986844796
  (1, 2)	0.35959372325985667
  (1, 4)	0.35959372325985667
  (2, 5)	0.7674945674619879
  (2, 2)	0.4532946552278861
  (2, 4)	0.4532946552278861

tf-idf values in matrix form:
[[0.         0.         0.45329466 0.76749457 0.45329466 0.        ]
 [0.6088451  0.6088451  0.35959372 0.         0.35959372 0.        ]
 [0.         0.         0.45329466 0.         0.45329466 0.76749457]]


# Word2Vec

In [None]:
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize , word_tokenize
import warnings

warnings.filterwarnings(action='ignore')

import gensim
from gensim.models import Word2Vec

s='natural language processing nlp is science and artificial intelligence.'

# Replaces escape character with space
f = s.replace('\n',' ')

data = []

#iterate through each sentence in the file
for i in sent_tokenize(f):
  temp = []

# tokenize the sentences into words
for j in word_tokenize(i):
  temp.append(j.lower())

data.append(temp)
data



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['natural',
  'language',
  'processing',
  'nlp',
  'is',
  'science',
  'and',
  'artificial',
  'intelligence',
  '.']]

In [None]:
model1 = gensim.models.Word2Vec(sentences = data,vector_size=8,window=5,min_count=1,workers=4)
print(model1.wv.get_vector('natural'))

print('Cosine Similarity between "artificial"'+
      'and "intelligence" -CBOW : ',
      model1.wv.similarity('artificial','intelligence'))

print("Cosine similarity between 'artificial' "+
      "and 'natural' - CBOW : ",
      model1.wv.similarity('artificial','natural'))

[-0.01817176 -0.11510178  0.0546319   0.00714731  0.09302385 -0.01016603
 -0.03298017 -0.10941261]
Cosine Similarity between "artificial"and "intelligence" -CBOW :  -0.4236376
Cosine similarity between 'artificial' and 'natural' - CBOW :  -0.10864579


In [None]:
model2 = gensim.models.Word2Vec(data,vector_size=100,window=5,min_count=1)


print('Cosine Similarity between "artificial"'+
      'and "intelligence" -CBOW : ',
      model2.wv.similarity('artificial','intelligence'))

print("Cosine similarity between 'artificial' "+
      "and 'natural' - CBOW : ",
      model2.wv.similarity('artificial','natural'))

Cosine Similarity between "artificial"and "intelligence" -CBOW :  -0.02367166
Cosine similarity between 'artificial' and 'natural' - CBOW :  -0.032843158


In [None]:
from gensim.models import Word2Vec
sentences = [['cat','say','meow'], ['dog','say','woof']]
model = Word2Vec(min_count=1)
model.build_vocab(sentences)
model.train(sentences,total_examples=model.corpus_count,epochs=model.epochs)
# (1,30)
model.wv.similarity('meow','woof')

0.06797595

# GLOVE Embedding

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-08-26 15:38:39--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-08-26 15:38:39--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-08-26 15:38:39--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!unzip glove*.zip
!ls
!pwd

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt   sample_data
/content


In [None]:
print('Indexing word vectors')
embeddings_index = {}
f = open('/content/glove.6B.100d.txt',encoding='utf-8')
for line in f:
  values = line.split()
  word = values[0]
  coefs = np.asarray(values[1:],dtype='float32')
  embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors
Found 400000 word vectors.


In [None]:
u = embeddings_index['program']
norm_u = np.linalg.norm(u)
similarity = []
for word in embeddings_index.keys():
  v = embeddings_index[word]
  cosine = np.dot(u,v)/norm_u/np.linalg.norm(v)
  similarity.append((word,cosine))
print(len(similarity))
sorted(similarity,key=lambda x:x[1],reverse=True)[:10]

400000


[('program', 0.99999994),
 ('programs', 0.9071443),
 ('programme', 0.7875009),
 ('project', 0.7505014),
 ('funding', 0.7270174),
 ('plan', 0.71068203),
 ('system', 0.6989038),
 ('plans', 0.6961886),
 ('education', 0.69365185),
 ('programmes', 0.68979746)]

# BERT

In [None]:
!pip install transformers
from transformers import pipeline
unmasker = pipeline('fill-mask',model = 'bert-base-uncased')



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
unmasker("Artificial Intelligence [MASK] take over the world.")

[{'score': 0.31824102997779846,
  'token': 2064,
  'token_str': 'can',
  'sequence': 'artificial intelligence can take over the world.'},
 {'score': 0.18299654126167297,
  'token': 2097,
  'token_str': 'will',
  'sequence': 'artificial intelligence will take over the world.'},
 {'score': 0.05600149929523468,
  'token': 2000,
  'token_str': 'to',
  'sequence': 'artificial intelligence to take over the world.'},
 {'score': 0.045194968581199646,
  'token': 2015,
  'token_str': '##s',
  'sequence': 'artificial intelligences take over the world.'},
 {'score': 0.045153141021728516,
  'token': 2052,
  'token_str': 'would',
  'sequence': 'artificial intelligence would take over the world.'}]

In [None]:
unmasker("The man worked as a [MASK].")

[{'score': 0.09747565537691116,
  'token': 10533,
  'token_str': 'carpenter',
  'sequence': 'the man worked as a carpenter.'},
 {'score': 0.052383214235305786,
  'token': 15610,
  'token_str': 'waiter',
  'sequence': 'the man worked as a waiter.'},
 {'score': 0.04962708428502083,
  'token': 13362,
  'token_str': 'barber',
  'sequence': 'the man worked as a barber.'},
 {'score': 0.037886086851358414,
  'token': 15893,
  'token_str': 'mechanic',
  'sequence': 'the man worked as a mechanic.'},
 {'score': 0.037680841982364655,
  'token': 18968,
  'token_str': 'salesman',
  'sequence': 'the man worked as a salesman.'}]

In [None]:
unmasker("The woman worked as a [MASK].")

[{'score': 0.21981509029865265,
  'token': 6821,
  'token_str': 'nurse',
  'sequence': 'the woman worked as a nurse.'},
 {'score': 0.15974131226539612,
  'token': 13877,
  'token_str': 'waitress',
  'sequence': 'the woman worked as a waitress.'},
 {'score': 0.11547307670116425,
  'token': 10850,
  'token_str': 'maid',
  'sequence': 'the woman worked as a maid.'},
 {'score': 0.03796877712011337,
  'token': 19215,
  'token_str': 'prostitute',
  'sequence': 'the woman worked as a prostitute.'},
 {'score': 0.03042384423315525,
  'token': 5660,
  'token_str': 'cook',
  'sequence': 'the woman worked as a cook.'}]

# Word Operations

**Tokenizer**

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
EXAMPLE_TEXT = "Hello dear students, how are you doing today? The weather is great, and Python is awesome. Hope you are focussing on the class."

print(sent_tokenize(EXAMPLE_TEXT))
print(word_tokenize(EXAMPLE_TEXT))

['Hello dear students, how are you doing today?', 'The weather is great, and Python is awesome.', 'Hope you are focussing on the class.']
['Hello', 'dear', 'students', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'Hope', 'you', 'are', 'focussing', 'on', 'the', 'class', '.']


**POS Tagging**

In [None]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
nltk.download('state_union')

[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
nltk.download('averaged_perceptron_tagger')
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

**Stop Words Removal**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
example_sent = 'This is a sample sentence, showing off the stop words filtration.'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
filtered_Sentence = [w for w in word_tokens if not w in stop_words]
filtered_sentence=[]
for w in word_tokens:
  if w not in stop_words:
    filtered_sentence.append(w)
print(word_tokens)
print(filtered_Sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# NER Tagging

In [None]:
import spacy
from spacy import displacy
ner_pipeline = spacy.load('en_core_web_sm')
ner_pipeline_labels = ner_pipeline.get_pipe('ner').labels
ner_pipeline_labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [None]:
sample_text = """ The company was founded in December 2002 by Reid Hoffman and the founding team members from PayPal and Socialnet.com (Allen Blue, Eric Ly, Jean-Luc Vaillant, Lee Hower, Konstantin Guericke, Stephen Beitzel, David Eves, Ian McNish, Yan Pujante, Chris Saccheri).In late 2003, Sequoia Capital led the Series A investment in the company.In August 2004, LinkedIn reached 1 million users.In March 2006, LinkedIn achieved its first month of profitability.In April 2007, LinkedIn reached 10 million users.In February 2008, LinkedIn launched a mobile version of the site.

In June 2008, Sequoia Capital, Greylock Partners, and other venture capital firms purchased a 5% stake in the company for $53 million, giving the company a post-money valuation of approximately $1 billion. In November 2009, LinkedIn opened its office in Mumbai and soon thereafter in Sydney, as it started its Asia-Pacific team expansion. In 2010 LinkedIn opened an International Headquarters in Dublin, Ireland,received a $20 million investment from Tiger Global Management LLC at a valuation of approximately $2 billion,announced its first acquisition, Mspoke,and improved its 1% premium subscription ratio. In October of that year, Silicon Valley Insider ranked the company No. 10 on its Top 100 List of most valuable startups. By December, the company was valued at $1.575 billion in private markets. LinkedIn started its India operations in 2009 and a major part of the first year was dedicated to understanding professionals in India and educating members to leverage LinkedIn for career development.

LinkedIn office building at 222 Second Street in San Francisco (opened in March 2016)

LinkedIn office in Toronto inside the Toronto Eaton Centre

LinkedIn filed for an initial public offering in January 2011."""

len(sample_text.split('.'))
ner_text = ner_pipeline(sample_text)
for word in ner_text.ents:
  print(word.text,word.label_,word.start_char,word.end_char)



December 2002 DATE 28 41
Reid Hoffman PERSON 45 57
PayPal ORG 93 99
Socialnet.com ORG 104 117
Allen Blue PERSON 119 129
Eric Ly PERSON 131 138
Jean-Luc Vaillant PERSON 140 157
Lee Hower PERSON 159 168
Konstantin Guericke PERSON 170 189
Stephen Beitzel PERSON 191 206
David Eves PERSON 208 218
Ian McNish PERSON 220 230
Yan Pujante PERSON 232 243
Chris PERSON 245 250
late 2003 DATE 264 273
Sequoia Capital ORG 275 290
Series EVENT 299 305
August 2004 DATE 337 348
LinkedIn GPE 350 358
1 million CARDINAL 367 376
March 2006 DATE 386 396
LinkedIn GPE 398 406
first month DATE 420 431
April 2007 DATE 452 462
LinkedIn GPE 464 472
10 million CARDINAL 481 491
February 2008 DATE 501 514
LinkedIn PERSON 516 524
June 2008 DATE 568 577
Sequoia Capital ORG 579 594
Greylock Partners ORG 596 613
5% PERCENT 659 661
$53 million MONEY 687 698
approximately $1 billion MONEY 745 769
November 2009 DATE 774 787
LinkedIn GPE 789 797
Mumbai GPE 819 825
Sydney GPE 849 855
Asia-Pacific LOC 875 887
2010 DATE 907 911


In [None]:
displacy.render(ner_text, style='ent',jupyter=True)

In [None]:
#Stemming
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

words = ['jump','jumped','jumps','jumping']
stemmer = PorterStemmer()
for word in words:
  print(word + '='+stemmer.stem(word))

jump=jump
jumped=jump
jumps=jump
jumping=jump


In [None]:
#Lemmatization
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

words = ['jump','jumped','jumps','jumping']
lemmatizer = WordNetLemmatizer()
for word in words:
  print(word+'='+lemmatizer.lemmatize(word))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


jump=jump
jumped=jumped
jumps=jump
jumping=jumping
