## Natural Language Processing (NLP) and Natural Language Toolkit (NLTK)

In [1]:
#Download a dataset
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /home/muskan/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [2]:
#check categories in 'brown' dataset 
from nltk.corpus import brown
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [4]:
#Print words from a category
data=brown.sents(categories='editorial')[:100]
print(data)
print(len(data))

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]
100


## Tokenization

In [3]:
text="It was a very pleasant day, the weather was cool and there were light showers. I went to market to buy some fruits."

In [4]:
#sentence tokenizer and word tokenizer
from nltk.tokenize import sent_tokenize,word_tokenize
sents=sent_tokenize(text)
print(sents)
words=word_tokenize(sents[0].lower())
print(words)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to market to buy some fruits.']
['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


## Stopword Removal

In [8]:
#list of stopwords in nltk
from nltk.corpus import stopwords
sw=set(stopwords.words('english'))
print(len(sw))
print(sw)

179
{'re', 'from', 'his', 'wasn', 'weren', 'than', 'can', 'between', 'does', "didn't", 't', 'theirs', 'doesn', 'haven', "wouldn't", 'against', "hadn't", 'ma', 'her', 'of', 'there', 'do', 'some', 'should', 've', "hasn't", 'at', 'we', "that'll", "it's", 'hasn', "she's", 'while', 'was', 'or', 'most', 'the', 'be', 'nor', 'didn', 'those', 'itself', 'after', 'been', 'because', 'on', 'before', 'its', 'why', 'me', 'this', 'just', 'm', 'are', 'he', 'other', 'further', 'very', 'these', 'now', 'when', 'not', "don't", 'o', 'whom', 'will', 'where', 'it', 'only', 'your', 'had', 'hadn', 'if', 'have', 'yours', 'each', 'ain', "you've", 'with', 'both', "couldn't", 'and', 'needn', 'aren', "isn't", "needn't", "wasn't", 'am', 'my', "you'd", 'an', 'is', 'as', 's', 'how', 'y', 'during', 'isn', 'any', "weren't", 'won', 'more', "mustn't", "shouldn't", 'ourselves', 'hers', 'yourselves', 'about', 'ours', "mightn't", 'by', 'themselves', 'again', 'for', 'has', 'then', 'through', 'them', "shan't", 'below', 'you', '

In [9]:
#filter words by removing stopwords
def filter_words(word_list):
    useful_words=[w for w in word_list if w not in sw]
    return useful_words

filtered_words=filter_words(words)
print(filtered_words)

['pleasant', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


## Tokenization using regular expression

In [10]:
#make your own tokenizer
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer("[a-zA-Z@]+")

In [11]:
#tokenize through own tokenizer
text="It was a very pleasant day, the weather was cool and there were light showers. I went to market to buy some fruits."
print(tokenizer.tokenize(text))

['It', 'was', 'a', 'very', 'pleasant', 'day', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', 'I', 'went', 'to', 'market', 'to', 'buy', 'some', 'fruits']


In [11]:
word_list=tokenizer.tokenize(text.lower())
fil_word=filter_words(word_list)
print(fil_word)

['pleasant', 'day', 'weather', 'cool', 'light', 'showers', 'went', 'market', 'buy', 'fruits']


## Stemming of words

In [12]:
#Porter Stemmer
from nltk.stem.snowball import PorterStemmer
ps=PorterStemmer()
ps.stem("jumped")

'jump'

In [13]:
ps.stem("lovely")

'love'

In [14]:
ps.stem("awesome")

'awesom'

In [15]:
#Lancaster Stemmer
from nltk.stem.lancaster import LancasterStemmer
ls=LancasterStemmer()
ls.stem("teeth")

'tee'

In [16]:
print(ps.stem("teenager"))
print(ls.stem("teenager"))

teenag
teen


In [17]:
#Snowball Stemmer
from nltk.stem.snowball import SnowballStemmer
ss=SnowballStemmer('english')
ss.stem("meaningly")

'mean'

In [18]:
print(ss.stem('lovely'))
print(ss.stem('teenager'))

love
teenag


In [19]:
ss_french = SnowballStemmer('french')
print(ss.stem('courais'))

courai


## Word Lemmatization

In [19]:
from nltk.stem import WordNetLemmatizer
l=WordNetLemmatizer()
l.lemmatize("crying")

'cry'

## Text Cleaning : tokenize, stopword removal and stemming

In [20]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

def mytokenizer(text):
    tokenizer=RegexpTokenizer("[a-zA-Z@]+")
    word_list=tokenizer.tokenize(text)
    #print("word_list after tokenization:")
    #print(word_list)
    sw=set(stopwords.words('english'))
    useful_words=[w for w in word_list if w not in sw]
    #print("word_list after stopword removal:")
    #print(useful_words)
    ls=LancasterStemmer()
    stem_words=[]
    for w in useful_words:
        t=ls.stem(w)
        stem_words.append(t)
    #print("word_list after stemming:")
    #print(stem_words)        
    return stem_words

In [21]:
text= """    Foxes love to make   jumps. The quick brown    fox was seen jumping over the 
        lovely dog from a 6ft feet high wall   """
ans=mytokenizer(text)
print(ans)

['fox', 'lov', 'mak', 'jump', 'the', 'quick', 'brown', 'fox', 'seen', 'jump', 'lov', 'dog', 'ft', 'feet', 'high', 'wal']


# Bag of words Model
--Building common vocabulary and vectorizing documents

In [22]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at SriLanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [23]:
#Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
vectorized_corpus=cv.fit_transform(corpus).toarray()
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
  2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1
  1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0
  0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0 0
  0 0 0 0 0]]
41


In [24]:
print(cv.vocabulary_) #dictionary ---word:index

{'indian': 12, 'cricket': 6, 'team': 30, 'will': 36, 'wins': 38, 'world': 40, 'cup': 7, 'says': 26, 'capt': 4, 'virat': 34, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'srilanka': 28, 'we': 35, 'win': 37, 'next': 18, 'lok': 16, 'sabha': 25, 'elections': 8, 'confident': 5, 'pm': 22, 'the': 31, 'nobel': 19, 'laurate': 15, 'won': 39, 'hearts': 10, 'of': 20, 'people': 21, 'movie': 17, 'raazi': 23, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 27, 'thriller': 32, 'based': 2, 'upon': 33, 'real': 24, 'story': 29}


In [25]:
print(len(cv.vocabulary_))

41


In [26]:
import numpy as np
vector=np.ones((41,))
vector[3:7]=0
print(vector)

[1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [27]:
print(cv.inverse_transform(vector)) #index with value 0 are absent from list

[array(['an', 'at', 'based', 'cup', 'elections', 'exciting', 'hearts',
       'held', 'indian', 'is', 'kohli', 'laurate', 'lok', 'movie', 'next',
       'nobel', 'of', 'people', 'pm', 'raazi', 'real', 'sabha', 'says',
       'spy', 'srilanka', 'story', 'team', 'the', 'thriller', 'upon',
       'virat', 'we', 'will', 'win', 'wins', 'won', 'world'], dtype='<U9')]


In [28]:
print(cv.vocabulary_["capt"])  #gives index of word

4


In [29]:
mytokenizer(corpus[0])

['ind',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'say',
 'capt',
 'vir',
 'kohl',
 'world',
 'cup',
 'held',
 'srilank']

In [30]:
#give your own tokenizer to count vectorizer
cv=CountVectorizer(tokenizer=mytokenizer)
vectorised_corpus=cv.fit_transform(corpus)
print(vectorised_corpus.todense()[0])
print(len(vectorised_corpus.toarray()[0]))

[[0 1 0 1 2 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 2]]
31


In [31]:
v=vectorised_corpus.toarray()[0]
print(v)
v[0]=1
print(v)
cv.inverse_transform(v)

[0 1 0 1 2 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 2]
[1 1 0 1 2 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 2]


[array(['bas', 'capt', 'cricket', 'cup', 'held', 'ind', 'kohl', 'say',
        'srilank', 'team', 'vir', 'win', 'world'], dtype='<U8')]

# N-gram features

In [32]:
#unigram features
cv=CountVectorizer(tokenizer=mytokenizer,ngram_range=(1,1))
vectorised_corpus=cv.fit_transform(corpus)
vc=vectorised_corpus.toarray()
print(cv.vocabulary_)
print(len(vc[0]))

{'ind': 9, 'cricket': 3, 'team': 25, 'win': 29, 'world': 30, 'cup': 4, 'say': 21, 'capt': 1, 'vir': 28, 'kohl': 10, 'held': 8, 'srilank': 23, 'next': 14, 'lok': 12, 'sabh': 20, 'elect': 5, 'confid': 2, 'pm': 17, 'nobel': 15, 'laur': 11, 'heart': 7, 'peopl': 16, 'movy': 13, 'raaz': 18, 'excit': 6, 'spy': 22, 'thriller': 26, 'bas': 0, 'upon': 27, 'real': 19, 'story': 24}
31


In [33]:
#bigram and unigram features
cv=CountVectorizer(tokenizer=mytokenizer,ngram_range=(1,2))
vectorised_corpus=cv.fit_transform(corpus)
vc=vectorised_corpus.toarray()
print(cv.vocabulary_)
print(len(vc[0]))

{'ind': 19, 'cricket': 6, 'team': 50, 'win': 58, 'world': 61, 'cup': 8, 'say': 43, 'capt': 2, 'vir': 56, 'kohl': 23, 'held': 17, 'srilank': 48, 'ind cricket': 20, 'cricket team': 7, 'team win': 51, 'win world': 60, 'world cup': 62, 'cup say': 10, 'say capt': 44, 'capt vir': 3, 'vir kohl': 57, 'kohl world': 24, 'cup held': 9, 'held srilank': 18, 'next': 31, 'lok': 27, 'sabh': 41, 'elect': 11, 'confid': 4, 'pm': 36, 'win next': 59, 'next lok': 32, 'lok sabh': 28, 'sabh elect': 42, 'elect say': 12, 'say confid': 45, 'confid ind': 5, 'ind pm': 21, 'nobel': 33, 'laur': 25, 'heart': 15, 'peopl': 35, 'nobel laur': 34, 'laur heart': 26, 'heart peopl': 16, 'movy': 29, 'raaz': 37, 'excit': 13, 'spy': 46, 'thriller': 52, 'bas': 0, 'upon': 54, 'real': 39, 'story': 49, 'movy raaz': 30, 'raaz excit': 38, 'excit ind': 14, 'ind spy': 22, 'spy thriller': 47, 'thriller bas': 53, 'bas upon': 1, 'upon real': 55, 'real story': 40}
63


In [34]:
#trigram features
cv=CountVectorizer(tokenizer=mytokenizer,ngram_range=(3,3))
vectorised_corpus=cv.fit_transform(corpus)
vc=vectorised_corpus.toarray()
print(cv.vocabulary_)
print(len(vc[0]))

{'ind cricket team': 8, 'cricket team win': 3, 'team win world': 21, 'win world cup': 26, 'world cup say': 28, 'cup say capt': 5, 'say capt vir': 18, 'capt vir kohl': 1, 'vir kohl world': 24, 'kohl world cup': 10, 'world cup held': 27, 'cup held srilank': 4, 'win next lok': 25, 'next lok sabh': 14, 'lok sabh elect': 12, 'sabh elect say': 17, 'elect say confid': 6, 'say confid ind': 19, 'confid ind pm': 2, 'nobel laur heart': 15, 'laur heart peopl': 11, 'movy raaz excit': 13, 'raaz excit ind': 16, 'excit ind spy': 7, 'ind spy thriller': 9, 'spy thriller bas': 20, 'thriller bas upon': 22, 'bas upon real': 0, 'upon real story': 23}
29


# TF-IDF Normalization
-term document frequency : associates weight with every term

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer(tokenizer=mytokenizer,ngram_range=(1,1),norm='l2')
vectorized_corpus=tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.24506716 0.         0.24506716 0.49013431 0.
  0.         0.         0.24506716 0.15642319 0.24506716 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.19321376 0.         0.24506716
  0.         0.24506716 0.         0.         0.24506716 0.19321376
  0.49013431]
 [0.         0.         0.36153669 0.         0.         0.36153669
  0.         0.         0.         0.23076418 0.         0.
  0.36153669 0.         0.36153669 0.         0.         0.36153669
  0.         0.         0.36153669 0.28503968 0.         0.
  0.         0.         0.         0.         0.         0.28503968
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.         0.5
  0.         0.         0.         0.5        0.5        0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.32603555 0.  

In [36]:
print(tfidf_vectorizer.vocabulary_)

{'ind': 9, 'cricket': 3, 'team': 25, 'win': 29, 'world': 30, 'cup': 4, 'say': 21, 'capt': 1, 'vir': 28, 'kohl': 10, 'held': 8, 'srilank': 23, 'next': 14, 'lok': 12, 'sabh': 20, 'elect': 5, 'confid': 2, 'pm': 17, 'nobel': 15, 'laur': 11, 'heart': 7, 'peopl': 16, 'movy': 13, 'raaz': 18, 'excit': 6, 'spy': 22, 'thriller': 26, 'bas': 0, 'upon': 27, 'real': 19, 'story': 24}
