In [1]:
import numpy as np
import nltk

In [1]:
!pip install nltk



In [2]:
from nltk.corpus import brown

In [9]:
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\Mohit
[nltk_data]     Uniyal\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [10]:
reviews = brown.sents(categories='reviews')

In [12]:
print(reviews[3])

['Not', 'the', 'noblest', 'performance', 'we', 'have', 'heard', 'him', 'play', ',', 'or', 'the', 'most', 'spacious', ',', 'or', 'even', 'the', 'most', 'eloquent', '.']


In [19]:
brown.words(categories="news")

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

## Basic NLP Pipeline

* Data collection
* Tokenization, stopword removal, stemming
* Building a common vocabulary
* Vectorizing all documents
* Performing classification

### 2.a Tokenization

In [2]:
text = "It was very pleasnt day, i went to the market to buy some fruits...!!"

In [24]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [23]:
print(word_tokenize(text))

['It', 'was', 'very', 'pleasnt', 'day', ',', 'i', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'fruits', '...', '!', '!']


In [25]:
sent_text = "Cat is running. I play football. I love python"

In [26]:
sent_tokenize(sent_text)

['Cat is running.', 'I play football.', 'I love python']

### 2.b Stopwords

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Mohit
[nltk_data]     Uniyal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
from nltk.corpus import stopwords

In [31]:
sw = set(stopwords.words('english'))

In [222]:
print(sw)

{'because', "you'd", 'out', 'too', "you're", 'do', 'himself', 'only', 'were', 'it', 'weren', 'into', 'below', 'down', 'hers', 'am', 't', 'our', 'there', 'herself', 'through', 'own', 've', 'i', 'couldn', "you've", 'under', 'then', 'and', 'after', 'wasn', 'ours', 'a', 'those', 'where', 'all', "weren't", 'me', 'they', 're', 'them', 'some', 'should', 'wouldn', 'few', "couldn't", "won't", 'once', 'at', 'her', 'an', 'as', 'don', 'before', 'his', "hadn't", 'had', 'your', 'why', 'itself', 'll', "that'll", 'just', 'this', "shan't", 'd', 'between', 'which', "isn't", 'theirs', 'than', 'further', 'other', 'has', 'now', 'themselves', "didn't", 'y', 'him', 'very', 'if', 'doesn', 'of', 'yourselves', 'what', 'he', 'their', 'against', "haven't", 'shan', 'from', 'more', 'nor', 'haven', 'my', 'needn', 'won', "wouldn't", "hasn't", 'm', 'did', 'no', 'does', 'but', 'each', 'is', 'while', 'these', 'both', 'to', 'over', 'here', 'not', "mustn't", 'yourself', 'aren', 'been', "you'll", 'you', 'mightn', 'such', '

### 2.c Stemming

In [37]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

In [38]:
ps = PorterStemmer()

In [39]:
ps.stem("jumping")

'jump'

In [40]:
ps.stem("jumped")

'jump'

In [45]:
ps.stem("awesome")

'awesom'

In [46]:
sb = SnowballStemmer("french")

In [50]:
sb.stem("merci")

'merc'

Write one function that performs all three steps

In [77]:
def filter_words(sentence):
    """Tokenize a sentence, drop English stopwords, and stem what remains.

    Parameters
    ----------
    sentence : str
        Raw text to process.

    Returns
    -------
    list of str
        Stemmed tokens in their original order. Punctuation tokens produced
        by ``word_tokenize`` are kept; only stopwords are removed.
    """
    tokens = word_tokenize(sentence)
    # The stopword set `sw` holds lowercase words, so compare on the lowercased
    # token; otherwise capitalised stopwords such as "The" or "It" slip through.
    return [ps.stem(tok) for tok in tokens if tok.lower() not in sw]

In [3]:
text = "Cat is running, !!!!! Dog %$$@$^&%*^(&)&^&%$^ is running"

In [79]:
print(filter_words(text))

['cat', 'run', ',', '!', '!', '!', '!', '!', 'dog', '%', '$', '$', '@', '$', '^', '&', '%', '*^', '(', '&', ')', '&', '^', '&', '%', '$', '^', 'run']


In [5]:
from nltk.tokenize import RegexpTokenizer

In [12]:
tokenizer = RegexpTokenizer("[a-zA-Z]+")
print(tokenizer.tokenize(text))

['Cat', 'is', 'running', 'Dog', 'is', 'running']


### 3. Building common vocabulary

In [183]:
corpus= ["Indian indian cricket team will win World Cup",
        "We will win next Lok Sabha Election, says Indian PM",
        "Raazi is an exciting Indian spy movie based on real incident",
        "APJ Won hearts of many Indians."]

In [184]:
def myTokenizer(sentence):
    """Turn a sentence into a list of stemmed, stopword-free tokens.

    The sentence is lowercased, split with the module-level ``tokenizer``,
    filtered against the stopword set ``sw``, and each surviving token is
    stemmed with ``ps``.
    """
    stems = []
    for token in tokenizer.tokenize(sentence.lower()):
        if token in sw:
            continue  # skip stopwords before stemming
        stems.append(ps.stem(token))
    return stems

In [185]:
from sklearn.feature_extraction.text import CountVectorizer

In [203]:
# token_pattern=None: the custom tokenizer replaces the default regex pattern,
# so disabling it avoids scikit-learn's "token_pattern will not be used" warning.
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None, ngram_range=(1,2))

In [204]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [205]:
print(vectorized_corpus)

[[0 0 0 0 1 1 1 0 0 0 0 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 1 1 0 1 1 1]
 [0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1
  0 0 0 0 1 1 0 0 0]
 [0 0 1 1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0
  1 1 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0]]


In [206]:
print(cv.vocabulary_)

{'indian': 14, 'cricket': 4, 'team': 38, 'win': 40, 'world': 43, 'cup': 6, 'indian indian': 16, 'indian cricket': 15, 'cricket team': 5, 'team win': 39, 'win world': 42, 'world cup': 44, 'next': 25, 'lok': 19, 'sabha': 32, 'elect': 7, 'say': 34, 'pm': 27, 'win next': 41, 'next lok': 26, 'lok sabha': 20, 'sabha elect': 33, 'elect say': 8, 'say indian': 35, 'indian pm': 17, 'raazi': 28, 'excit': 9, 'spi': 36, 'movi': 23, 'base': 2, 'real': 30, 'incid': 13, 'raazi excit': 29, 'excit indian': 10, 'indian spi': 18, 'spi movi': 37, 'movi base': 24, 'base real': 3, 'real incid': 31, 'apj': 0, 'heart': 11, 'mani': 21, 'apj heart': 1, 'heart mani': 12, 'mani indian': 22}


In [207]:
cv.vocabulary_["indian"]

14

In [208]:
vectorized_corpus.shape

(4, 45)

In [209]:
# NOTE(review): `cv` was fitted with 45 features (vectorized_corpus.shape is
# (4, 45)), but this vector has only 22 entries, which equals the unigram-only
# feature count. Presumably left over from an earlier ngram_range=(1,1) run;
# confirm the intended length.
new_text = np.ones((22,))

In [210]:
new_text[3:7] = 0
new_text[10:17] = 0
new_text[18:21] =0

In [211]:
new_text

array([1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 1.])

In [212]:
# inverse_transform expects a 2-D (n_samples, n_features) input, so pass the
# 1-D indicator vector as a single-row matrix.
cv.inverse_transform(new_text.reshape(1, -1))

[array(['apj', 'apj heart', 'base', 'elect', 'elect say', 'excit',
        'indian pm', 'mani'], dtype='<U14')]

In [213]:
cv.transform(["Indian movi PM "]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]], dtype=int64)

## TF-IDF Vectorizer

In [216]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [217]:
# token_pattern=None: the custom tokenizer replaces the default regex pattern,
# so disabling it avoids scikit-learn's "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=myTokenizer, token_pattern=None, ngram_range=(1,1))

In [218]:
vectorized_tfidf = tfidf.fit_transform(corpus).toarray()

In [220]:
print(vectorized_tfidf)

[[0.         0.         0.41845521 0.41845521 0.         0.
  0.         0.         0.43673458 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.41845521 0.3299149  0.41845521]
 [0.         0.         0.         0.         0.38086157 0.
  0.         0.         0.19874937 0.38086157 0.         0.
  0.38086157 0.38086157 0.         0.         0.38086157 0.38086157
  0.         0.         0.30027564 0.        ]
 [0.         0.37082034 0.         0.         0.         0.37082034
  0.         0.37082034 0.19350944 0.         0.         0.37082034
  0.         0.         0.37082034 0.37082034 0.         0.
  0.37082034 0.         0.         0.        ]
 [0.55280532 0.         0.         0.         0.         0.
  0.55280532 0.         0.28847675 0.         0.55280532 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]]


In [221]:
# vectorized_tfidf is already a NumPy array (it comes from .toarray()), so no
# extra np.array(...) wrap is needed to read its shape.
vectorized_tfidf.shape

(4, 22)