# Basic NLP Pipeline
 * Data Collection
 * Tokenization,Stopword,Stemming
 * Building a common Vocab
 * Vectorizing the documents
 * Performing Classification/ Clustering

## 1. Data Collection

In [1]:
from nltk.corpus import brown

In [2]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
data = brown.sents(categories='editorial')[:100]

In [4]:
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


In [5]:
print(len(data))

100


## 2.  Tokenization

In [6]:
text = 'It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits.'
print(text)

It was a very pleasant day, the weather was cool and there were light showers. I went to the market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [8]:
sents = sent_tokenize(text)
print(sents)

['It was a very pleasant day, the weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [9]:
word_list = word_tokenize(sents[0].lower())

### Stopword Removal

In [10]:
from nltk.corpus import stopwords

In [11]:
sw = set(stopwords.words('english'))

In [12]:
print(len(sw))

179


### filter the  words from sentenses

In [13]:
def filter_words(word_list):
    useful_words = [w for w in word_list if w not in sw]
    return useful_words

### Tokenization using Regular Expression
- Problem with word tokenizer - can't handle complex tokenization ! so we use Regexp Tokenizer class in NLTK 

In [14]:
from nltk.tokenize import RegexpTokenizer

In [15]:
tokenizer = RegexpTokenizer("[a-zA-Z@]+")

In [16]:
text = 'Send all the 50 Documents related to clauses 1,2,3 abc@xyz.com'

print(tokenizer.tokenize(text))

['Send', 'all', 'the', 'Documents', 'related', 'to', 'clauses', 'abc@xyz', 'com']


## Stemming

- Processs that transforms perticular words(verbs,plurals) into their radical forms
- Perserve the semantics of sentence without increasing the number of unique tokens
- jumps,jumping,jumped,jump ==> jump

In [17]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the lovely dog
        from a 6 ft feet high wall"""

words_list = tokenizer.tokenize(text.lower())
print(words_list)

['foxes', 'love', 'to', 'make', 'jumps', 'the', 'quick', 'brown', 'fox', 'was', 'seen', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'a', 'ft', 'feet', 'high', 'wall']


In [18]:
words_list =  filter_words(words_list)
print(word_list)

['it', 'was', 'a', 'very', 'pleasant', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


### Stemming -

    - Snowball Stemmer
    - Porter Stemmer
    - Lancaster Stemmer

In [19]:
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer



ps = PorterStemmer()

In [20]:
ps.stem("jumped")

'jump'

In [21]:
ps.stem("lovely")

'love'

In [22]:
ls = LancasterStemmer()
ls.stem("teeth")

'tee'

In [23]:
print(ps.stem("teenager"))
print(ls.stem("teenager"))
      

teenag
teen


In [24]:
# Snowball Stemmer
ss = SnowballStemmer('english')
print(ss.stem('lovely'))



love


## Building Common Vocabulary and Vectorizing Documents (Based on Bag of words Model)

In [25]:
corpus = ['Indian Cricket team will win World Cup,says Capt. Virat Kohli',
        'We will Win next Lok Sabha Election,says Confident Indian PM',
          'The nobel laurate won the hearts of the people',
          'The Movie Raazi is an Exciting Indian Spy thriller based upon a real story'
         ]

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
cv = CountVectorizer()

In [28]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [29]:
vectorized_corpus
print(len(vectorized_corpus))

4


In [30]:
print(cv.vocabulary_)

{'exciting': 7, 'we': 31, 'real': 21, 'based': 1, 'upon': 29, 'virat': 30, 'world': 35, 'of': 17, 'movie': 14, 'raazi': 20, 'sabha': 22, 'team': 26, 'is': 10, 'laurate': 12, 'hearts': 8, 'next': 15, 'spy': 24, 'capt': 2, 'indian': 9, 'pm': 19, 'thriller': 28, 'kohli': 11, 'lok': 13, 'will': 32, 'nobel': 16, 'story': 25, 'election': 6, 'the': 27, 'cricket': 4, 'an': 0, 'confident': 3, 'says': 23, 'cup': 5, 'win': 33, 'won': 34, 'people': 18}


In [31]:
# Given a Vector what is the sentence
import numpy as np

vector = np.ones((4,))
vector[2:3] = 0

print(vector)

[1. 1. 0. 1.]


In [32]:
print(cv.inverse_transform(vector))

[array(['an', 'based', 'confident'], dtype='<U9')]


In [35]:
### Effectively reduced the size of the vector

def myTokenizer(sentence):
    words = tokenizer.tokenize(sentence.lower())
    return filter_words(words)

myTokenizer(corpus[0])

['indian',
 'cricket',
 'team',
 'win',
 'world',
 'cup',
 'says',
 'capt',
 'virat',
 'kohli']

In [41]:
cv = CountVectorizer(tokenizer=myTokenizer)
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(vc[0])
print(len(vc[0]))

[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]
29


In [42]:
vc[0][0] = 1
v = vc[0]
print(vc[0])
cv.inverse_transform(v)

[1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1]


[array(['based', 'capt', 'cricket', 'cup', 'indian', 'kohli', 'says',
        'team', 'virat', 'win', 'world'], dtype='<U9')]

## Features in Bag of Words Model

- Unigrams
- Bigrams,Trigrams
- N-Grams

In [44]:
cv = CountVectorizer(tokenizer=myTokenizer,ngram_range=(1,3))
vectorized_corpus = cv.fit_transform(corpus)
vc = vectorized_corpus.toarray()
print(cv.vocabulary_)

{'win world': 78, 'team win world': 66, 'indian cricket': 24, 'election says confident': 17, 'nobel': 42, 'says capt': 56, 'exciting indian spy': 20, 'win next': 76, 'exciting': 18, 'upon': 70, 'world cup says': 82, 'next lok sabha': 41, 'movie': 36, 'raazi exciting': 48, 'raazi': 47, 'team': 64, 'spy thriller based': 62, 'nobel laurate': 43, 'hearts people': 22, 'capt virat kohli': 5, 'thriller based': 68, 'indian cricket team': 25, 'spy': 60, 'next lok': 40, 'thriller': 67, 'kohli': 29, 'lok': 33, 'lok sabha election': 35, 'says confident indian': 59, 'story': 63, 'confident indian pm': 8, 'virat kohli': 74, 'cricket team': 10, 'cricket': 9, 'win world cup': 79, 'indian spy': 27, 'says capt virat': 57, 'says confident': 58, 'laurate hearts people': 32, 'lok sabha': 34, 'indian pm': 26, 'confident indian': 7, 'election says': 16, 'real': 50, 'sabha election says': 54, 'upon real story': 72, 'win next lok': 77, 'laurate': 30, 'cup says': 13, 'movie raazi': 37, 'virat': 73, 'world': 80,

# Tf - Idf Normalisation

   * Avoid Feature that occur very often,because they contain very less information
   * Information decreases as the number of occurenses increases across different type of documents
   * So we define another term - term document- frequency which associates a weight with every term

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer,ngram_range=(1,1),norm= 'l2')
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)

[[0.         0.33999849 0.         0.33999849 0.33999849 0.
  0.         0.         0.21701663 0.33999849 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.26805872 0.         0.         0.33999849
  0.         0.         0.33999849 0.26805872 0.33999849]
 [0.         0.         0.36153669 0.         0.         0.36153669
  0.         0.         0.23076418 0.         0.         0.36153669
  0.         0.36153669 0.         0.         0.36153669 0.
  0.         0.36153669 0.28503968 0.         0.         0.
  0.         0.         0.         0.28503968 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.5        0.         0.         0.5        0.
  0.         0.         0.5        0.5        0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.        ]
 [0.32603555 0.         0.         0.         0.         0.
  0.32603555 0.       

In [50]:
print(tfidf_vectorizer.vocabulary_)

{'team': 23, 'laurate': 10, 'election': 5, 'hearts': 7, 'next': 13, 'spy': 21, 'capt': 1, 'real': 18, 'nobel': 14, 'thriller': 24, 'kohli': 9, 'lok': 11, 'based': 0, 'story': 22, 'exciting': 6, 'indian': 8, 'upon': 25, 'cricket': 3, 'virat': 26, 'world': 28, 'confident': 2, 'raazi': 17, 'says': 20, 'sabha': 19, 'movie': 12, 'win': 27, 'people': 15, 'pm': 16, 'cup': 4}
