In [6]:
import nltk

In [18]:
from nltk.corpus import brown

In [19]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [20]:
data=brown.sents(categories='adventure')

In [21]:
' '.join(data[1])

'He was well rid of her .'

### Bag of Words Pipeline 
    1.Get the Data/Corpus
    2.Tokenisation/Stopword Removal
    3.Stemming/Lemmatization
    4.Building a vocab
    5.Vectorisation
    6.Classification
    

### Tokenisation

In [22]:
# Multi-sentence sample used for sentence tokenisation.
# NOTE: the literal opens with four quote characters, so the string itself
# starts with a stray '"' character (it appears in the first tokenised sentence).
document= """" It was a very pleasent day.
              The weather was cool and there were light showers.
               I went to market to buy some fruits"""
# Single sentence mixing numbers, punctuation and an e-mail address, used for word tokenisation.
sentence="send all 50 documents related to chapter 1, 2 ,3 to parth@apple.com"

In [23]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [24]:
sents=sent_tokenize(document)
print(sents)
print(len(sents))

['" It was a very pleasent day.', 'The weather was cool and there were light showers.', 'I went to market to buy some fruits']
3


In [25]:
sents[1]

'The weather was cool and there were light showers.'

In [26]:
sentence.split()

['send',
 'all',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,',
 '2',
 ',3',
 'to',
 'parth@apple.com']

In [27]:
words=word_tokenize(sentence)

In [28]:
words

['send',
 'all',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1',
 ',',
 '2',
 ',3',
 'to',
 'parth',
 '@',
 'apple.com']

### Stopwords

In [29]:
from nltk.corpus import stopwords

In [30]:
sw=set(stopwords.words('english'))

In [31]:
print(sw)

{"she's", 'can', 'do', 'of', 'off', "mustn't", 'why', 'and', 'in', "aren't", "you'll", 'him', 'here', 'under', 'me', 'hers', 'before', 'there', "didn't", "haven't", 'who', 'wasn', "wasn't", 'did', 'they', 'them', 'as', 'than', 'yourselves', 'has', 'should', 'herself', 'until', 'my', 'isn', 'very', 'were', 'where', 'does', "shouldn't", 'some', 'don', 'yours', 'hadn', 'couldn', 'between', 'aren', 'these', 'be', 'over', 'am', 'no', 'll', "hasn't", 'having', 'each', "it's", 'those', 've', 'had', 'it', 'shouldn', 't', 'most', 'have', 'was', 'doesn', 'we', 'an', 'now', "shan't", "weren't", 'himself', 'their', 'shan', 'from', 'just', 'y', 'he', 'ain', "you'd", 'being', 'i', 'needn', 'm', 'how', "hadn't", 'on', 'theirs', 'during', "won't", 'few', 'at', 'that', 'will', 'about', 'through', 'because', 'out', "wouldn't", "should've", 'o', 'more', 'won', 'which', 'then', 'again', 'after', 'own', 'all', 'yourself', "couldn't", 'you', 'what', "isn't", 'down', 'against', 'weren', 'myself', 'below', 'y

In [20]:
def remove_stopwords(text,stopword):
    """Return the tokens of ``text`` that do not appear in ``stopword``.

    Parameters
    ----------
    text : iterable
        Tokens to filter, e.g. the result of ``str.split()``.
    stopword : container
        Collection of tokens to discard; membership is tested with ``in``.

    Returns
    -------
    list
        The surviving tokens in their original order (duplicates are kept).
    """
    kept = []
    for token in text:
        if token in stopword:
            continue
        kept.append(token)
    return kept


In [21]:
text='i am not bothered about her very much'.split()
useful_text=remove_stopwords(text,sw)
print(useful_text)

['bothered', 'much']


### Tokenisation using Regex (Regular Expressions)

In [22]:
from nltk.tokenize import RegexpTokenizer

In [26]:
tokenizer=RegexpTokenizer('[a-zA-Z@.]+')
useful_text=tokenizer.tokenize(sentence)

In [27]:
useful_text

['send',
 'all',
 'documents',
 'related',
 'to',
 'chapter',
 'to',
 'parth@apple.com']

### Stemming
     Example - jumps,jumping,jumped => jump 
     Preserves the semantics of the sentence without increasing the number of unique words

In [28]:
# Sample passage containing inflected forms ("Foxes", "jumps", "jumping").
# NOTE: the literal opens with four quote characters, so the string itself
# begins with a stray '"'. This also rebinds `text`, which an earlier cell
# used for a list of tokens.
text= """"Foxes love to make jumps.
          The quick fox was seen jumping over a dog from 6ft high wall.
        """

In [29]:
#Snowball stemmer , Porter , Lancaster Stemmer 

from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer


In [33]:
ps=PorterStemmer()

In [34]:
ps.stem('jumping')

'jump'

In [39]:
ps.stem('jumps')

'jump'

In [40]:
ps.stem('lovely')

'love'

In [41]:
ps.stem('loving')

'love'

In [43]:
ss=SnowballStemmer('english')

In [45]:
ss.stem('jumping')

'jump'

### Lemmatization

In [46]:
from nltk.stem import WordNetLemmatizer

In [49]:
wn=WordNetLemmatizer()
# lemmatize() is called without a part-of-speech tag, and the output below
# shows 'jumping' coming back unchanged.
# NOTE(review): presumably the default POS is noun, so passing pos='v' should
# yield 'jump' — confirm against the NLTK documentation.
wn.lemmatize('jumping')

'jumping'

### Building a Vocab & Vectorization

In [50]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
cv=CountVectorizer()

In [53]:
vectorized_corpus=cv.fit_transform(corpus)

In [55]:
vectorized_corpus=vectorized_corpus.toarray()

In [56]:
vectorized_corpus

array([[0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])

In [57]:
vectorized_corpus[0]  #first document 

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 0, 1, 0, 2])

In [59]:
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [60]:
number = vectorized_corpus[2]
number

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [61]:
# Map the count vector of the third document (`number`) back to the vocabulary
# terms that have a non-zero count.
# NOTE(review): `number` is a 1-D array; newer scikit-learn releases may require
# a 2-D input here (e.g. number.reshape(1, -1)) — confirm against the installed version.
cv.inverse_transform(number)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

### Vectorisation with Stopword Removal

In [64]:
#tokenizer=RegexpTokenizer('[a-zA-Z@.]+') already created above in tokenise with regex section

#def remove_stopwords(text,stopword):
    #useful_words=[w for w in text if w not in stopword]
    #return useful_words

def myTokanizer(document):
    """Lower-case ``document``, tokenise it and drop English stop words.

    Relies on three notebook-level names defined in earlier cells: the regex
    ``tokenizer``, the ``remove_stopwords`` helper and the stop-word set ``sw``.

    Note: the tokenizer pattern ``[a-zA-Z@.]+`` keeps '.' inside tokens, so
    sentence-final words come back with a trailing period (e.g. 'capt.',
    'people.'), as the vectoriser output further down shows.

    Parameters
    ----------
    document : str
        Raw text of a single document.

    Returns
    -------
    list
        Lower-cased tokens with stop words removed, in document order.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter with the shared stop-word set built in the stopwords section.
    return remove_stopwords(tokens, sw)
    


In [65]:
myTokanizer("this is some function")

['function']

In [67]:
cv=CountVectorizer(tokenizer=myTokanizer)

In [70]:
vectorised_corpus=cv.fit_transform(corpus).toarray()

In [71]:
vectorised_corpus

array([[0, 1, 0, 1, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 2],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0]])

In [72]:
cv.inverse_transform(vectorised_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [73]:
# For Test Data
test_corpus = [
        'Indian cricket rock !',        
]

In [74]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [75]:
# Call fit_transform on the training data and transform on the test data, so the test data is mapped onto the vocabulary learned from training

### More ways to Create Features

           1.Unigram - every word as a feature
           2.Bigrams
           3.Trigrams
           4.n-grams
           5.TF-IDF Normalisation

In [76]:
sent_1  = ["this is good movie"]
sent_2 = ["this is good movie but actor is not present"]
sent_3 = ["this is not good movie"]



In [80]:
cv = CountVectorizer(ngram_range=(1,3))

In [81]:
docs = [sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
        1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]])

In [82]:
cv.vocabulary_

{'this': 20,
 'is': 9,
 'good': 6,
 'movie': 14,
 'this is': 21,
 'is good': 10,
 'good movie': 7,
 'this is good': 22,
 'is good movie': 11,
 'but': 3,
 'actor': 0,
 'not': 17,
 'present': 19,
 'movie but': 15,
 'but actor': 4,
 'actor is': 1,
 'is not': 12,
 'not present': 18,
 'good movie but': 8,
 'movie but actor': 16,
 'but actor is': 5,
 'actor is not': 2,
 'is not present': 13}

# Tf-idf Normalisation

     1.Avoid features that occur very often, because they contain less information
     2.Information decreases as the number of occurrences increases across different types of documents
     3.So we define another term - term frequency-inverse document frequency (tf-idf) - which associates a weight with every term
     
     tf = term frequency 
     idf = inverse document frequency



In [83]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [85]:
tfidf = TfidfVectorizer()

In [86]:
vc = tfidf.fit_transform(corpus).toarray()


In [87]:
vc   #tfidf features 

array([[0.46333427, 0.59662724, 0.46333427, 0.        , 0.46333427,
        0.        ],
       [0.41285857, 0.        , 0.41285857, 0.        , 0.41285857,
        0.69903033],
       [0.3645444 , 0.46941728, 0.3645444 , 0.61722732, 0.3645444 ,
        0.        ]])

In [88]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}

In [None]:
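# 'not' is at index 3; its weight in vc (0.617 in the third row) is the highest in that row because 'not' occurs in only one of the three documents, so its idf is the largest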