In [3]:
import nltk

In [4]:
from nltk.corpus import brown

In [5]:
# List the category labels available in the Brown corpus.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [6]:
# Load the sentences of the 'adventure' category; each sentence is a list of word tokens.
data = brown.sents(categories='adventure')
# Show the number of sentences followed by a (truncated) preview of them.
print(len(data), data)

4637 [['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]


In [7]:
### Bag-of-words pipeline
# 1. Get the data / corpus
# 2. Tokenisation and stopword removal
# 3. Stemming
# 4. Building a vocabulary
# 5. Vectorization
# 6. Classification

In [8]:
## tokenization and stopwords 

In [9]:
# Multi-sentence sample text used to demonstrate sentence tokenisation.
document = """It was a very pleasant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

# Single sentence containing digits, a comma-separated list and an e-mail address,
# used to compare the different word tokenisers.
sentence = "Send all the 50 documents related to chapters 1,2,3 at preteek@cb.com"

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
# Split the sample document into sentences and show how many were found.
sents = sent_tokenize(document)
print(len(sents), sents)

3 ['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [12]:
# Naive whitespace split: keeps 'preteek@cb.com' together as a single token.
words = sentence.split()
print(words)
# NLTK's word_tokenize: breaks the e-mail address into 'preteek', '@', 'cb.com'.
words = word_tokenize(sentence)
print(words)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'preteek@cb.com']
['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'preteek', '@', 'cb.com']


In [13]:
## stopwords removal

In [14]:
from nltk.corpus import stopwords

In [15]:
# English stopword list from NLTK, stored as a set so membership checks are cheap.
sw = set(stopwords.words('english'))

# Print the whole set to inspect which words will be filtered out.
print(sw)

{'wouldn', 'while', "mustn't", 'under', 'ma', 'have', 'itself', 'when', 'yourselves', 'ours', "weren't", "should've", 'this', 'didn', 'she', 'each', "you'll", 'themselves', "needn't", 'because', 'has', 'an', 're', 'hasn', 'up', 'most', 'don', 'these', 'into', 'i', 'that', "won't", 'was', 'no', 'll', "wouldn't", "aren't", 'will', 'its', 'down', 'are', 'myself', 'her', 'of', 'any', 'out', 'isn', 'after', 'then', 'be', 'as', 'further', 'is', 'it', "didn't", 'for', 'the', 'a', 'what', "mightn't", 'haven', 'won', 'all', 'where', 'on', 'ain', "don't", "she's", 'herself', 'about', 'until', 'with', 'o', 'mightn', 'again', 'him', 'were', 've', 'off', 'own', 'some', 'nor', 'if', "couldn't", "isn't", 'which', 'such', 'we', 'am', 'couldn', "hadn't", 'during', 'm', 'more', 'ourselves', 'too', 'who', 'or', 'your', 'yours', 'only', 'can', 'weren', 'had', "that'll", 'both', "shan't", 'our', 's', 'those', 'aren', 'before', 'few', 'there', "shouldn't", "hasn't", 'whom', 'his', "you're", 'yourself', 'was

In [16]:
def remove_stopwords(text, stopwords):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: Iterable of word tokens.
        stopwords: Container of words to drop; membership is tested with ``in``.
            The comparison is exact, so it is case-sensitive.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    kept = []
    for token in text:
        if token in stopwords:
            # Stopword: leave it out of the result.
            continue
        kept.append(token)
    return kept

In [17]:
# Whitespace-tokenise a lowercase example sentence and filter it against the stopword set.
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text, sw)
# Note: 'not' is itself a stopword, so the negation is lost in the filtered result.
print(useful_text)

['bothered', 'much']


In [18]:
## tokenization using regex

In [19]:
from nltk.tokenize import RegexpTokenizer

In [20]:
# Keep only runs of ASCII letters, '@' and '.': digits and commas are dropped,
# while the e-mail address survives as one token.
# NOTE(review): because '.' is part of the character class, a sentence-final period
# stays attached to the preceding word (e.g. 'capt.', 'kohli.') - confirm this is intended.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)
print(useful_text)

['Send', 'all', 'the', 'documents', 'related', 'to', 'chapters', 'at', 'preteek@cb.com']


In [21]:
## stemming and lemmatization

In [22]:
# Sample text for the stemming / lemmatization section (rebinds `text`, which held a token list above).
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the 
lovely dog from a 6ft feet high wall"""

In [23]:
# Available stemmers: Snowball (multilingual), Porter and Lancaster
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [24]:
# Create the stemmer objects: a Snowball stemmer configured for English, and a Porter stemmer.
ss = SnowballStemmer('english')
ps = PorterStemmer()

In [25]:
# Stem an inflected verb form with the Porter stemmer.
ps.stem('jumped')

'jump'

In [26]:
# lemmatization
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()
# lemmatize() assumes pos='n' (noun) by default, under which 'jumping' is returned
# unchanged; tag the word as a verb so it is reduced to its base form.
wordnet.lemmatize('jumping', pos='v')

'jump'

In [27]:
### vectorization

In [28]:
# Sample corpus - contains 4 documents; each document can have 1 or more sentences.
# It is the corpus the count-vectorizer cells below are fitted on.
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Count vectorizer with default settings (no custom tokenizer, no stopword removal).
cv = CountVectorizer()

In [31]:
# Learn the vocabulary from the corpus and encode each document as a vector of term counts.
vectorizedCorpus = cv.fit_transform(corpus)
# Convert to a dense array for printing: one row per document, one column per vocabulary term.
print(vectorizedCorpus.toarray())

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]


In [32]:
# Mapping from each learned term to its column index in the count matrix.
cv.vocabulary_

{'indian': 12,
 'cricket': 6,
 'team': 31,
 'will': 37,
 'wins': 39,
 'world': 41,
 'cup': 7,
 'says': 27,
 'capt': 4,
 'virat': 35,
 'kohli': 14,
 'be': 3,
 'held': 11,
 'at': 1,
 'sri': 29,
 'lanka': 15,
 'we': 36,
 'win': 38,
 'next': 19,
 'lok': 17,
 'sabha': 26,
 'elections': 8,
 'confident': 5,
 'pm': 23,
 'the': 32,
 'nobel': 20,
 'laurate': 16,
 'won': 40,
 'hearts': 10,
 'of': 21,
 'people': 22,
 'movie': 18,
 'raazi': 24,
 'is': 13,
 'an': 0,
 'exciting': 9,
 'spy': 28,
 'thriller': 33,
 'based': 2,
 'upon': 34,
 'real': 25,
 'story': 30}

In [33]:
# Map the first document's vector back to the terms it contains.
cv.inverse_transform(vectorizedCorpus[0])

[array(['indian', 'cricket', 'team', 'will', 'wins', 'world', 'cup',
        'says', 'capt', 'virat', 'kohli', 'be', 'held', 'at', 'sri',
        'lanka'], dtype='<U9')]

In [34]:
## vectorizer with stopwords removal

In [36]:
def myTokenizer(document):
    """Tokenise ``document`` for the vectorizer and drop stopwords.

    The text is lowercased, split with the notebook-level regex ``tokenizer``,
    and then filtered against the notebook-level stopword set ``sw``.

    Args:
        document: Raw text of one document.

    Returns:
        List of lowercase tokens with stopwords removed.
    """
    lowered_tokens = tokenizer.tokenize(document.lower())
    return remove_stopwords(lowered_tokens, sw)

In [37]:
# Try the custom tokenizer on the sample sentence.
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'preteek@cb.com']

In [38]:
# Rebind `cv` to a vectorizer that uses the custom tokenizer (regex tokens + stopword removal).
cv = CountVectorizer(tokenizer=myTokenizer)

In [41]:
# Refit on the corpus with the custom tokenizer; this time keep the dense array itself.
vectorizedCorpus = cv.fit_transform(corpus).toarray()
# Print the number of features (columns) followed by the count matrix.
print(len(vectorizedCorpus[0]), vectorizedCorpus)

33 [[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [42]:
# Map every document vector back to its terms: one array of terms per document.
cv.inverse_transform(vectorizedCorpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people.'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story.', 'thriller', 'upon'], dtype='<U9')]

In [43]:
##for test data

In [44]:
# Unseen document used to show how new text is encoded with the already-fitted vectorizer.
test_corpus = [
        'Indian cricket team rocks'
]

In [46]:
## Use transform(), not fit_transform(): calling fit_transform here would overwrite the train vocab.
# Words that are not in the fitted vocabulary (here 'rocks') get no column in the result.
cv.transform(test_corpus).toarray()


array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

In [47]:
#### more ways to create features
# Unigram - every word as a feature
# Bigrams
# Trigrams
# n-grams
# TF-IDF Normalisation

In [53]:
# Example sentences for the n-gram demo; each one is wrapped in a single-element list.
sent1 = ["this is good movie"]
sent2 = ["this is not good movie"]
sent3 = ["this is good movie but actor is not present"]

In [63]:
# Rebind `cv` to a vectorizer that extracts unigrams, bigrams and trigrams as features.
cv = CountVectorizer(ngram_range=(1,3))

In [64]:
# Unwrap the first two sentences from their lists to form a two-document corpus.
docs = [sent1[0], sent2[0]]
# Fit the n-gram vectorizer and show the dense count matrix.
cv.fit_transform(docs).toarray()

array([[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]])

In [65]:
# Inspect the learned n-gram features and their column indices.
cv.vocabulary_

{'this': 11,
 'is': 2,
 'good': 0,
 'movie': 7,
 'this is': 12,
 'is good': 3,
 'good movie': 1,
 'this is good': 13,
 'is good movie': 4,
 'not': 8,
 'is not': 5,
 'not good': 9,
 'this is not': 14,
 'is not good': 6,
 'not good movie': 10}

In [67]:
## TF-IDF normalisation (tf = term frequency, idf = inverse document frequency)
# Avoid giving much weight to features that occur very often, because they carry less information.
# The information a term carries decreases as the number of documents it occurs in increases.
# So we define another quantity - the (inverse) document frequency - which associates a weight with every term.

In [71]:
# Rebind the sentence names to plain strings (they were single-element lists above).
sent1 = "this is good movie"
sent2 = "this was good movie"
sent3 = "this is not good movie"

# Three-document corpus for the TF-IDF demo; this replaces the earlier four-document `corpus`.
corpus = [sent1, sent2, sent3]

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

In [75]:
# Fit on the three sentences and keep the dense matrix of TF-IDF weights.
vc = tfidf.fit_transform(corpus).toarray()

In [77]:
# Show the weight matrix; terms present in every document ('good', 'movie', 'this')
# receive the lowest non-zero weight in each row.
print(vc)
# Term -> column index mapping, needed to read the matrix columns.
print(tfidf.vocabulary_)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]
{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}
