In [1]:
import nltk
from nltk.corpus import brown
print (brown.categories())
data = brown.sents(categories = "news")
print(' '.join(data[0]))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place .


# Tokenization

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srish\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
document = """
We are having fun at major league hacking local hack day build! 
feel free to join the discord channel. also share about the session on social media.
"""

sentence = "send all the documents related to chapter 1,2,3,4 to kunal@xyz.com"
sents = sent_tokenize(document)
sents

['\nWe are having fun at major league hacking local hack day build!',
 'feel free to join the discord channel.',
 'also share about the session on social media.']

In [4]:
words = word_tokenize(sentence)
words

['send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3,4',
 'to',
 'kunal',
 '@',
 'xyz.com']

# Stopword Removal

In [5]:
from nltk.corpus import stopwords

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
sw = set(stopwords.words("english"))

In [8]:
def remove_stopwords(text, stopwords):
    useful_words = [word for word in text if word not in stopwords]
    return useful_words

In [9]:
remove_stopwords(sentence.split(), sw)

['send', 'documents', 'related', 'chapter', '1,2,3,4', 'kunal@xyz.com']

In [10]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)
print(useful_text)

['send', 'all', 'the', 'documents', 'related', 'to', 'chapter', 'to', 'kunal@xyz.com']


# Stemming

In [11]:
document = """
I like this session and I liked all the sessions. I am liking everything. 
We are having fun at major league hacking local hack day build! 
feel free to join the discord channel. also share about the session on social media.
"""

from nltk.stem import PorterStemmer

In [12]:
ps = PorterStemmer()

In [13]:
ps.stem('liking')

'like'

In [14]:
corpus = [
    'Indian cricket team will win World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win next Lok Sabha Elections, says confident Indian PM',
    'The nobel laurate won the hearts of the people',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [15]:
pip install -U scikit-learn scipy matplotlib

Note: you may need to restart the kernel to use updated packages.


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()

In [18]:
vectorized_corpus = cv.fit_transform(corpus)
vectorized_corpus = vectorized_corpus.toarray()
print(vectorized_corpus[0])

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 1 0 2]


In [19]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'win': 38, 'world': 40, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 39, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


# Vectorisation with stopword removal

In [20]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    words = remove_stopwords(words, sw)
    return words

In [21]:
myTokenizer('this is some function')

['function']

In [22]:
cv = CountVectorizer(tokenizer = myTokenizer)

In [23]:
vectorizedCorpus = cv.fit_transform(corpus).toarray()

In [24]:
print(len(vectorizedCorpus[0]))

32


In [25]:
print(cv.vocabulary_)

{'indian': 9, 'cricket': 3, 'team': 26, 'win': 30, 'world': 31, 'cup': 4, 'says': 22, 'capt.': 1, 'virat': 29, 'kohli.': 10, 'held': 8, 'sri': 24, 'lanka.': 11, 'next': 15, 'lok': 13, 'sabha': 21, 'elections': 5, 'confident': 2, 'pm': 18, 'nobel': 16, 'laurate': 12, 'hearts': 7, 'people': 17, 'movie': 14, 'raazi': 19, 'exciting': 6, 'spy': 23, 'thriller': 27, 'based': 0, 'upon': 28, 'real': 20, 'story': 25}
