In [1]:
import nltk

In [2]:
from nltk.corpus import brown
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\archi\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [5]:
data=brown.sents(categories=["adventure"])

In [6]:
len(data)

4637

In [7]:
data


[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [8]:
" ".join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# PIPELINE: converting the text into numeric data

# Tokenization

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
document = """ It was a very good movie. The cast was amazing and I liked the story.
I went to the movie hall to see it.
"""

sentence = "Code for Cause is too OP kunal@codeforcause.org"

In [11]:
nltk.download('punkt')
sents = sent_tokenize(document)
print(sents)
len(sents)

[' It was a very good movie.', 'The cast was amazing and I liked the story.', 'I went to the movie hall to see it.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\archi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3

In [12]:
words = word_tokenize(sentence) # also break down special characters
print(words)
print(len(words))

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal', '@', 'codeforcause.org']
9


# Stopword Removal

In [13]:
from nltk.corpus import stopwords

In [14]:
nltk.download("stopwords")
sw = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\archi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
sw

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [16]:
text = "i am not a very good cricket player".split()
print(text)

['i', 'am', 'not', 'a', 'very', 'good', 'cricket', 'player']


In [17]:
def remove_stoprwords(text, stopwords):
    useful = [w for w in text if w not in stopwords]
    return useful

In [18]:
useful_words = remove_stoprwords(text, sw)

In [19]:
useful_words

['good', 'cricket', 'player']

In [20]:
# tokenization using regex

In [21]:
sent = "My email is kunal@codeforcause.org, please don't spam my inbox"

In [22]:
from nltk.tokenize import RegexpTokenizer

In [23]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful = tokenizer.tokenize(sentence)
print(useful)

['Code', 'for', 'Cause', 'is', 'too', 'OP', 'kunal@codeforcause.org']


# Stemming

In [24]:
# nltk provides us: porter, snowball, lancaster stemmer

In [25]:
from nltk.stem import SnowballStemmer, PorterStemmer, LancasterStemmer

In [26]:
ps = PorterStemmer()

In [27]:
ps.stem('laughing')

'laugh'

In [28]:
# SnowballStemmer = Multilingul, supports other langs also. 

In [29]:
corpus = [
    'Dan Morgan told himself he would forget Ann Turner.',
    'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
    'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
    'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .'
]

# creating vocab

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer()

In [32]:
vc = cv.fit_transform(corpus)

In [33]:
print(vc[0])

  (0, 9)	1
  (0, 27)	1
  (0, 47)	1
  (0, 21)	1
  (0, 19)	1
  (0, 54)	1
  (0, 15)	1
  (0, 1)	1
  (0, 49)	1


In [34]:
vc = vc.toarray()
print(vc)
print(cv.vocabulary_)

[[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1]
 [1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 2 0 0
  0 1 0 1 0 0 2 1 1 0 1 0 0 0 1 0 0 1 0]
 [2 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 2 2 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 1
  0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 0 0 4 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
  1 1 0 0 1 1 0 0 0 1 2 0 1 0 0 2 1 0 0]]
{'dan': 9, 'morgan': 27, 'told': 47, 'himself': 21, 'he': 19, 'would': 54, 'forget': 15, 'ann': 1, 'turner': 49, 'sometimes': 39, 'woke': 53, 'up': 50, 'in': 24, 'the': 42, 'middle': 26, 'of': 33, 'night': 29, 'thinking': 44, 'and': 0, 'then': 43, 'could': 8, 'not': 30, 'get': 17, 'back': 6, 'to': 46, 'sleep': 37, 'his': 22, 'plans': 34, 'dreams': 10, 'had': 18, 'revolved': 35, 'around': 2, 'her': 20, 'so': 38, 'much': 28, 'for': 14, 'long': 25, 'that': 41, 'now': 32, 'felt': 13, 'as': 3, 'if': 23, 'nothing': 31, 'found': 16, 'was': 

In [35]:
print(len(cv.vocabulary_))

55


In [36]:
numbers = vc[2]

In [37]:
numbers

array([2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 2, 2, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [38]:
len(vc[1])

55

In [39]:
def myTokenizer(document):
    words = tokenizer.tokenize(document.lower())
    # remove the stopwords
    words = remove_stoprwords(words, sw)
    return words

In [40]:
myTokenizer('this is a random text')

['random', 'text']

In [41]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [42]:
vc = cv.fit_transform(corpus).toarray()

In [43]:
print(vc)

[[0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0]
 [1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0]]


In [44]:
len(vc[0])

33

In [45]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}

In [46]:
len(cv.transform([sent]).toarray()[0])

33

In [47]:
cv.vocabulary_

{'dan': 6,
 'morgan': 16,
 'told': 28,
 'would': 32,
 'forget': 11,
 'ann': 1,
 'turner.': 29,
 'sometimes': 24,
 'woke': 31,
 'middle': 15,
 'night': 18,
 'thinking': 26,
 'could': 5,
 'get': 13,
 'back': 4,
 'sleep': 23,
 '.': 0,
 'plans': 20,
 'dreams': 7,
 'revolved': 21,
 'around': 2,
 'much': 17,
 'long': 14,
 'felt': 10,
 'nothing': 19,
 'found': 12,
 'tired': 27,
 'enough': 8,
 'went': 30,
 'simply': 22,
 'exhausted': 9,
 'stay': 25,
 'awake': 3}