In [3]:
from nltk.corpus import brown

In [4]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [5]:
data = brown.sents(categories = 'editorial')[:]
print(data)

[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]


# Basic NLP pipeline
- Data collection
- Tokenization, Stopword and Stemming
- Building a common Vocab
- Vectorizing the Documents
- Performing Classification/Clustering

## Tokenization

In [21]:
text = 'It was very pleasent day, the weather was cool and there were light showers. I went to market to buy some fruits.'
print(text)

It was very pleasent day, the weather was cool and there were light showers. I went to market to buy some fruits.


In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [8]:
sents = sent_tokenize(text)
print(sents)

['It was very pleasent day, the weather was cool and there were light showers.', 'I went to market to buy some fruits.']


In [9]:
words = word_tokenize(sents[0])
print(words)

['It', 'was', 'very', 'pleasent', 'day', ',', 'the', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.']


## Stopward Removal

In [10]:
from nltk.corpus import stopwords

In [11]:
sw = set(stopwords.words('english'))
print(sw)

{'the', 'or', 'below', 'theirs', 'hasn', 'has', "didn't", 'between', 'll', 'no', 'under', 'those', 'had', 'on', 'yourself', 'if', 'in', 'from', 'isn', 'do', 'i', 'themselves', 'such', 'ourselves', 'here', 'itself', 'are', 'each', 'o', 'which', "needn't", "haven't", 'to', 'didn', "that'll", 'myself', 'her', 'once', 'at', 'our', 'have', 'own', 'ma', 'mustn', 'needn', 'yourselves', 'any', 'having', 'this', 'only', "hadn't", 'mightn', 'all', 'my', 'how', 'as', 'you', 'was', 'most', 'will', "you'll", 'herself', "doesn't", "shan't", 'before', 're', 'me', 'while', 'ain', 'both', 'these', 'm', 'out', "hasn't", 'then', 'further', "don't", 'with', 'shan', 'wasn', 't', 'aren', 'and', 'other', 'yours', 'being', 'more', 'after', 'now', 's', 'your', 'of', 'against', 'by', "you'd", 'won', 'it', 'am', 'off', "wouldn't", "aren't", 'weren', 'were', 'don', "it's", 'when', 'why', 'be', 'doesn', 'doing', 'too', "mustn't", 'their', 'for', 'd', 'an', 'who', "shouldn't", 'ours', 'hadn', "you've", 'over', 'is'

In [12]:
useful_words = [w for w in words if w not in sw]
print(useful_words)

['It', 'pleasent', 'day', ',', 'weather', 'cool', 'light', 'showers', '.']


## Tokenization using regular expression
- Problem with tokenizer- Can't handle complex tokenizations. So we use regexp Tokenizer class in NLTK. 

In [13]:
from nltk.tokenize import RegexpTokenizer

In [30]:
tokenizer = RegexpTokenizer('[a-zA-Z@]+')

In [31]:
text2 = "Hi! My contact number is 0000000000 and mail id is abc@gmail.com"
print(tokenizer.tokenize(text2))

['Hi', 'My', 'contact', 'number', 'is', 'and', 'mail', 'id', 'is', 'abc@gmail', 'com']


## Stemming
- Process that transforms particular words(verbs, plurals) into their regular form.
- Preserve the semanticsof the sentance without increasing the number of unique tokens.
- jumps, jumping, jumps => jump

In [39]:
text3 = "Fox loves to make jump. the quick brown fox was jumping over the lovely dog from 6ft"
word_list = tokenizer.tokenize(text3)
print(word_list)

['Fox', 'loves', 'to', 'make', 'jump', 'the', 'quick', 'brown', 'fox', 'was', 'jumping', 'over', 'the', 'lovely', 'dog', 'from', 'ft']


In [40]:
word_list = [w for w in word_list if w not in sw]
print(word_list)

['Fox', 'loves', 'make', 'jump', 'quick', 'brown', 'fox', 'jumping', 'lovely', 'dog', 'ft']


### Stemming methods
- Snowball Stemmer(multilingual)
- Porter Stemmer
- Lancaster Stemmer

In [50]:
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [44]:
ps = PorterStemmer()
ps.stem('jumps')

'jump'

In [48]:
ls = LancasterStemmer()
ls.stem('teeth')

'tee'

In [51]:
ss = SnowballStemmer('english')
ss.stem('jumped')

'jump'

## Another method for stemming called lemmatization

In [1]:
from nltk.stem import WordNetLemmatizer

In [3]:
l = WordNetLemmatizer()
l.lemmatize('crying')

'cry'

## Building Common Vocabulary and Vectorizing Documents (based upon Bag of Words Model)

In [54]:
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
cv = CountVectorizer()
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [63]:
print(vectorized_corpus)
print(len(vectorized_corpus[0]))

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]
42


In [65]:
print(cv.vocabulary_) #Dictionary - Word -> Index

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}
