In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
Collecting regex
  Downloading regex-2021.7.6-cp36-cp36m-win_amd64.whl (270 kB)
Collecting tqdm
  Downloading tqdm-4.61.2-py2.py3-none-any.whl (76 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.6.2 regex-2021.7.6 tqdm-4.61.2
You should consider upgrading via the 'c:\users\manu\appdata\local\programs\python\python36\python.exe -m pip install --upgrade pip' command.


In [2]:
import nltk

In [22]:
# A bare nltk.download() opens the interactive downloader, which stalls an
# unattended "Restart & Run All". Request only the resources used below:
# the Brown corpus, the stopword list, the punkt models behind
# word_tokenize/sent_tokenize, and WordNet for the lemmatizer.
NLTK_RESOURCES = ("brown", "stopwords", "punkt", "wordnet")
# A list (not a generator) so every resource is attempted even if one fails;
# the cell evaluates to True only when all downloads succeeded.
all([nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Data Collection

In [3]:
from nltk.corpus import brown

In [4]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [5]:
# Sentences of the Brown "news" category. Each sentence is a sequence of word
# tokens (punctuation included), as the join/len cells below rely on.
data = brown.sents(categories="news")

In [6]:
len(data)

4623

In [7]:
" ".join(data[0])

"The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced `` no evidence '' that any irregularities took place ."

In [8]:
# Total number of tokens across all news sentences, summed sentence by
# sentence; the flat word list loaded further down reports the same figure.
c = sum(len(sent) for sent in data)

print(c)

100554


In [9]:
words = brown.words(categories="news")

In [10]:
len(words)

100554

### Removal of Stopwords

In [11]:
from nltk.corpus import stopwords

In [12]:
stop_words = stopwords.words('english')

In [13]:
len(stop_words)

179

In [14]:
# Drop English stopwords from every sentence. The comparison is done on the
# lower-cased token, but surviving tokens keep their original casing.
# `stop_words` is a list, so testing against it directly rescans it for every
# token in the corpus; a set built once gives constant-time membership tests.
stop_word_set = set(stop_words)
preprocess_data = [
    [word for word in sent if word.lower() not in stop_word_set]
    for sent in data
]

In [15]:
" ".join(preprocess_data[0])

"Fulton County Grand Jury said Friday investigation Atlanta's recent primary election produced `` evidence '' irregularities took place ."

### Regex Tokenization (Data Cleaning)

In [16]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

In [17]:
"Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer".split(" ")

['Return',
 'a',
 'sentence-tokenized',
 'copy',
 'of',
 '*text*,using',
 "NLTK's",
 'recommended',
 'sentence',
 'tokenizer']

In [18]:
word_tokenize("Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer.")

['Return',
 'a',
 'sentence-tokenized',
 'copy',
 'of',
 '*',
 'text',
 '*',
 ',',
 'using',
 'NLTK',
 "'s",
 'recommended',
 'sentence',
 'tokenizer',
 '.']

In [19]:
sent_tokenize("Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer. Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer.")

["Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer.",
 "Return a sentence-tokenized copy of *text*,using NLTK's recommended sentence tokenizer."]

In [20]:
sentence = "my name is manu my email id is manupillai308@gmail.com 99829991029 prime minister's home okay bye"

In [21]:
# Tokenizer that keeps only runs of ASCII letters. Digits and punctuation are
# discarded, and a token is split wherever a non-letter occurs (for example
# "minister's" becomes "minister" and "s" in the demo output below).
regx = RegexpTokenizer(pattern="[a-zA-Z]+")

In [22]:
regx.tokenize(sentence)

['my',
 'name',
 'is',
 'manu',
 'my',
 'email',
 'id',
 'is',
 'manupillai',
 'gmail',
 'com',
 'prime',
 'minister',
 's',
 'home',
 'okay',
 'bye']

In [23]:
regx.tokenize(" ".join(preprocess_data[0]))

['Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'investigation',
 'Atlanta',
 's',
 'recent',
 'primary',
 'election',
 'produced',
 'evidence',
 'irregularities',
 'took',
 'place']

In [24]:
# Re-tokenise every sentence with the letters-only pattern: glue its tokens
# back into one string, then let `regx` split that string again. Each entry of
# `preprocess_data` is overwritten in place, so run this cell once per pass.
# Single-letter fragments such as the "s" from "Atlanta's" only appear at this
# point, after the stopword pass, so that pass does not remove them.
for i, sent_tokens in enumerate(preprocess_data):
    preprocess_data[i] = regx.tokenize(" ".join(sent_tokens))

In [25]:
preprocess_data[0]

['Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'investigation',
 'Atlanta',
 's',
 'recent',
 'primary',
 'election',
 'produced',
 'evidence',
 'irregularities',
 'took',
 'place']

### Stemming & Lemmatization


In [26]:
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer  

In [27]:
from nltk import stem

In [28]:
stemmer = PorterStemmer()

In [29]:
stemmer.stem("establishment")

'establish'

In [30]:
lemma = WordNetLemmatizer()

In [31]:
lemma.lemmatize("establishment")

'establishment'

In [32]:
# Replace every token with its Porter stem, sentence by sentence. Entries of
# `preprocess_data` are overwritten in place, so this cell is meant to run
# once on the regex-tokenised sentences.
for i, sent_tokens in enumerate(preprocess_data):
    preprocess_data[i] = [stemmer.stem(token) for token in sent_tokens]

In [33]:
preprocess_data[0]

['fulton',
 'counti',
 'grand',
 'juri',
 'said',
 'friday',
 'investig',
 'atlanta',
 's',
 'recent',
 'primari',
 'elect',
 'produc',
 'evid',
 'irregular',
 'took',
 'place']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [35]:
# Collapse each stemmed token list into one space-separated string, the input
# form the vectorizers below are fitted on. Entries that are already strings
# are left untouched: joining a str would put a space between every character,
# so without this guard re-running the cell corrupts the corpus.
for i, sent in enumerate(preprocess_data):
    if not isinstance(sent, str):
        preprocess_data[i] = " ".join(sent)

In [37]:
# Bag-of-words counts over unigrams and bigrams, with the vocabulary capped at
# 2000 entries; binary=False keeps raw occurrence counts rather than 0/1 flags.
c_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=2000,
    binary=False,
)

In [38]:
c_vectorizer.fit(preprocess_data)

CountVectorizer(max_features=2000, ngram_range=(1, 2))

In [39]:
len(c_vectorizer.vocabulary_)

2000

In [41]:
# c_vectorizer.vocabulary_

In [42]:
# Encode every sentence as a row of term counts and densify the sparse result.
# NOTE(review): .todense() yields a numpy.matrix rather than an ndarray; the
# modelling cells are not visible here, so confirm they accept that type (or
# switch to .toarray()).
x_data_count = c_vectorizer.transform(preprocess_data).todense()

In [43]:
x_data_count.shape

(4623, 2000)

In [44]:
x_data_count.max()

16

In [47]:
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2))

In [48]:
# Fit the TF-IDF vocabulary and encode the corpus in one step, then densify.
# NOTE(review): .todense() yields a numpy.matrix rather than an ndarray; the
# modelling cells are not visible here, so confirm they accept that type (or
# switch to .toarray()).
x_data_tfidf = tfidf_vectorizer.fit_transform(preprocess_data).todense()

In [None]:
x_data_tfidf[0].max()

### Modelling