# `Bag of words using Naive Bayes`
- Uses Bayes theorem

#### [Deep Learning Krish naik playlist](https://www.youtube.com/playlist?list=PLZoTAELRMXVPGU70ZGsckrMdr0FteeRUi)
#### [Flutter](https://flutter.dev/)

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer  # help us to count the tokens
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, precision_score
from nltk.corpus import stopwords
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer

In [2]:
# Toy corpus: three short sentences used to illustrate the bag-of-words model.
phrases = [
    'I want to learn machine learning',
    'I am working at innomatics',
    'I am learning machine learning at innomatics',
]
# Learn the vocabulary from the toy corpus. fit() hands back the fitted
# vectorizer, which is echoed as the cell output.
vect = CountVectorizer().fit(phrases)
vect

CountVectorizer()

In [3]:
# Number of distinct tokens learned by the vectorizer.
print(f'vocab size = {len(vect.vocabulary_)}')

vocab size = 9


In [4]:
# Show the learned token -> column-index mapping.
print('vocabulary =',vect.vocabulary_)

vocabulary = {'want': 7, 'to': 6, 'learn': 3, 'machine': 5, 'learning': 4, 'am': 0, 'working': 8, 'at': 1, 'innomatics': 2}


### `vocabulary_` maps each token to its column index (tokens are indexed in alphabetical order)

In [5]:
# Encode each phrase as token counts over the fitted vocabulary.
bag_of_words = vect.transform(phrases)
# Printing lists the stored (phrase index, vocabulary index) entries with their counts.
print(bag_of_words)

  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (1, 8)	1
  (2, 0)	1
  (2, 1)	1
  (2, 2)	1
  (2, 4)	2
  (2, 5)	1


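### Each entry is now the frequency of a token in that phrase
### Note: CountVectorizer does not remove stop words by default ('to', 'am' and 'at' are still in the vocabulary); its default token pattern only drops single-character tokens such as "I". Pass `stop_words` to remove stop words.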

In [6]:
# Dense view of the counts: one row per phrase, one column per vocabulary entry.
print(bag_of_words.toarray())

[[0 0 0 1 1 1 1 1 0]
 [1 1 1 0 0 0 0 0 1]
 [1 1 1 0 2 1 0 0 0]]


## `apply same thing on dataset`

In [7]:
# Load the first 25000 reviews of the IMDB dataset.
# - The raw string keeps the Windows backslashes literal; the plain string
#   relied on invalid escape sequences such as "\I", which newer Python
#   versions warn about.
# - nrows reads only the rows that are kept, so `data` is a standalone frame
#   rather than a slice of a larger one (avoids chained-assignment warnings
#   when a column is overwritten later).
data = pd.read_csv(r'E:\INNO\IMDB Dataset.csv', nrows=25000)

In [8]:
from sklearn.preprocessing import LabelBinarizer
# Replace the sentiment labels with their binarized encoding (the column
# holds 0/1 afterwards, as the following cells show).
# NOTE(review): presumably 'negative' -> 0 and 'positive' -> 1; verify with le.classes_.
le = LabelBinarizer()
data['sentiment'] = le.fit_transform(data['sentiment'])

In [9]:
# Preview the first rows of `data`.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
# (rows, columns) of the dataset.
data.shape

(25000, 2)

# `Preprocessing`

In [11]:
# Number of rows (reviews); use the len() builtin rather than calling the
# dunder method directly.
len(data)

25000

In [12]:
# Class balance: number of reviews per sentiment label.
data['sentiment'].value_counts()

0    12526
1    12474
Name: sentiment, dtype: int64

### Tokenization: break each review down into tokens

In [None]:
# Token count of every raw review, in dataset order. Iterating the column
# directly avoids index-based lookups and the explicit dunder call.
words_count = [len(nltk.word_tokenize(review)) for review in data['review']]

In [None]:
# Show the per-review token counts.
print(words_count)

### number of reviews counted (one word count per review)

In [None]:
# Number of entries in words_count (one per review); len() is the idiomatic
# spelling of the direct __len__() call.
len(words_count)

### total number of words

In [None]:
# Total number of tokens across all reviews.
sum(words_count)

### sample review

In [None]:
# Raw text of the review with index label 0 (before any cleaning).
data['review'][0]

In [None]:
# Fetch the NLTK stop-word corpus read by stopwords.words('english').
# NOTE(review): nltk.word_tokenize and WordNetLemmatizer, used elsewhere in
# this notebook, normally need their own NLTK resources (e.g. 'punkt',
# 'wordnet') — confirm they are installed.
nltk.download('stopwords')

In [None]:
# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# len() replaces the direct __len__() call; the printed value is the same.
print(len(stop_words))
print(stop_words)

### remove stopwords
### regex is for http links

In [None]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; cleaning_review() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [None]:
def cleaning_review(data):
    """Normalise every review in ``data['review']`` into a cleaned string.

    Each review is lower-cased, stripped of http(s) links, tokenised with
    ``nltk.word_tokenize``, stripped of punctuation characters, reduced to
    purely alphabetic tokens, filtered against the English stop-word list
    (with 'not' kept), and lemmatised as verbs with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    data : object whose ``data['review']`` exposes ``.values.tolist()``
        (a pandas DataFrame in this notebook), holding the review strings.

    Returns
    -------
    list of str
        One space-joined string of cleaned tokens per review, in input order.
    """
    # Loop-invariant helpers are built once instead of once per review; the
    # stop-word list in particular was being reloaded for every review.
    # The raw string holds the same pattern as before without relying on the
    # invalid "\)" escape of a plain string literal.
    url_pattern = re.compile(r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@,.&*]|[(\)!]|[?:0-9a-fA-F])+')
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    # 'not' is removed from the stop list so it survives filtering
    # (presumably because negation matters for sentiment — confirm).
    stop_words.discard('not')

    all_reviews = []
    for review in data['review'].values.tolist():
        text = url_pattern.sub('', review.lower())
        tokens = nltk.word_tokenize(text)
        stripped = (token.translate(punctuation_table) for token in tokens)
        # Keep alphabetic, non-stop-word tokens and lemmatise them as verbs.
        words = [
            lemmatizer.lemmatize(word, 'v')
            for word in stripped
            if word.isalpha() and word not in stop_words
        ]
        all_reviews.append(' '.join(words))
    return all_reviews

In [None]:
# Clean every review; `vals` is a list with one cleaned string per review.
vals = cleaning_review(data)

## `Note: In industry, code like this is usually organised into classes`

In [None]:
# Cleaned version of the first review.
vals[0]

### stopwords are removed
### lemmatization based on verb is done to remove tenses

In [None]:
# Token count of every cleaned review. Iterating `vals` directly replaces
# the range(len(...)) indexing and the explicit __len__() call.
words_count = [len(nltk.word_tokenize(review)) for review in vals]

In [None]:
# Total token count after cleaning.
print(sum(words_count))

### size of tokens decreased by 7805564

In [None]:
# min_df=3: ignore tokens that appear in fewer than 3 documents (reviews).
vector = CountVectorizer(min_df=3)
# Learn the vocabulary from the cleaned reviews and build the count matrix.
# NOTE(review): .toarray() materialises a dense array (reviews x vocabulary),
# which may be large — confirm the memory cost is acceptable.
x = vector.fit_transform(vals).toarray()
# Target vector: the encoded sentiment column as a NumPy array.
y = data['sentiment'].to_numpy()
print(x.shape)
print(y.shape)