## Naive Bayes

### Loading Libraries and dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix,accuracy_score
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the tab-separated training set (PhraseId, SentenceId, Phrase, Sentiment).
df = pd.read_csv("attachment_train.tsv", sep="\t")

In [3]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
df['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [5]:
# Count missing values per column; every count is 0, so no cleaning is needed (Task 1 done).
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

# Task 2 : making table of the category : defining the x and y
 Step 1: build the set of unwanted words (English stop words) that will be removed<br>
 Step 2: vectorize the dataset with TF-IDF: create an instance of `TfidfVectorizer` and set its parameters <br>
 Step 3: fit the TF-IDF vectorizer on the phrases <br>
 Step 4: separate the dependent variable Y, our target variable<br>
 Step 5: define the X variable by transforming the phrases with the fitted TF-IDF vectorizer<br>
 Step 6: split the text of a phrase in X into individual words<br>
 
 **NOTE: `fit` learns the rules from the data (here the vocabulary and IDF weights), and `transform` converts the data according to those learned rules.**

In [6]:
# Step 1: English stop words from NLTK, collected into a set.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download("stopwords")) — confirm in the target environment.
setword = set(stopwords.words('english'))

In [7]:
# Step 2: configure the TF-IDF vectorizer.
# - lowercase / strip_accents normalise the text before it is tokenised.
# - stop_words is passed as a list: recent scikit-learn releases validate this
#   parameter and accept only 'english', a list, or None, so a raw set is
#   rejected when the vectorizer is fitted. Sorting keeps the repr deterministic.
vectoriz_data = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    stop_words=sorted(setword),
    strip_accents='ascii',
)

In [8]:
# Step 3: learn the vocabulary and IDF weights from the phrase texts.
# Iterating a DataFrame yields its column labels, so fitting on `df` itself
# would build the vocabulary from the four column names instead of the phrases.
vectoriz_data.fit(df["Phrase"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={"couldn't", 'or', 'to', 'that', 'my', 're', 'below', 'same', 'too', 'again', 'other', 'out', "mustn't", 'ourselves', "she's", 'during', "shouldn't", 'being', "hasn't", 'about', 'more', "that'll", 'shan', 'over', 'above', 's', 'her', 'hers', 'ain', 'he', 'his', 'them', 'in', 'won', 'by', ...nor', 'just', 'our', 'she', 'who', 'hadn', 'me', 'mustn', 'haven', 'your', 'against', 'had', 'been'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [9]:
df.shape

(156060, 4)

### TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

### IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

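## tf-idf score = TF(t) * IDF(t)

**Note:** the formulas above are the textbook definitions. With the settings shown in the vectorizer output above (`smooth_idf=True`, `norm='l2'`, `sublinear_tf=False`), scikit-learn uses the raw term count as TF, computes IDF(t) = ln((1 + n) / (1 + df(t))) + 1, and then scales each phrase's vector to unit L2 length, so the stored values differ from the textbook product.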

In [10]:
# Step 4: the raw phrase text is the model input; the sentiment label (0-4) is the target.
x, y = df["Phrase"], df["Sentiment"]

In [11]:
print(y.shape)
print(x.shape)

(156060,)
(156060,)


**Only the `Phrase` and `Sentiment` columns of the dataset are needed.**

In [12]:
# Step 5: build the TF-IDF feature matrix.
# Transform straight from df["Phrase"] rather than from `x`: the cell then gives
# the same result when it is re-run, instead of feeding the sparse matrix it has
# just produced back into the vectorizer.
# NOTE: the vectorizer is fitted on every phrase before the train/test split
# further down, so the IDF weights are influenced by test-set text.
x = vectoriz_data.fit_transform(df["Phrase"])

In [13]:
x.shape

(156060, 15115)

In [14]:
x

<156060x15115 sparse matrix of type '<class 'numpy.float64'>'
	with 623022 stored elements in Compressed Sparse Row format>

In [15]:
# Split one phrase on whitespace so its word count can be compared with its TF-IDF row.
# df["Phrase"][1] is the row labelled 1, i.e. the second phrase, despite the variable name.
split_1st_phrase = df["Phrase"][1].split()
split_1st_phrase

['A',
 'series',
 'of',
 'escapades',
 'demonstrating',
 'the',
 'adage',
 'that',
 'what',
 'is',
 'good',
 'for',
 'the',
 'goose']

In [16]:
# max() compares strings lexicographically, so this prints the lexicographically
# greatest token ('what'), not the longest word.
print(max(split_1st_phrase))
# Number of whitespace-separated tokens in the phrase.
print(len(split_1st_phrase))

what
14


In [17]:
x[1]

<1x15115 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

**The phrase at index 1 (the second phrase) contains 14 words (`len(split_1st_phrase)`), but only 6 of them are stored in the matrix: the remaining words are stop words and are removed, which is why this phrase gets only 6 tf-idf values. The words of every other phrase are handled in the same way.**

In [18]:
# Last row of the matrix (index 156059 of 156060 rows): only one term of that phrase is stored.
x[156059]

<1x15115 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [19]:
print(df["Phrase"][0])
print(df["Phrase"][1])
print(df["Phrase"][2])

A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .
A series of escapades demonstrating the adage that what is good for the goose
A series


In [20]:
# Each entry reads "(row, column) value": row 0 is the first phrase, column 11756
# is a vocabulary index, and 0.214 is that term's TF-IDF weight within the phrase.
# With norm='l2' (see the vectorizer repr above) each row is scaled to unit
# length, so the weights are relative scores, not percentages of importance.
print(x)

  (0, 11756)	0.21406381187
  (0, 4545)	0.338395250784
  (0, 3466)	0.301908610865
  (0, 286)	0.305101582652
  (0, 5785)	0.318946757333
  (0, 5801)	0.278014250714
  (0, 523)	0.196130467085
  (0, 5559)	0.322914576695
  (0, 9138)	0.234490943134
  (0, 595)	0.320026182374
  (0, 9022)	0.230649706998
  (0, 586)	0.268114018887
  (0, 8747)	0.164482161821
  (0, 12772)	0.155332083718
  (1, 11756)	0.320071242718
  (1, 4545)	0.505973370755
  (1, 3466)	0.451418030086
  (1, 286)	0.456192206715
  (1, 5785)	0.238446853975
  (1, 5801)	0.415690844436
  (2, 11756)	1.0
  (4, 11756)	1.0
  (5, 4545)	0.534068877641
  (5, 3466)	0.476484207687
  (5, 286)	0.481523483075
  :	:
  (156049, 11385)	0.539505216234
  (156049, 9127)	0.552565277797
  (156049, 12999)	0.635299720942
  (156050, 11385)	0.698600543069
  (156050, 9127)	0.715511901525
  (156051, 11385)	0.698600543069
  (156051, 9127)	0.715511901525
  (156052, 11385)	1.0
  (156053, 1294)	0.400689647833
  (156053, 5294)	0.385382441783
  (156053, 6204)	0.4554009782

In [21]:
# Look up the term stored at column 11756 of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back on older releases.
if hasattr(vectoriz_data, "get_feature_names_out"):
    feature_names = vectoriz_data.get_feature_names_out()
else:
    feature_names = vectoriz_data.get_feature_names()
feature_names[11756]

'series'

In [37]:
#print(vectoriz_data.vocabulary_)

**A sample of the contents of the `vocabulary_` attribute is shown below:** <br>

{'series': 11756, 'escapades': 4545, 'demonstrating': 3466, 'adage': 286, 'good': 5785, 'goose': 5801, 'also': 523, 'gander': 5559, 'occasionally': 9138,..........}

# Task 3: predict the sentiment using Multinomial Naïve Bayes algorithm

Step 1: split the data into training and test sets <br>
Step 2: instantiate the Naive Bayes classifier class `MultinomialNB` <br>
Step 3: fit the model on the training set and predict on the test set <br>
Step 4: predict the probability of a phrase belonging to each of the given categories with `predict_proba`<br>

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Step 1: hold out 25% of the phrases for evaluation (the library default, written
# out explicitly); the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [27]:
# Step 2: Multinomial Naive Bayes with its default settings (alpha=1.0 and
# fit_prior=True, as shown in the repr printed after fitting).
clf = MultinomialNB()

In [28]:
clf.fit(xtrain,ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
pred = clf.predict(xtest)

In [30]:
ytest[:10]

95722     2
147312    4
36991     2
150211    2
140655    1
154731    1
124991    1
140469    3
94770     4
12599     1
Name: Sentiment, dtype: int64

In [31]:
pred.shape

(39015,)

In [32]:
pred

array([2, 3, 2, ..., 2, 2, 2], dtype=int64)

In [33]:
# Rows are the true classes 0-4 and columns are the predicted classes
# (each row sum equals that class's support in the classification report).
print(confusion_matrix(ytest,pred))

[[   81   747   880    32     0]
 [   40  1832  4755   226     1]
 [    8   829 17530  1253    15]
 [    0   119  4912  3297    56]
 [    0    10   833  1430   129]]


In [34]:
print(classification_report(ytest,pred))

             precision    recall  f1-score   support

          0       0.63      0.05      0.09      1740
          1       0.52      0.27      0.35      6854
          2       0.61      0.89      0.72     19635
          3       0.53      0.39      0.45      8384
          4       0.64      0.05      0.10      2402

avg / total       0.58      0.59      0.53     39015



In [35]:
print(accuracy_score(ytest,pred))

0.58615916955


In [36]:
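## An accuracy of about 0.59 is not good: the model mostly predicts the majority class 2, and recall for classes 0 and 4 is only 0.05. A stronger model (for example a deep-learning approach) may improve the score, but that has not been tested in this notebook.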