In [1]:
from sklearn import datasets,metrics
from sklearn.naive_bayes import GaussianNB

In [2]:
dataset=datasets.load_iris()
type(dataset)

sklearn.utils.Bunch

In [3]:
model=GaussianNB()
model.fit(dataset.data,dataset.target)

GaussianNB(priors=None, var_smoothing=1e-09)

In [4]:
print(model)

GaussianNB(priors=None, var_smoothing=1e-09)


In [5]:
expected=dataset.target
predicted=model.predict(dataset.data)

In [6]:
print(metrics.classification_report(expected,predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150



In [7]:
print(metrics.confusion_matrix(expected,predicted))

[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]


## 02,January 2020

## Training the Model to check the given message is spam or ham

In [8]:
import numpy as np
import pandas as pd
import nltk

In [9]:
sms=pd.read_csv('/home/pankaj/Downloads/sms.tsv',delimiter='\t',
                header=None,names=['Target','Message'])

In [10]:
sms.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
sms['Target']=sms['Target'].map({'ham':0,'spam':1})

In [12]:
sms.shape

(5572, 2)

In [13]:
#  cleaning function
import re
def clean(x):
    x=re.sub('<.*>','',x)#remove HTML tags
    x=re.sub('[^a-zA-Z]',' ',x)#remove all character other than alphabets
    x=re.sub(r'\s+[a-zA-Z]\s+',' ',x)#
    x=re.sub(r'\s+',' ',x)#removes the extra spaces
    return x.lower()

In [14]:
sms['Message']=sms['Message'].apply(clean)
sms.head()

Unnamed: 0,Target,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif oni
2,1,free entry in wkly comp to win fa cup final tk...
3,0,u dun say so early hor c already then say
4,0,nah don think he goes to usf he lives around h...


In [15]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    '''Map POS tag to first character lemmatize() accepts'''
    tag=nltk.pos_tag([word])[0][1][0].upper()
    tag_dict={'J':wordnet.ADJ,
             'N':wordnet.NOUN,
             'V':wordnet.VERB,
             'R':wordnet.ADV
        
    }
    return tag_dict.get(tag,wordnet.NOUN)
def lemmatize(message):
    lem=WordNetLemmatizer()
    sent=[lem.lemmatize(w,get_wordnet_pos(w)) for w in nltk.word_tokenize(message)]
    return ' '.join(sent)

In [16]:
x=sms.Message
y=sms.Target.values

In [17]:
np.unique(y,return_counts=True)

(array([0, 1]), array([4825,  747]))

In [18]:
#to split the dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.20,random_state=0)
#Here test_size .20 means that data is divided into 20 and 80 ratio

In [19]:
x_train.shape

(4457,)

In [20]:
#convertin text into no.
from sklearn.feature_extraction.text import TfidfVectorizer
tvect=TfidfVectorizer(max_df=.8,min_df=3,stop_words='english')

In [21]:
tvect.fit(x_train)# It gives the tokens from all the data


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.8, max_features=None,
                min_df=3, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [22]:
len(tvect.get_feature_names())

2040

In [23]:
x_train_dot=tvect.transform(x_train).toarray()

In [24]:
x_train_dot

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [25]:
from sklearn.naive_bayes import BernoulliNB
bern=BernoulliNB()# this is machine/model

In [26]:
bern.fit(x_train_dot,y_train)#it finds the hidden relationship b/w x and y

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [27]:
#To save data of the model
import pickle
with open('Spam_Model','wb') as f:
    pickle.dump(bern,f)

In [28]:
# To reload
with open('Spam_Model','rb') as f:
    classfier=pickle.load(f)

In [29]:
#converting x_test text into the no.
x_test_dot=tvect.transform(x_test).toarray()

In [30]:
x_test_dot[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [31]:
classfier.predict(x_test_dot[0].reshape(1,-1))

array([0])

In [32]:
y_test[0]

0

In [33]:
y_pred=classfier.predict(x_test_dot)

In [34]:
#Report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       0.99      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [35]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[954,   1],
       [ 18, 142]])

In [36]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[954,   1],
       [ 18, 142]])

## 03rd,January 2020

In [41]:
import re
msg='hello sir<kdjfhjkd.../> foiurjkr..>how are you ?'
re.sub(r'<.*>','',msg)

'hello sirhow are you ?'