In [43]:
import numpy as np
import pandas as pd

In [44]:
sms = pd.read_csv('/home/pankaj/Downloads/sms.tsv', delimiter='\t', header=None,
                  names= ['Target', 'Message'])
sms.head()

Unnamed: 0,Target,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [45]:
sms['Target'] = sms['Target'].map({'ham':0, 'spam':1})
sms.head()

Unnamed: 0,Target,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [46]:
sms.Message[0]   # Non-Spam Sample

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [47]:
sms.Message[2]    # Spam Sample

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [48]:
import re
def clean(x):
    x = re.sub(r'<.*>', '', x) # substitute HTML tags by empty string
    x = re.sub(r'\d+', '#', x) # substitute all digit character by #
    x = re.sub(r'[^a-zA-Z#]', ' ', x) # substitute all characters other than alphabets with space
    x = re.sub(r'\s+[a-zA_Z]\s+',' ', x) # substitute single character by space 
    x = re.sub(r'\s+',' ', x) # remove extra whitespaces by space
    return x.lower()

sms['Message'] = sms['Message'].apply(clean)
sms.head()

Unnamed: 0,Target,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif oni
2,1,free entry in # wkly comp to win fa cup final ...
3,0,u dun say so early hor u already then say
4,0,nah i don think he goes to usf he lives around...


In [49]:
sms.Message[2]

'free entry in # wkly comp to win fa cup final tkts #st may # text fa to # to receive entry question std txt rate t c apply #over# s'

In [50]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize(message):
    lem = WordNetLemmatizer()
    sent= [lem.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(message)]
    return ' '.join(sent)

In [51]:
sms['Message']= sms['Message'].apply(lemmatize)
sms.head()

Unnamed: 0,Target,Message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif oni
2,1,free entry in # wkly comp to win fa cup final ...
3,0,u dun say so early hor u already then say
4,0,nah i don think he go to usf he life around he...


In [52]:
sms.Message[2]

'free entry in # wkly comp to win fa cup final tkts # st may # text fa to # to receive entry question std txt rate t c apply # over # s'

In [53]:
sms.shape

(5572, 2)

In [54]:
sms.isnull().sum()

Target     0
Message    0
dtype: int64

In [60]:
x = sms.Message#Message
y = sms.Target.values # target as array(ham or spam)
print('_________________________Message _________________________')
print(x)
print('__________________________Target__________________________')
print(y)

_________________________Message _________________________
0       go until jurong point crazy available only in ...
1                                   ok lar joking wif oni
2       free entry in # wkly comp to win fa cup final ...
3               u dun say so early hor u already then say
4       nah i don think he go to usf he life around he...
                              ...                        
5567    this be the # nd time we have try # contact u ...
5568                         will go to esplanade fr home
5569     pity be in mood for that so any other suggestion
5570    the guy do some bitching but i act like d be i...
5571                              rofl it true to it name
Name: Message, Length: 5572, dtype: object
__________________________Target__________________________
[0 0 1 ... 0 0 0]


In [14]:
# Check for balanced or Imbalanced dataset
print(np.unique(y, return_counts=True))

(array([0, 1]), array([4825,  747]))


In [15]:
# splitting of dataset into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=.20,
                                                   random_state = 0)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(max_df=.8, min_df=3, stop_words='english')

In [17]:
# Extract features from train set
tvect.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.8, max_features=None,
                min_df=3, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [18]:
# Total no. of features created
print(len(tvect.get_feature_names()))

# see first fifty feature
print(tvect.get_feature_names()[:50])

1843
['aah', 'aathi', 'abi', 'abiola', 'able', 'abt', 'abta', 'ac', 'acc', 'accept', 'access', 'accordingly', 'account', 'ache', 'act', 'action', 'activate', 'actually', 'ad', 'add', 'addie', 'address', 'admirer', 'adore', 'adult', 'advance', 'adventure', 'advise', 'ae', 'affair', 'affection', 'afraid', 'aft', 'afternoon', 'aftr', 'age', 'ago', 'ah', 'aha', 'ahead', 'ahmad', 'aight', 'ain', 'aint', 'air', 'airport', 'aiya', 'aiyah', 'aiyar', 'aiyo']


In [19]:
# convert training text to array
x_train_t = tvect.transform(x_train).toarray()

In [20]:
# Apply Supervised ML classification algorithm
from sklearn.naive_bayes import BernoulliNB
naive = BernoulliNB() # 'naive' object is untrained model

In [21]:
naive.fit(x_train_t, y_train) # train the model using fit()

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [22]:
# convert testing text to array
x_test_t = tvect.transform(x_test).toarray()

In [23]:
# Checking 1st sample
print(x_test_t[0]) # 1-D
print(x_test_t[0].reshape(1,-1)) # 2-D ,here 1 is row and -1 automatically fits required column for the data

[0. 0. 0. ... 0. 0. 0.]
[[0. 0. 0. ... 0. 0. 0.]]


In [24]:
# prediction for 1st test sample and corresponding actual label
print(naive.predict(x_test_t[0].reshape(1,-1)), y_test[0], sep=' ---> ')

[0] ---> 0


In [25]:
# prediction for first 10 test sample and corresponding actual label
print('Pred','Act',sep=' -> ')
for index,x in enumerate(x_test_t[:10]):
    print(naive.predict(x.reshape(1,-1)), y_test[index], sep=' ---> ')

Pred -> Act
[0] ---> 0
[0] ---> 1
[0] ---> 0
[0] ---> 0
[0] ---> 0
[0] ---> 0
[1] ---> 1
[0] ---> 0
[0] ---> 0
[0] ---> 0


In [26]:
# Prediction of complete testing sample
y_pred = naive.predict(x_test_t)

In [27]:
# Model Evaluation: Score Card
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       0.99      0.90      0.94       160

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
from sklearn.metrics import confusion_matrix, recall_score

In [29]:
confusion_matrix(y_test, y_pred)     # CM for TfidfVectorizer

array([[953,   2],
       [ 16, 144]])

# Model Evaluation using Confusion Matrix:
- A confusion matrix is a table that is used to evaluate the performance of a classification model.
- The fundamental of a confusion matrix is the number of correct and incorrect predictions are summed up class-wise.
<img src='confusion.png' width=80% height=50%/>

### Terms associated with Confusion matrix:
- **1. True Positives(TP):** True positives are the cases when the actual class of the data point was 1(True) and the predicted is also 1(True).
<br/><br/>
- **2. True Negatives(TN):** True negatives are the cases when the actual class of the data point was 0(False) and the predicted is also 0(False).
<br/><br/>
- **3. False Positives(FP):** False positives are the cases when the actual class of the data point was 0(False) and the predicted is 1(True). False is because the model has predicted incorrectly and positive because the class predicted was a positive one(1).
<br/><br/>
- **4. False Negatives(FN):** False negatives are the cases when the actual class of the data point was 1(True) and the predicted is 0(False). False is because the model has predicted incorrectly and negative because the class predicted was a negative one(0).
<br/><br/>
- **5. Accuracy:** Accuracy measures how well the test predicts both Positive and Negative classes i.e, Overall correctness of model.
<br/><br/>
- **6. Precision:** Precision is intuitively the ability of the classifier not to label as positive a sample that is negative i.e, How Many predicted 1 are actually 1. Precision can be thought of as a measure of a classifiers exactness. A low precision can also indicate a large number of False Positives.
<br/><br/>
- **7. Sensitivity or Recall or True Positive Rate:** Recall measures the proportion of positives that are correctly identified as such i.e, Accuracy of class 1. Recall can be thought of as a measure of a classifiers completeness. A low recall indicates many False Negatives.
<br/><br/>
- **8. Specificity:** Specificity measures the proportion of negatives that are correctly identified as such i.e, Accuracy of class 0.
<br/><br/>
- **9. FalsePositive Rate:** The false positive rate is the proportion of all negatives that still yield positive test outcomes.
<br/><br/>
- **10. F-1 Score:** F1 score conveys the balance between the precision and the recall. F1 score is harmonic average of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0. 
    - With immbalanced data we should test model performance by using F-1 score.
        - `F1 = 2 * ((precision * recall) / (precision + recall))`

In [31]:
# Load the Vectorizer and classification trained model
with open('SpamVectorizer','rb') as vect_pkl:
    vectorizer= pickle.load(vect_pkl)
    
with open('SpamModel','rb') as model_pkl:
    classifier= pickle.load(model_pkl)

In [32]:
test_doc = ['FREE....FREE entry into our £250 weekly competition just text the word WIN to 80086 NOW. Hurry, last chance to become billionaire!!!!']   # Spam message
test_doc[0]= clean(test_doc[0])
test_doc[0]= lemmatize(test_doc[0])
test_doc[0]

'free free entry into our # weekly competition just text the word win to # now hurry last chance to become billionaire'

In [33]:
test_doc_tf = vectorizer.transform(test_doc).toarray()
pred = classifier.predict(test_doc_tf)
pred

array([1])