Importing of Data

In [1]:
# Importing DataSet 
import pandas as pd
data = pd.read_csv('SMSSpamCollection.txt', sep="\t",names=["label","msg"])

In [2]:
# Viewing the Data Set Imported
data.describe

<bound method NDFrame.describe of      label                                                msg
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [3]:
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Pre- Processing of Data

In [4]:
# Drop of Duplicate Data and remove of Null Values
data.isnull().sum()

label    0
msg      0
dtype: int64

In [5]:
# Convert the label value to 1-> spam and 0-> ham
data['label_num'] = data.label.map({'ham':0, 'spam':1})
data.head()

Unnamed: 0,label,msg,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# Getting required frame works
import re
import nltk
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
wordNet = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
pre_pro = []

for i in range(len(data['msg'])):
    
    # Substituting all letters other than Alphabets
    rev = re.sub('[^a-zA-Z]',' ',data['msg'][i])
    
    # Converting all to lower case
    rev = rev.lower()
    rev = rev.split()
    
    # Lemmatizer is used to obtain the meaningfull words from words in sentences
    rev = [wordNet.lemmatize(word) for word in rev if word not in set(stopwords.words('english'))]
    rev = ' '.join(rev)
    pre_pro.append(rev)

Model Creation

In [8]:
# Using Tfid Vectorizer to transform the string array to vector of frequency count
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(pre_pro).toarray()

In [9]:
y = data['label_num']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)

In [10]:
# Using Naive Bayes to create model
from sklearn.naive_bayes import MultinomialNB
spam_model = MultinomialNB().fit(X_train,y_train)
y_pred = spam_model.predict(X_test)

In [11]:
# Getting Accuracy Score and Confusion matrix for test set
from sklearn.metrics import accuracy_score, confusion_matrix
acc = accuracy_score(y_test, y_pred)
print('Accuracy :', acc)

conf = confusion_matrix(y_test, y_pred)
print('\n Confusion Matrix')
conf

Accuracy : 0.9721973094170404

 Confusion Matrix


array([[955,   0],
       [ 31, 129]], dtype=int64)

Getting User Input and Checking the Model

In [16]:
no_of_inp = int(input("Enter the No.of Strings to Be Entered: "))
mail = []
for s in range(no_of_inp):
    mail.append(input("Enter String: "))
    

check=[]
for i in range(len(mail)):  
    rev = re.sub('[^a-zA-Z]',' ',mail[i])
    rev = rev.lower()
    rev = rev.split()
    rev = [wordNet.lemmatize(word) for word in rev if word not in set(stopwords.words('english'))]
    rev = ' '.join(rev)
    check.append(rev)
    
pred=spam_model.predict(cv.transform(check))

print("\nResults:")
for i in range(len(pred)):
    if pred[i]==0:
        print(mail[i],' is "NOT SPAM"')
        
    else:
        print(mail[i],' is "SPAM"')



Enter the No.of Strings to Be Entered: 1
Enter String: YOU ARE CHOSEN TO RECEIVE A å£350 AWARD! Pls call claim to collect your award which you are selected

Results:
YOU ARE CHOSEN TO RECEIVE A å£350 AWARD! Pls call claim to collect your award which you are selected  is "SPAM"
