In [13]:
#importing all required libraries
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
#chardet is used to detect the character encoding in the given dataset
import chardet
with open('/home/harsha/Desktop/ML/spam/spam.csv','rb') as f:
    result=chardet.detect(f.read(10000))
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7261670208776098, 'language': ''}


In [15]:
#reading data set as encoding was found to be Windows-1252
df=pd.read_csv('/home/harsha/Desktop/ML/spam/spam.csv',encoding='Windows-1252')
print(df.shape,df.duplicated().sum(),df.drop_duplicates(inplace=True),df.shape)
print(df.head())
#counting the null values in each ccolumn
print(df.isna().sum())

(5572, 5) 403 None (5169, 5)
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64


In [16]:
#printing or knowing the columns present in the dataset 
print(df.columns)
#droping the unnecessary columns in the dataset as they are Nan
df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1,inplace=True)
#printing the updated columns in the data set
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')
Index(['v1', 'v2'], dtype='object')


In [17]:
#rename the columns v1 as Output and v2 as Message
df.rename(columns={'v1':'Output','v2':'Message'},inplace=True)
df['Output']=df['Output'].map({'ham':'0','spam':'1'})
df.head()


Unnamed: 0,Output,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Features are Inputs and Target is the Output whether spam or ham 
features=df['Message']
target=df['Output']
#splitting the data for training and testing 
x_train,x_test,y_train,y_test=train_test_split(features,target,test_size=0.2,random_state=3)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)


(4135,)
(1034,)
(4135,)
(1034,)


In [20]:
#For Spam detection common words also play a crucial role in finding spam or ham so we dont use stopwords for removing the common words
#converting the words into numbers as Machine only computes or undderstands the Numbers/binary
cv=CountVectorizer()
x_train_vector=cv.fit_transform(x_train.values)
x_test_vector=cv.transform(x_test.values)
x_train_vector.toarray()


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4135, 7642))

In [None]:
#LogisticRegression Model 
model_LR=LogisticRegression()
model_LR.fit(x_train_vector,y_train)
y_pred_LR=model_LR.predict(x_test_vector)
print(accuracy_score(y_test,y_pred_LR))
print(classification_report(y_test,y_pred_LR))

0.9758220502901354
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       894
           1       0.98      0.84      0.90       140

    accuracy                           0.98      1034
   macro avg       0.98      0.92      0.95      1034
weighted avg       0.98      0.98      0.98      1034



In [None]:
#Naive Bayes Model
model_NB=MultinomialNB()
model_NB.fit(x_train_vector,y_train)
y_pred_NB=model_NB.predict(x_test_vector)
accuracy_score(y_test,y_pred_NB)
print(classification_report(y_test,y_pred_NB))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       894
           1       0.97      0.89      0.93       140

    accuracy                           0.98      1034
   macro avg       0.98      0.94      0.96      1034
weighted avg       0.98      0.98      0.98      1034



In [None]:
#Support Vector Machine model
#It needs a parameter kernel based on the type of dataset (Linear,Poly,Rbf)
model_SV=svm.SVC(kernel='linear')
model_SV.fit(x_train_vector,y_train)
y_pred_SV=model_SV.predict(x_test_vector)
accuracy_score(y_test,y_pred_SV)
print(classification_report(y_test,y_pred_SV))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       894
           1       0.99      0.84      0.91       140

    accuracy                           0.98      1034
   macro avg       0.98      0.92      0.95      1034
weighted avg       0.98      0.98      0.98      1034



In [None]:
#Choosing the best model based on Accuracy 
print(accuracy_score(y_test,y_pred_LR))
print(accuracy_score(y_test,y_pred_NB))
print(accuracy_score(y_test,y_pred_SV))

0.9758220502901354
0.9806576402321083
0.9777562862669246


In [None]:
#Naive Bayes model is the best model for this it has 0.9806576402321083 Accuracy
#Testing the model by giving some Input Texts
sample="Hey there how is the day order some food"
sample_vector=cv.transform([sample])
sample_predict=model_NB.predict(sample_vector)
print(sample_predict)


['0']


In [None]:
#giving some random inputs to check the model
#Spam or Ham
#Spam: 1 and Ham: 0
sample_list=["Get cheap medications online without prescription! Limited time offer!",
    "Hi Mom, just letting you know I arrived safely. Love you!",
    "URGENT: Update your bank account details to avoid suspension.",
    "Hey John, can we reschedule our meeting to 3 PM tomorrow?",
    "You have been selected for a FREE vacation to the Bahamas! Respond now.",
    "Dear Sarah, please find attached the project report for your review.",
    "Earn $5000 per week working from home. Sign up today!",
    "Reminder: Your dentist appointment is scheduled for next Monday at 10 AM.",
    "Congratulations! You have won a $1000 Walmart gift card. Click here to claim.",
    "Hi team, the weekly report has been uploaded to the shared drive."]
for i in sample_list:
    sample=str(i)
    sample_vector=cv.transform([sample])
    sample_predict=model_NB.predict(sample_vector)
    sample_predict
    if sample_predict==str(1):
        print(i+" --> Spam")
    else:
        print(i+" --> Ham")

Get cheap medications online without prescription! Limited time offer! --> Ham
Hi Mom, just letting you know I arrived safely. Love you! --> Ham
URGENT: Update your bank account details to avoid suspension. --> Spam
Hey John, can we reschedule our meeting to 3 PM tomorrow? --> Ham
You have been selected for a FREE vacation to the Bahamas! Respond now. --> Spam
Dear Sarah, please find attached the project report for your review. --> Ham
Earn $5000 per week working from home. Sign up today! --> Ham
Reminder: Your dentist appointment is scheduled for next Monday at 10 AM. --> Ham
Congratulations! You have won a $1000 Walmart gift card. Click here to claim. --> Spam
Hi team, the weekly report has been uploaded to the shared drive. --> Ham
