<a href="https://colab.research.google.com/github/miacarroll1207/NLP-Practice/blob/main/Spam_Ham_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the tab-separated file 'spam.tsv' into a pandas DataFrame
data = pd.read_csv('spam.tsv', sep='\t')

In [None]:
# Preview the first rows. `head` has to be called; a bare `data.head`
# only displays the bound method instead of the rows.
data.head()

<bound method NDFrame.head of      label                                            message  length  punct
0      ham  Go until jurong point, crazy.. Available only ...     111      9
1      ham                      Ok lar... Joking wif u oni...      29      6
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     155      6
3      ham  U dun say so early hor... U c already then say...      49      6
4      ham  Nah I don't think he goes to usf, he lives aro...      61      2
...    ...                                                ...     ...    ...
5567  spam  This is the 2nd time we have tried 2 contact u...     160      8
5568   ham               Will ü b going to esplanade fr home?      36      1
5569   ham  Pity, * was in mood for that. So...any other s...      57      7
5570   ham  The guy did some bitching but I acted like i'd...     125      1
5571   ham                         Rofl. Its true to its name      26      1

[5572 rows x 4 columns]>

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,length,punct
count,5572.0,5572.0
mean,80.48995,4.177495
std,59.942907,4.623919
min,2.0,0.0
25%,36.0,2.0
50%,62.0,3.0
75%,122.0,6.0
max,910.0,133.0


In [None]:
# Count how many messages carry each label (ham vs. spam)
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# Keep only the rows whose label is 'ham'
is_ham = data['label'] == 'ham'
hams = data[is_ham]
hams

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
6,ham,Even my brother is not like to speak with me. ...,77,2
...,...,...,...,...
5565,ham,Huh y lei...,12,3
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [None]:
# Keep only the rows whose label is 'spam'
is_spam = data['label'] == 'spam'
spams = data[is_spam]
spams

Unnamed: 0,label,message,length,punct
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8
8,spam,WINNER!! As a valued network customer you have...,157,6
9,spam,Had your mobile 11 months or more? U R entitle...,154,2
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136,8
...,...,...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...,90,3
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...,158,5
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...,160,8
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...,147,3


In [None]:
spams.shape
# (747, 4) --> 747 spam rows, 4 columns


(747, 4)

In [None]:
hams.shape
# (4825, 4) --> 4825 ham rows, 4 columns

(4825, 4)

In [None]:
# Balance the classes: downsample ham to the same number of rows as spam.
# - random_state makes the draw reproducible across kernel restarts.
# - Sampling from `data` rather than from `hams` itself keeps this cell
#   idempotent: re-running it yields the same rows in the same order instead
#   of reshuffling an already-downsampled frame.
hams = data[data['label'] == 'ham'].sample(n=spams.shape[0], random_state=0)
hams.shape

(747, 4)

In [None]:
# Stack the ham and spam frames into one; ignore_index=True renumbers the rows from 0
newData = pd.concat([hams, spams], ignore_index=True)
newData.shape

(1494, 4)

In [None]:
#use sklearn to split the data into train and test sets
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# shuffled split the same on every run.
messages, labels = newData['message'], newData['label']
x_train, x_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

In [None]:
# Inspect the held-out labels
y_test

799     spam
1047    spam
1436    spam
54       ham
793     spam
        ... 
824     spam
822     spam
644      ham
1034    spam
1356    spam
Name: label, Length: 449, dtype: object

In [None]:
#Build RANDOM FOREST Model
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer converts the raw messages into TF-IDF-weighted bag-of-words features

from sklearn.ensemble import RandomForestClassifier

In [None]:
# TF-IDF features feeding a 100-tree random forest.
# random_state pins the forest's randomness so the metrics reported below
# are reproducible on Restart & Run All.
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=0)),
])

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the training messages
classifier.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Predict labels for the held-out messages
y_pred = classifier.predict(x_test)


In [None]:
# Confusion matrix of true labels vs. predictions on the test set
confusion_matrix(y_test, y_pred)

array([[224,   3],
       [ 21, 201]])

In [None]:
# Per-class precision, recall and F1 on the test set
print(classification_report(y_test, y_pred))
# we are getting almost 95% accuracy

              precision    recall  f1-score   support

         ham       0.91      0.99      0.95       227
        spam       0.99      0.91      0.94       222

    accuracy                           0.95       449
   macro avg       0.95      0.95      0.95       449
weighted avg       0.95      0.95      0.95       449



In [None]:
# Fraction of test messages whose predicted label matches the true label
accuracy_score(y_test, y_pred)

0.9465478841870824

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['Hi, my name is Mia'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['When is your birthday?'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['Please click the link below to redeem your rewards!'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677'])

array(['spam'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['hi this is john here. I contacted you earlier about an Amazon delivery.'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['you can win lots of money if you call me on 912u128312390'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['give me your phone number to win money'])

array(['spam'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['I will need your personal information to give you the prize'])

array(['spam'], dtype=object)

In [None]:
#now using SVM
from sklearn.svm import SVC

In [None]:
# Same TF-IDF front end, now feeding an SVC (C=100, gamma='auto').
# NOTE: this rebinds `classifier`, so the random-forest pipeline is no longer reachable by that name.
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier',SVC(C=100,gamma='auto'))])

In [None]:
# Fit the SVM pipeline on the same training split
classifier.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out messages (overwrites the earlier y_pred)
y_pred = classifier.predict(x_test)

In [None]:
# Confusion matrix of true labels vs. the SVM pipeline's predictions
confusion_matrix(y_test, y_pred)

array([[225,   2],
       [ 17, 205]])

In [None]:
# Fraction of test messages the SVM pipeline labelled correctly
accuracy_score(y_test, y_pred)

0.9576837416481069

In [None]:
# Spot-check the fitted pipeline on a hand-written message
# (sample text corrected: 'atural' -> 'Natural')
classifier.predict(['Hello, You are learning Natural Language Processing'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['Hope you are doing good and learning new things !'])

array(['ham'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['Congratulations, You won a lottery ticket worth $1 Million ! To claim call on 446677'])

array(['spam'], dtype=object)

In [None]:
# Spot-check the fitted pipeline on a hand-written message
classifier.predict(['give me your phone number to win money'])

array(['spam'], dtype=object)