Explore Data

In [2]:
import pandas as pd

# Load spam.csv into a DataFrame, decoding the file as ISO-8859-1.
data = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first rows to see which columns the file actually contains.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Remove the stray "Unnamed: N" columns so only v1 and v2 remain.
# Re-binding `data` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
data = data.drop(
    columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
    errors="ignore",
)

In [5]:
# Work on an independent deep copy so later changes to `df` do not touch `data`.
df = data.copy(deep=True)

In [6]:
# (rows, columns) of the working frame.
df.shape

(5572, 2)

In [7]:
# Count missing values per column.
df.isna().sum()

v1    0
v2    0
dtype: int64

In [8]:
# Show the column labels of the working frame.
print(df.columns)

Index(['v1', 'v2'], dtype='object')


Model Implementation

Naive Bayes

In [13]:
#Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [14]:
# Features are the message text (v2); targets are the ham/spam labels (v1).
X, y = df["v2"], df["v1"]

In [15]:
# Hold out 25% of the messages for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although spam is the minority
# class — consider passing stratify=y (this would change the reported metrics).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [18]:
#TF-IDF Vectorization
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer, so the test split does not
# influence the learned features. Keep this order (fit first, transform second).
Vectorizer = TfidfVectorizer()
X_train_tfidf = Vectorizer.fit_transform(X_train)
X_test_tfidf = Vectorizer.transform(X_test)

In [22]:
# Fit a multinomial Naive Bayes model (default settings) on the TF-IDF training matrix.
classifier = MultinomialNB()
classifier.fit(X=X_train_tfidf, y=Y_train)

In [23]:
# Label every held-out message with the fitted Naive Bayes model.
y_pred = classifier.predict(X=X_test_tfidf)

In [30]:
# Score the Naive Bayes predictions against the held-out labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
report_nb = classification_report(y_true=Y_test, y_pred=y_pred)
print("Accuracy :", accuracy)
print(report_nb)

Accuracy : 0.9597989949748744
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       1.00      0.71      0.83       191

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



Logistic Regression

In [31]:
#import libraries
from sklearn.linear_model import LogisticRegression

In [33]:
# Fit logistic regression on the same TF-IDF training matrix; the solver is
# allowed up to 1000 iterations.
classifier_log = LogisticRegression(max_iter=1000)
classifier_log.fit(X=X_train_tfidf, y=Y_train)

In [34]:
# Label every held-out message with the fitted logistic regression model.
y_pred_log = classifier_log.predict(X=X_test_tfidf)

In [38]:
# Evaluation: overall accuracy, then the per-class precision/recall/F1 table
# for the logistic regression predictions.
accuracy_log = accuracy_score(y_true=Y_test, y_pred=y_pred_log)
report_log = classification_report(y_true=Y_test, y_pred=y_pred_log)
print("Accuracy:", accuracy_log)
print(report_log)

Accuracy: 0.9633883704235463
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1202
        spam       0.99      0.74      0.85       191

    accuracy                           0.96      1393
   macro avg       0.98      0.87      0.91      1393
weighted avg       0.96      0.96      0.96      1393



Support Vector Machine

In [39]:
#import libraries
from sklearn.svm import SVC

In [41]:
# Fit a support vector classifier (default settings) on the TF-IDF training matrix.
classifier_SVC = SVC()
classifier_SVC.fit(X=X_train_tfidf, y=Y_train)

In [43]:
# Label every held-out message with the fitted support vector classifier.
y_pred_SVC = classifier_SVC.predict(X=X_test_tfidf)

In [48]:
# Evaluation: overall accuracy, then the per-class precision/recall/F1 table
# for the support vector classifier predictions.
accuracy_SVC = accuracy_score(y_true=Y_test, y_pred=y_pred_SVC)
report_SVC = classification_report(y_true=Y_test, y_pred=y_pred_SVC)
print("Accuracy:", accuracy_SVC)
print(report_SVC)

Accuracy: 0.9791816223977028
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1202
        spam       1.00      0.85      0.92       191

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Input Testing

In [None]:
def predict_email_loop(model=None, vectorizer=None):
    """Interactively classify typed-in messages until the user quits.

    Each entered line is transformed with ``vectorizer`` and passed to
    ``model.predict``; the first predicted label is printed. Typing ``exit``
    (any casing, surrounding whitespace ignored) ends the loop, as does
    end-of-input or a keyboard/kernel interrupt at the prompt.

    Parameters
    ----------
    model : object with a ``predict`` method, optional
        Classifier used for the prediction. Defaults to the notebook-level
        ``classifier`` (the Naive Bayes model). Any other model fitted on the
        same TF-IDF features, e.g. ``classifier_SVC``, can be passed instead.
    vectorizer : object with a ``transform`` method, optional
        Vectorizer the model was trained with. Defaults to the notebook-level
        ``Vectorizer``.

    Returns
    -------
    None
    """
    # Resolve the notebook-level defaults at call time (not definition time)
    # so the function picks up re-fitted objects after cells are re-run.
    if model is None:
        model = classifier
    if vectorizer is None:
        vectorizer = Vectorizer

    while True:
        try:
            input_email = input("Please enter the email text to check or type 'exit' to quit:")
        except (EOFError, KeyboardInterrupt):
            # No more input (or the kernel was interrupted while waiting):
            # leave the loop cleanly instead of surfacing a traceback.
            print("Exiting the email classifier")
            break

        # Compare on the trimmed text so "exit " or " Exit" also quits.
        command = input_email.strip()
        if command.lower() == 'exit':
            print("Exiting the email classifier")
            break
        if not command:
            # Nothing to classify; ask again rather than labelling an empty string.
            print("No text entered; please type a message or 'exit'.\n")
            continue

        email_tfidf = vectorizer.transform([input_email])
        prediction = model.predict(email_tfidf)
        print(f"The email is: {prediction[0]}\n")

# Start the interactive prompt. This call blocks the cell waiting for keyboard
# input until the loop is ended (e.g. by typing 'exit'), so "Run All" pauses here.
predict_email_loop()


Please enter the email text to check or type 'exit' to quit: Dear Muhammad Hasaan,  Congratulations on taking the first step towards advancing your career in Data Science with your interest in Applied Data Science Program: Leveraging AI for Effective Decision-Making by MIT Professional Education. As promised, here is a link to the program brochure to help you explore the program in detail.  If the program’s unique design, including live virtual sessions with MIT faculty, personalized mentorship, continuous program support interests you, please feel free to take the next step and complete your application form.


The email is: ham

