IMPORTING THE DEPENDENCIES

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

DATA COLLECTION & PRE-PROCESSING

In [5]:
# Read the mail dataset from its CSV file into a pandas DataFrame.
raw_mail_data = pd.read_csv(filepath_or_buffer='datasets/mail_data.csv')

In [13]:
# Swap every missing value for an empty string; fillna returns a new frame,
# so raw_mail_data itself is left as loaded.
mail_data = raw_mail_data.fillna(value='')

In [14]:
# (number of rows, number of columns) of the frame after missing values were filled
mail_data.shape

(5572, 2)

LABEL ENCODING

In [15]:
# spam mail => 0 ; ham mail => 1
#
# Encode the labels with a single vectorised map so that 'Category' ends up as
# a genuine integer column. Writing ints into the existing text column row by
# row leaves a mixed int/str object column, and is rejected outright when the
# column uses a dedicated string dtype.
# Already-encoded 0/1 values map to themselves, so re-running this cell is safe.
label_codes = {'spam': 0, 'ham': 1, 0: 0, 1: 1}
encoded_category = mail_data['Category'].map(label_codes)

# Anything outside the mapping (e.g. an empty string left by the missing-value
# fill) becomes NaN here. Fail now, naming the offending values, instead of at
# the later integer cast.
unmapped_labels = mail_data.loc[encoded_category.isna(), 'Category'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Unexpected values in 'Category': {unmapped_labels}")

mail_data['Category'] = encoded_category.astype('int')

In [17]:
# Pull out the model inputs (message text) and the targets (category label).
X, Y = mail_data['Message'], mail_data['Category']

TRAIN TEST SPLIT

In [22]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [25]:
# Turn the raw message text into TF-IDF feature vectors that the logistic
# regression can consume. English stop words are dropped and text is lowercased.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that same fitted vectorizer.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Cast the label series to integers.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

TRAINING THE MODEL - LOGISTIC REGRESSION

In [26]:
# logistic regression classifier, constructed with no arguments (library defaults)
model = LogisticRegression()

In [27]:
# Fit the classifier on the vectorized training messages and their labels.
model.fit(X=X_train_features, y=Y_train)

EVALUATING THE MODEL

In [30]:
# Score the fitted model on the same data it was trained on.
pred_on_train_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=pred_on_train_data,
)

In [31]:
# fraction of training messages whose predicted label matches the true label
accuracy_on_training_data

0.9676912721561588

In [34]:
# Score the fitted model on the held-out test messages.
pred_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=pred_on_test_data,
)

In [35]:
# fraction of held-out test messages whose predicted label matches the true label
accuracy_on_test_data

0.9668161434977578

BUILDING A PREDICTIVE SYSTEM

In [38]:
# One sample message, wrapped in a list because the vectorizer takes a
# collection of documents rather than a bare string.
input_mail = ["WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

# Vectorize with the already-fitted TF-IDF vectorizer, then classify.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)

# Label 1 means ham; any other predicted label is reported as spam.
verdict = "Ham Mail" if prediction[0] == 1 else "Spam Mail"
print(verdict)

Spam Mail
