# Spam E-Mail Prediction
Dataset available at: https://www.kaggle.com/datasets/venky73/spam-mails-dataset

### Loading data


In [20]:
import numpy as np
import pandas as pd

In [21]:
# Read the spam/ham CSV (path is relative to the working directory) and preview the first rows.
dataset = pd.read_csv("spam_ham_dataset.csv")
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


### Data exploration


In [22]:
# Check each column's dtype and non-null count to spot any missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


##### **Features explained**
0. Unnamed: 0: a row identifier; think of this as an ID
1. label: the label of the email, which is either `spam` or `ham`
2. text: the content of the email
3. label_num: the numeric label; 1 if the email is spam, otherwise 0

We only need two features: `text` and `label_num`.

In [23]:
# Class balance in percent: roughly 29% of the emails are spam, the rest are ham.
dataset["label"].value_counts(normalize=True).mul(100)

ham     71.01141
spam    28.98859
Name: label, dtype: float64

### Data Pre Processing

In [24]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer # convert text into feature vectors i.e: numarical values

In [25]:
# Inputs are the raw email texts; targets are the numeric labels (1 = spam, 0 = ham).
X, Y = dataset['text'].values, dataset['label_num'].values

In [26]:
# Hold out 20% of the emails for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [27]:
# Initialise the TF-IDF vectoriser that converts raw email text into numeric features.
# `lowercase` must be the boolean True, not the string 'True': the string only worked
# because non-empty strings are truthy, and scikit-learn releases that validate
# parameter types may reject it.
feature_vectors = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [28]:
# Fit the vectoriser on the training texts only, then reuse that fitted vectoriser
# for the test texts so nothing from the test set influences the fitted features.
# NOTE(review): both names are rebound from raw texts to the transformed features,
# so re-run the train/test split cell before re-running this one.
X_train = feature_vectors.fit_transform(X_train)
X_test = feature_vectors.transform(X_test)

### Modelling and training


In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
# Build a logistic-regression classifier with default settings, then train it
# on the vectorised training emails and their labels.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

LogisticRegression()

### Evaluation

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Predict labels for the held-out emails and report accuracy as a percentage.
Ypred = model.predict(X_test)
accuracy_pct = accuracy_score(Y_test, Ypred) * 100
print(f'Accuracy Score: {accuracy_pct}')

Accuracy Score: 98.06763285024155


### Realtime prediction

In [33]:
def realtime(text: str, vectorizer=None, classifier=None) -> str:
    """Classify a single raw email as ``'ham'`` or ``'spam'``.

    Args:
        text: Raw text of one email (a single document, not a list).
        vectorizer: Fitted object exposing ``transform``. Defaults to the
            notebook-level ``feature_vectors``.
        classifier: Fitted object exposing ``predict``. Defaults to the
            notebook-level ``model``.

    Returns:
        ``'ham'`` when the predicted label equals 0, otherwise ``'spam'``.
    """
    # Fall back to the notebook globals only when nothing is passed explicitly,
    # so existing calls such as ``realtime(email_text)`` behave as before.
    if vectorizer is None:
        vectorizer = feature_vectors
    if classifier is None:
        classifier = model

    # Wrap the single document in a list: it is transformed as a one-item batch.
    features = vectorizer.transform([text])
    predicted_label = classifier.predict(features)[0]
    return 'ham' if predicted_label == 0 else 'spam'

In [34]:
# Sample email for a quick end-to-end check of realtime().
# NOTE(review): the subject line matches row 0 of the dataset preview (labelled ham),
# so this text may already be part of the training data — confirm before treating
# the result as evidence of generalisation.
email_text = (
    "Subject: enron methanol ; meter # : 988291\r\n"
    "this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\r\n"
    "flow data provided by daren } .\r\n"
    "please override pop ' s daily volume { presently zero } to reflect daily\r\n"
    "activity you can obtain from gas control .\r\n"
    "this change is needed asap for economics purposes"
)
print(realtime(email_text))

ham
