### Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data Collection and Preprocessing

In [2]:
# Load the raw spam/ham message dataset from a local CSV export.
# NOTE(review): DATA_PATH is an absolute, machine-specific location; point it
# at your own copy of the file before running the notebook elsewhere.
DATA_PATH = r"C:\Users\DELL\Downloads\Spam Email Detection - spam.csv"
raw_mail_data = pd.read_csv(DATA_PATH)

In [3]:
# Display the freshly loaded frame; besides v1 and v2 it contains three "Unnamed" columns.
raw_mail_data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will �_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Remove the unnecessary columns: keep only the label (v1) and the message
# text (v2) and drop the trailing "Unnamed: *" columns.
# Selecting by name rather than by position makes this cell fail loudly with a
# KeyError if the CSV layout ever changes, instead of silently keeping the
# wrong columns.
raw_mail_data = raw_mail_data[["v1", "v2"]]

In [5]:
# Confirm that only the v1 (label) and v2 (message text) columns remain.
raw_mail_data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Summarise the column dtypes and non-null counts of the trimmed frame.
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Count the missing values in each column.
# Series.isna().sum() is the vectorised pandas idiom; the built-in sum() would
# loop over the Series element by element in Python.
print(raw_mail_data["v1"].isna().sum())
print(raw_mail_data["v2"].isna().sum())

0
0


There are no null values in either column (`v1` or `v2`).

In [8]:
# Work on an independent copy so that later edits to mail_data leave raw_mail_data untouched.
mail_data=raw_mail_data.copy()

In [9]:
# Preview the first 5 rows of the working copy (mail_data).
mail_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Check the size of the dataframe as a (rows, columns) tuple.
mail_data.shape

(5572, 2)

In [11]:
# Check whether the classes are balanced: count the messages carrying each label in v1.
mail_data["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

#### Label Encoding
Convert the text labels to integers using a manual label encoding: spam mail becomes 0 and ham mail becomes 1.

In [12]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
# The mapping reads from raw_mail_data (which still holds the original text
# labels) rather than from mail_data itself, so re-running this cell always
# gives the same result. Mapping mail_data["v1"] onto itself would turn the
# already encoded 0/1 values into NaN on a second run.
label_encoding = {"spam": 0, "ham": 1}
mail_data["v1"] = raw_mail_data["v1"].map(label_encoding)

# Series.map yields NaN for any label missing from the mapping; fail early
# instead of silently training on NaN targets.
assert mail_data["v1"].notna().all(), "unexpected label value in column v1"

In [13]:
# Verify the encoding: v1 should now hold integer codes instead of text labels.
mail_data.head()

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Separate the data into the model input (message text, column v2) and the
# target (encoded label, column v1).
class_label = mail_data["v1"]
feature_variable = mail_data["v2"]

In [15]:
# Inspect the message texts that serve as model input.
feature_variable

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will �_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [16]:
# Inspect the encoded labels that serve as the model target.
class_label

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: v1, Length: 5572, dtype: int64

#### Splitting the data into training data and test data

In [17]:
# Hold out 20% of the messages for testing. Stratifying on the label keeps the
# spam/ham proportions approximately equal in both splits, and the fixed
# random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature_variable,
    class_label,
    test_size=0.2,
    random_state=2,
    stratify=class_label,
)

In [18]:
# Number of messages in the training split.
x_train.shape

(4457,)

In [19]:
# Number of messages in the test split.
x_test.shape

(1115,)

In [20]:
# Number of training labels; should match the length of x_train.
y_train.shape

(4457,)

In [21]:
# Number of test labels; should match the length of x_test.
y_test.shape

(1115,)

#### Feature Extraction
Transform the text data into feature vectors (numerical values) that can be used as input to our model.

In [24]:
# TF-IDF vectoriser: lower-cases the text, drops English stop words, and keeps
# every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [25]:
# Fit the vectoriser on the training texts only, then reuse the fitted
# vectoriser to transform the test texts, so the test split does not
# influence the learned features.
x_train_final=feature_extraction.fit_transform(x_train)
x_test_final=feature_extraction.transform(x_test)

In [26]:
# Inspect the sparse TF-IDF matrix produced for the training split.
x_train_final

<4457x7409 sparse matrix of type '<class 'numpy.float64'>'
	with 34599 stored elements in Compressed Sparse Row format>

#### Training our model

In [27]:
# Logistic regression classifier created with scikit-learn's default hyperparameters.
model=LogisticRegression()

In [28]:
# Train the classifier on the TF-IDF training features and their encoded labels.
model.fit(x_train_final,y_train)

#### Evaluating our trained model

In [30]:
# Predict on the training split and measure the fraction of messages that are
# classified correctly.
prediction_on_training_data = model.predict(x_train_final)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)

In [31]:
# Fraction of training messages classified correctly.
accuracy_on_training_data

0.9679156383217411

In [32]:
# Predict on the held-out test split and measure the fraction of messages that
# are classified correctly.
prediction_on_test_data = model.predict(x_test_final)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)

In [33]:
# Fraction of held-out test messages classified correctly.
accuracy_on_test_data

0.9713004484304932

#### Building a predictive system

In [39]:
# Classify a single hand-written example message.
input_mail = ["I HAVE A DATE ON SUNDAY WITH WILL!!"]

# Vectorise the text with the already fitted TF-IDF vectoriser, then predict.
feature = feature_extraction.transform(input_mail)
prediction = model.predict(feature)

# Show the raw encoded prediction, followed by its readable name
# (1 is reported as ham, anything else as spam).
print(prediction)
print("Ham" if prediction[0] == 1 else "Spam")

[1]
Ham
