<a href="https://colab.research.google.com/github/maheshkumarreddypothu/Machine-Learning-Projects-and-Python-Projects/blob/main/EmailSpamDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the Dependencies

In [64]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Data collection and Pre-processing

In [30]:
# Load the mail dataset from the CSV file into a pandas DataFrame.
# NOTE(review): the path is absolute under /content — presumably the Colab runtime's
# working directory; adjust it when running the notebook elsewhere.
raw_mail_data=pd.read_csv('/content/mail_data.csv')

In [31]:
# Show the loaded frame; pandas abbreviates it to the first and last rows.
print(raw_mail_data)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [32]:
# Print the column names of the loaded DataFrame.
print(raw_mail_data.columns)

Index(['Category', 'Message'], dtype='object')


In [33]:
# Preview the first five rows of the raw dataset.
print(raw_mail_data.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


DATA CLEANING

In [34]:
# Substitute an empty string for every missing value.
# fillna returns a new frame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.fillna(value='')

In [35]:
# Display the first five rows of the cleaned data.
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [36]:
# Display the first five rows of the cleaned data (same preview as the previous cell).
mail_data.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [37]:
# Check the number of rows and columns as a (rows, columns) tuple.
mail_data.shape

(5572, 2)

Label Encoding

In [38]:
# Encode the target column in place: spam -> 0, ham -> 1.
# Rows whose Category is neither value are left as they are.
is_spam = mail_data['Category'] == 'spam'
is_ham = mail_data['Category'] == 'ham'
mail_data.loc[is_spam, 'Category'] = 0
mail_data.loc[is_ham, 'Category'] = 1

In [39]:
# Split the frame into the model input (message text) and the target (encoded category).
X, Y = mail_data['Message'], mail_data['Category']

In [40]:
# Inspect the message texts that serve as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [41]:
# Inspect the encoded labels (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Splitting the data into training data and test data

In [42]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [43]:
# Number of messages in the full dataset.
print(X.shape)

(5572,)


In [44]:
# Number of messages in the training split.
print(X_train.shape)

(4457,)


In [45]:
# Number of messages in the held-out test split.
print(X_test.shape)

(1115,)


Feature Extraction

In [46]:
# Transform the message text into TF-IDF feature vectors that can be used as
# input to the logistic regression.
# `lowercase` is a boolean flag; a non-bool value such as the string 'lower' is
# rejected by parameter validation in recent scikit-learn releases.
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)
# Fit the vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer for the test split so no test information leaks into training.
X_train_features=feature_extraction.fit_transform(X_train)
X_test_features=feature_extraction.transform(X_test)
# The encoded labels are still stored with object dtype; cast them to integers.
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [47]:
# Inspect the sparse TF-IDF matrix of the test split: (row, feature index) -> weight.
print(X_test_features)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

In [48]:
# Inspect the sparse TF-IDF matrix of the training split: (row, feature index) -> weight.
print(X_train_features)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

Training the model

Logistic Regression

In [49]:
# Create a logistic regression classifier with its default hyperparameters.
model=LogisticRegression()

In [50]:
# Train the logistic regression model on the TF-IDF features of the training split.
model.fit(X_train_features,Y_train)

LogisticRegression()

Evaluating the model

In [58]:
# Predict labels for the training data (the accuracy is computed in the next cell).
prediction_on_training_data=model.predict(X_train_features)

In [59]:
# Fraction of training messages whose predicted label matches the true label.
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)
print(accuracy_on_training_data)

0.9670181736594121


In [60]:
# Predict labels for the held-out test data, then measure the fraction that
# matches the true labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print(accuracy_on_testing_data)

0.9659192825112107


In [55]:
# Precision on the test data.
# pos_label=1 (the library default, spelled out here) means the positive class
# is ham under the 0 = spam / 1 = ham encoding, so this is ham precision;
# pass pos_label=0 to score spam detection instead.
precision_on_testing_data = precision_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
    pos_label=1,
)
print(precision_on_testing_data)

0.9619238476953907


In [57]:
# Recall on the test data.
# pos_label=1 (the library default, spelled out here) means the positive class
# is ham under the 0 = spam / 1 = ham encoding, so this is ham recall;
# pass pos_label=0 to measure how much spam is actually caught.
recall_on_testing_data = recall_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
    pos_label=1,
)
print(recall_on_testing_data)

1.0


In [67]:
# F1 score on the test data.
# pos_label=1 (the library default, spelled out here) means the positive class
# is ham under the 0 = spam / 1 = ham encoding; pass pos_label=0 for the spam class.
f1_score_on_testing_data = f1_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
    pos_label=1,
)
print(f1_score_on_testing_data)

0.9805924412665985


In [63]:
# Confusion matrix on the test data: rows are the true class and columns the
# predicted class, both ordered 0 (spam) then 1 (ham).
confusion_matrix_for_testing_data = confusion_matrix(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)
print(confusion_matrix_for_testing_data)

[[117  38]
 [  0 960]]


Building the predictive system

In [27]:
# Classify a single example message with the fitted vectorizer and model.
input_mail = [
    "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."
]
# Reuse the vocabulary learned from the training data; only transform here.
input_data_features = feature_extraction.transform(input_mail)
prediction = model.predict(input_data_features)
# Label 1 is ham; any other predicted label is reported as spam.
print('Ham' if prediction[0] == 1 else 'Spam')

Ham
