## Spam Mail Prediction system

#### Import Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Load The Data

In [2]:
# Read the mail dataset from a relative CSV path, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(5572, 2)

In [4]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


`info()` shows that both columns have dtype `object` (strings), so we will convert them into numeric (int) data.

In [5]:
# Summary statistics for the columns of df.
df.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# Count how many rows carry each distinct Category value.
df["Category"].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

#### Label Encoding: if a mail is spam its label will be '1'; if it is not spam ('ham') its label will be '0'

In [7]:
# Numeric codes for the two classes: spam -> 1, ham -> 0.
spam = 1
ham = 0

# Overwrite the text labels in the Category column with their numeric codes.
df.loc[df['Category'] == 'spam', 'Category'] = spam
df.loc[df['Category'] == 'ham', 'Category'] = ham

In [8]:
# Show the value bound to `spam` by the label-encoding cell.
print("Label for spam mail : ",spam)

Label for spam mail :  1


In [9]:
# Show the value bound to `ham` by the label-encoding cell.
print("Label for ham mail : ",ham)


Label for ham mail :  0


So our labels are 0 and 1.

#### Separating features or getting features(x) and label(y) 

In [10]:
 # Features are the message column; labels are the Category column.
 x, y = df['Message'], df['Category']

In [11]:
# Print the feature series (the Message column selected as x).
print('Feature are :',x)

Feature are : 0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [12]:
# Print the label series (the Category column selected as y).
print('Labels are',y)


Labels are 0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: Category, Length: 5572, dtype: object


Train/test splitting

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=3,
    test_size=0.2,
)

Check the shapes of the full, train and test data for both x and y

In [14]:
# Report the shape of the full, train and test series for features and labels,
# one line each, in the same format as print(label, shape).
print(
    *(
        f"{label} {data.shape}"
        for label, data in (
            ('shape of x : ', x),
            ('shape of x_train : ', x_train),
            ('shape of x_test : ', x_test),
            ('shape of y : ', y),
            ('shape of y_train : ', y_train),
            ('shape of y_test : ', y_test),
        )
    ),
    sep="\n",
)

shape of x :  (5572,)
shape of x_train :  (4457,)
shape of x_test :  (1115,)
shape of y :  (5572,)
shape of y_train :  (4457,)
shape of y_test :  (1115,)


Now we are going to convert the text data into numeric data for further processing, i.e. 'FEATURE EXTRACTION'

In [15]:
# TF-IDF vectorizer: lowercases text, drops English stop words, and keeps
# every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

Feature extraction for the features (x)

In [16]:
# Fit the vectorizer on the training messages only and vectorize them.
x_train_feature = feature_extraction.fit_transform(raw_documents=x_train)
# Reuse the fitted vectorizer (no refit) for the test messages.
x_test_feature = feature_extraction.transform(raw_documents=x_test)

In [17]:
# Print the vectorized training features produced by fit_transform.
print(x_train_feature)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [18]:
# Print the vectorized test features produced by transform.
print(x_test_feature)

  (0, 7271)	0.1940327008179069
  (0, 6920)	0.20571591693537986
  (0, 5373)	0.2365698724638063
  (0, 5213)	0.1988547357502182
  (0, 4386)	0.18353336340308998
  (0, 1549)	0.2646498848307188
  (0, 1405)	0.3176863938914351
  (0, 1361)	0.25132445289897426
  (0, 1082)	0.2451068436245027
  (0, 1041)	0.28016206931555726
  (0, 405)	0.2381316303003606
  (0, 306)	0.23975986557206702
  (0, 20)	0.30668032384591537
  (0, 14)	0.26797874471323896
  (0, 9)	0.2852706805264544
  (0, 1)	0.2381316303003606
  (1, 7368)	0.29957800964520975
  (1, 6732)	0.42473488678029325
  (1, 6588)	0.3298937975962767
  (1, 6507)	0.26731535902873493
  (1, 6214)	0.3621564482127515
  (1, 4729)	0.22965776503163893
  (1, 4418)	0.3457696891316818
  (1, 3491)	0.496093956101028
  (2, 7205)	0.22341717215670331
  :	:
  (1110, 3167)	0.5718357066163949
  (1111, 7353)	0.4991205841293424
  (1111, 6787)	0.40050175714278885
  (1111, 6033)	0.4714849709283488
  (1111, 3227)	0.44384935772735523
  (1111, 2440)	0.4137350055985486
  (1112, 7071)

Now change the type of the labels (y): the values are already numeric but stored with object dtype, so we just cast them to int

In [19]:
# Cast both label series to an integer dtype so the classifier gets numeric targets.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [20]:
# Inspect the training labels (entry count and dtype) after the integer cast.
y_train.info()

<class 'pandas.core.series.Series'>
Int64Index: 4457 entries, 3075 to 1688
Series name: Category
Non-Null Count  Dtype
--------------  -----
4457 non-null   int32
dtypes: int32(1)
memory usage: 52.2 KB


In [21]:
# Inspect the test labels (entry count and dtype) after the integer cast.
y_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 1115 entries, 2632 to 1136
Series name: Category
Non-Null Count  Dtype
--------------  -----
1115 non-null   int32
dtypes: int32(1)
memory usage: 13.1 KB


Now train the model on the features and labels of the training data

In [22]:
# Logistic regression with default hyperparameters, fitted on the TF-IDF
# training features and their integer labels. fit() is left as the last
# expression so the fitted estimator is displayed.
model = LogisticRegression()
model.fit(X=x_train_feature, y=y_train)

LogisticRegression()

### Prediction of training data

In [23]:
# Predicted labels for the same rows the model was trained on.
prediction_of_train_data = model.predict(X=x_train_feature)

Check the accuracy of train data

In [24]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# predictions. Accuracy is symmetric, so the value is unchanged, but keeping
# the documented order avoids silent errors if an asymmetric metric
# (precision, recall, ...) is swapped in later.
accuracy_of_train_data = accuracy_score(y_train, prediction_of_train_data)
print("Accuracy of train Data : ", accuracy_of_train_data)

Accuracy of train Data :  0.9670181736594121


### Prediction of test data

In [25]:
# Predicted labels for the held-out test rows.
prediction_of_test_data = model.predict(X=x_test_feature)

Check the accuracy of test data

In [26]:
# accuracy_score expects (y_true, y_pred): ground-truth labels first, then
# predictions. Accuracy is symmetric, so the value is unchanged, but keeping
# the documented order avoids silent errors if an asymmetric metric
# (precision, recall, ...) is swapped in later.
accuracy_of_test_data = accuracy_score(y_test, prediction_of_test_data)
print("Accuracy of test Data : ", accuracy_of_test_data)

Accuracy of test Data :  0.9659192825112107


## Now build a system for evaluation and deployment

In [27]:
# Example 1: a promotional message. It is vectorized with the already-fitted
# TF-IDF vectorizer and then classified by the trained model.
recieved_mail = [
    "As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589"
]
conversion = feature_extraction.transform(recieved_mail)
prediction = model.predict(conversion)

# Label 1 means spam; anything else is reported as ham.
print(
    "The recieved mail is spam mail"
    if prediction[0] == 1
    else "The Recieved mail is not spam mail or it is ham mail"
)

The recieved mail is spam mail


In [28]:
# Example 2: a casual personal message, run through the same
# vectorize-then-predict steps.
recieved_mail = [
    "Sorry my roommates took forever, it ok if I come by now?"
]
conversion = feature_extraction.transform(recieved_mail)
prediction = model.predict(conversion)

# Anything other than label 1 is reported as ham; label 1 is spam.
if prediction[0] != 1:
    print("The Recieved mail is not spam mail or it is ham mail")
else:
    print("The recieved mail is spam mail")

The Recieved mail is not spam mail or it is ham mail
