Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

Data collection and pre-processing

In [None]:
# Read the mail dataset CSV into a DataFrame.
raw_mail_data = pd.read_csv(filepath_or_buffer='/content/mail_data.csv')

In [None]:
# Preview the first five rows of the raw data.
raw_mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Summarize the columns (non-null counts and dtypes) as a quick check for missing values
raw_mail_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Count the missing values in each column.
raw_mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [None]:
# Keep every non-null cell as it is and substitute an empty string for any null cell.
# The result is a new frame; raw_mail_data itself is not modified.
mail_data = raw_mail_data.where(cond=raw_mail_data.notna(), other='')

In [None]:
# Preview the first five rows of the cleaned data.
mail_data.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# (number of rows, number of columns) of the cleaned data
mail_data.shape


(5572, 2)

Encoding

In [None]:
# Encode the text labels in place: spam -> 0, ham -> 1.
spam_rows = mail_data['Category'] == 'spam'
ham_rows = mail_data['Category'] == 'ham'
mail_data.loc[spam_rows, 'Category'] = 0
mail_data.loc[ham_rows, 'Category'] = 1

# Preview the encoded labels.
mail_data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Split the frame into the message text (X) and its encoded label (Y).
X, Y = mail_data['Message'], mail_data['Category']
print(X)


0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [None]:
# Inspect the encoded label series (0 = spam, 1 = ham)
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


Splitting the data into training data and test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Sizes of the full message series and of its train/test partitions.
print(*(part.shape for part in (X, X_train, X_test)))

(5572,) (4457,) (1115,)


Feature Extraction

In [None]:
# Transform the raw message text into TF-IDF feature vectors that can be used
# as input to the logistic regression model.
#   min_df=1             keep every term that occurs in at least one document
#                        (an integer min_df is a document-count threshold,
#                        not a score threshold)
#   lowercase=True       lowercase the text before tokenizing
#   stop_words='english' drop scikit-learn's built-in English stop words
# The variable name `feature_extraction` is kept because the prediction cell
# calls `feature_extraction.transform(...)`.
feature_extraction = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')

# Fit the vocabulary and IDF weights on the training messages only, then apply
# the same fitted transform to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

print(X_test_features)

# Display the sparse matrix summary (shape, dtype, number of stored elements).
X_test_features

  (0, 6619)	0.33077540807715927
  (0, 4752)	0.44421921026428457
  (0, 2494)	0.359541012283057
  (0, 2313)	0.37081499071603014
  (0, 2110)	0.2538341210056606
  (0, 1623)	0.47755798461662824
  (0, 1153)	0.3660464944955722
  (1, 4140)	0.7724156535136
  (1, 3802)	0.40629294786687964
  (1, 3352)	0.4881599110135932
  (2, 3179)	0.3405136304031059
  (2, 3169)	0.9402395798463798
  (3, 6670)	0.4948874540031021
  (3, 6543)	0.5505088255084791
  (3, 2900)	0.6723291165103608
  (4, 7417)	0.4582086641273852
  (4, 6613)	0.6612385994559425
  (4, 5583)	0.3946308162640678
  (4, 1764)	0.443931136059295
  (5, 7144)	0.2525030795568811
  (5, 6017)	0.3435042181615311
  (5, 5522)	0.37192637792006283
  (5, 4761)	0.3253891605505013
  (5, 4161)	0.4423344697815598
  (5, 4048)	0.23654956954038084
  :	:
  (1111, 5132)	0.4888630580390552
  (1111, 5071)	0.3867437918860694
  (1111, 4094)	0.24494882973980492
  (1111, 3138)	0.24402169398619392
  (1111, 3084)	0.24749503861730665
  (1111, 1031)	0.4888630580390552
  (1112, 7

<bound method _data_matrix.astype of <1115x7458 sparse matrix of type '<class 'numpy.float64'>'
	with 7728 stored elements in Compressed Sparse Row format>>

In [None]:
# Cast the train and test labels (Y_train, Y_test) from object dtype to integers.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))
print(Y_train)

3890    1
5553    1
4366    1
3968    0
3771    1
       ..
3335    1
1099    1
2514    0
3606    1
2575    0
Name: Category, Length: 4457, dtype: int64


In [None]:
# Inspect the integer-encoded test labels
print(Y_test)

5086    1
2120    1
2318    1
2917    1
1352    1
       ..
884     1
3821    1
1066    1
208     1
1378    0
Name: Category, Length: 1115, dtype: int64


Training the model

Logistic Regression

In [None]:
# Logistic regression classifier created with its default constructor arguments
model=LogisticRegression()

In [None]:
# Train the logistic regression model on the TF-IDF training features and their labels
model.fit(X_train_features,Y_train)


LogisticRegression()

Evaluating the trained model

In [None]:
# Predict on the training features and score the predictions against the
# training labels.
Prediction_on_training_data = model.predict(X_train_features)
accuracy_on_the_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=Prediction_on_training_data,
)
print('The accuracy score of the traning data', accuracy_on_the_training_data)

The accuracy score of the traning data 0.9683643706529056


In [None]:
# Predict on the held-out test features and score the predictions against the
# test labels. The printed label names the test data so this result cannot be
# confused with the training accuracy.
Prediction_on_test_data = model.predict(X_test_features)
accuracy_on_the_test_data = accuracy_score(Y_test, Prediction_on_test_data)
print('The accuracy score of the test data', accuracy_on_the_test_data)

The accuracy score of the traning data 0.9524663677130045


In [None]:
# A single example message to classify (wrapped in a list because the
# vectorizer expects an iterable of documents).
input_mail = ["PRIVATE! Your 2004 Account Statement for 07742676969 shows 786 unredeemed Bonus Points. To claim call 08719180248 Identifier Code: 45239 Expires"]

# Convert the text to feature vectors with the already-fitted vectorizer.
input_data_features = feature_extraction.transform(input_mail)

# Predict the label for the message.
prediction = model.predict(input_data_features)
print(prediction)

# Labels were encoded as spam -> 0 and ham -> 1; prediction[0] is the label of
# the only message in the batch.
if prediction[0] == 1:
    print('Ham mail')
else:
    print('Spam mail')


[0]
Scam mail
