In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [14]:
# Load the labelled mail dataset (columns: Category, Message) into a DataFrame.
# The file location can be overridden with the MAIL_DATA_CSV environment variable so
# the notebook can run on machines other than the author's; when the variable is not
# set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'MAIL_DATA_CSV',
    r'C:\Users\Muhammad_Talha\Desktop\Spam_emails\mail_data.csv',
)
df = pd.read_csv(DATA_PATH)

In [15]:
# Plain-text dump of the raw frame; pandas truncates the middle rows and reports the
# overall size (rows x columns) at the end.
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [16]:
# Build the working copy: keep every non-null cell as is and substitute an empty
# string wherever the raw frame has a missing value.
data = df.where(df.notna(), other='')

In [17]:
# Preview the first ten rows of the cleaned frame.
data.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [18]:
# Column names, non-null counts and dtypes of the cleaned frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [19]:
# (rows, columns) of the cleaned frame.
data.shape

(5572, 2)

In [20]:
# Encode the text labels in place: spam -> 0, ham -> 1.
# Rows whose label is neither value are left untouched. The column is written
# element-wise, so it is not cast to an integer dtype here.
spam_rows = data['Category'] == 'spam'
ham_rows = data['Category'] == 'ham'
data.loc[spam_rows, 'Category'] = 0
data.loc[ham_rows, 'Category'] = 1

In [21]:
# Separate the model input (message text) from the target (encoded label).
X, Y = data['Message'], data['Category']

In [22]:
# Show the message texts that serve as model input.
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [23]:
# Show the encoded labels (0 = spam, 1 = ham).
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [24]:
# Hold out half of the samples for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=6,
)

In [25]:
# Sanity-check the split sizes of the inputs: full set, training part, test part.
for messages in (X, X_train, X_test):
    print(messages.shape)

(5572,)
(2786,)
(2786,)


In [26]:
# Sanity-check the split sizes of the labels: full set, training part, test part.
for labels in (Y, Y_train, Y_test):
    print(labels.shape)

(5572,)
(2786,)
(2786,)


In [27]:
# Labels were written element-wise earlier, so cast them to an integer dtype
# before they are handed to the classifier.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# TF-IDF vectoriser: lower-cases the text, drops English stop words and keeps every
# term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vocabulary and IDF weights are learned from the training messages only; the
# test messages are projected onto that same vocabulary.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [28]:
# Show the training messages; the index values are the original row numbers in
# shuffled order.
print(X_train)

4426                  Just now saw your message.it k da:)
4346                                  Enjoy urself tmr...
2042                  Ü dun wan to watch infernal affair?
67      Urgent UR awarded a complimentary trip to Euro...
628     Yup i thk they r e teacher said that will make...
                              ...                        
335     Valentines Day Special! Win over £1000 in our ...
4714    Big brother‘s really scraped the barrel with t...
2004    S....s...india going to draw the series after ...
227     Will u meet ur dream partner soon? Is ur caree...
2761           I dont thnk its a wrong calling between us
Name: Message, Length: 2786, dtype: object


In [29]:
# Show the sparse TF-IDF matrix: each entry prints as (row, feature index)  weight.
print(X_train_features)

  (0, 1576)	0.4513433653937027
  (0, 3318)	0.505475621368701
  (0, 4393)	0.64125251245926
  (0, 2864)	0.35997052376784827
  (1, 5120)	0.5666962257908098
  (1, 5328)	0.6099037183098603
  (1, 1924)	0.5539610474268547
  (2, 633)	0.5560141759458145
  (2, 2714)	0.49418201742949575
  (2, 5474)	0.40774339939444404
  (2, 5453)	0.3755601154214415
  (2, 1844)	0.37327240689810814
  (3, 329)	0.2609571121059985
  (3, 3157)	0.2609571121059985
  (3, 183)	0.18648625863872828
  (3, 4546)	0.2609571121059985
  (3, 3413)	0.2609571121059985
  (3, 389)	0.17737502494795554
  (3, 232)	0.18501291436622774
  (3, 516)	0.24811554946727302
  (3, 1729)	0.20420995680827672
  (3, 5236)	0.14533641986718113
  (3, 1335)	0.1571660373049042
  (3, 185)	0.18501291436622774
  (3, 1936)	0.2609571121059985
  :	:
  (2783, 4693)	0.3817949314802585
  (2783, 2707)	0.3407676930237244
  (2783, 1806)	0.30859123410517086
  (2783, 4467)	0.3817949314802585
  (2783, 5674)	0.3116584040858447
  (2783, 2349)	0.22898314870801897
  (2784, 776

In [30]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
model = LogisticRegression()

In [31]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X_train_features, Y_train)

In [32]:
# Score the fitted model on the data it was trained on (an optimistic estimate that
# is useful mainly for comparison with the held-out score).
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [33]:
# Report the fraction of training messages classified correctly.
print("Accuracy on training data : ",accuracy_on_training_data)

Accuracy on training data :  0.9605168700646087


In [34]:
# Score the fitted model on the held-out half of the data.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)


In [35]:
# Report the fraction of held-out messages classified correctly.
print('Accuracy on test data : ',accuracy_on_test_data)

Accuracy on test data :  0.9576453697056713


In [36]:
# Classify one hand-entered message with the fitted vectoriser and model.
# NOTE(review): this text starts the same way as row 8 of the dataset preview, so it
# may not be a genuinely unseen example - confirm before treating it as a demo of
# generalisation.
input_your_mail = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

# Reuse the vocabulary learned from the training data; do not refit here.
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding: 1 = ham, anything else (0) = spam.
print('Ham mail' if prediction[0] == 1 else 'Spam Mails')

[0]
Spam Mails
