In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including any emitted later by pandas or scikit-learn. Consider narrowing the
# filter to specific categories so real problems stay visible.
warnings.filterwarnings("ignore")

In [2]:
# Load the labelled messages from a CSV file located in the current working directory.
raw_mail_data = pd.read_csv("mail_data.csv")

In [3]:
# Preview the first rows of the raw frame (columns: Category, Message).
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [5]:
# Count missing entries per column in the raw data.
raw_mail_data.isna().sum()

Category    0
Message     0
dtype: int64

In [6]:
# Work on a copy of the raw frame in which any missing value is replaced by an
# empty string, so later text processing never sees NaN.
df = raw_mail_data.fillna('')

In [7]:
# Confirm that no missing values remain after the replacement step.
df.isna().sum()

Category    0
Message     0
dtype: int64

In [8]:
# (rows, columns) of the cleaned frame.
df.shape

(96310, 2)

In [9]:
# Encode the labels as integers: spam -> 0, ham -> 1.
# The mapping reads from the untouched raw frame (same index as `df`) so this
# cell is idempotent. Mapping `df['Category']` onto itself would turn the
# already-encoded 0/1 values into NaN if the cell were run a second time.
df['Category'] = raw_mail_data['Category'].map({'spam': 0, 'ham': 1})

In [10]:
# Preview the frame after label encoding (Category should now be 0/1).
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# `Category` was already converted to integers by the `.map(...)` cell above, so
# comparing it with the strings 'spam' / 'ham' can no longer match any row.
# Instead of repeating the encoding, verify the invariant the rest of the
# notebook relies on: every label is 0 (spam) or 1 (ham). Unmapped labels would
# show up as NaN and fail this check.
assert df['Category'].isin([0, 1]).all(), "Category must contain only 0 (spam) or 1 (ham)"

In [12]:
# Preview the encoded frame again after the label-handling cell above.
df.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
# Split the frame into model input (message text) and target (encoded label).
X, Y = df['Message'], df['Category']

In [14]:
# Display the message Series used as model input.
X

0        Go until jurong point, crazy.. Available only ...
1                            Ok lar... Joking wif u oni...
2        Free entry in 2 a wkly comp to win FA Cup fina...
3        U dun say so early hor... U c already then say...
4        Nah I don't think he goes to usf, he lives aro...
                               ...                        
96305    Subject: re : research and development charges...
96306    Subject: re : receipts from visit  jim ,  than...
96307    Subject: re : enron case study update  wow ! a...
96308    Subject: re : interest  david ,  please , call...
96309    Subject: news : aurora 5 . 2 update  aurora ve...
Name: Message, Length: 96310, dtype: object

In [15]:
# Display the encoded label Series used as the target.
Y

0        1
1        1
2        0
3        1
4        1
        ..
96305    1
96306    1
96307    1
96308    1
96309    1
Name: Category, Length: 96310, dtype: int64

In [16]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Number of training messages.
X_train.shape

(77048,)

In [18]:
# Number of training labels; should match X_train.
y_train.shape

(77048,)

In [19]:
# Number of held-out test messages.
X_test.shape

(19262,)

In [20]:
# Turn raw text into TF-IDF feature vectors. The vectorizer is fitted on the
# training messages only and then reused unchanged on the test messages.
feature_extraction = TfidfVectorizer(
    stop_words="english",
    binary=True,
    min_df=1,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [21]:
# Summarise column dtypes and non-null counts of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96310 entries, 0 to 96309
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  96310 non-null  int64 
 1   Message   96310 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [22]:
# Make sure both label vectors have an integer dtype before fitting and scoring.
y_train, y_test = y_train.astype("int"), y_test.astype("int")

In [23]:
# Display the training messages (the original row index is kept after the shuffle).
X_train

45757    Subject: Claim Your Free Gift Now\n\nLose weig...
94141    Subject: energy book  hi grant ,  hope all is ...
37392                Don't worry about it, I'll handle it.
52792    Subject: Invitation to Team Event\n\nHi there,...
24938             Hey, are we still on for lunch tomorrow?
                               ...                        
6265     Exclusive offer! Buy one get one free. Limited...
54886    Subject: Re: Your Inquiry\n\nDear Professor,\n...
76820    Subject: Re: Payment Receipt\n\nHi Alex,\n\nIt...
860                Did he just say somebody is named tampa
15795    Congratulations! You've won a $1,000 gift card...
Name: Message, Length: 77048, dtype: object

In [24]:
# Print the sparse training matrix as (row, column) value entries.
# NOTE(review): this produces a long listing; `X_train_features.shape` would be
# a more compact sanity check.
print(X_train_features)

  (0, 32286)	0.11189112623712257
  (0, 8950)	0.21272957841066775
  (0, 15242)	0.21407995186698028
  (0, 15879)	0.25134516219027137
  (0, 21086)	0.2863648432354431
  (0, 36062)	0.26192699720600265
  (0, 14398)	0.22590624254808708
  (0, 536)	0.25918572945316565
  (0, 16431)	0.2605497072513179
  (0, 31301)	0.26133716787681904
  (0, 14015)	0.2868718249782433
  (0, 28667)	0.27153684759071933
  (0, 24688)	0.2803266625775151
  (0, 17497)	0.1670262831292782
  (0, 36068)	0.2875839464355492
  (0, 29859)	0.23674635129447355
  (0, 9337)	0.16418066565466535
  (1, 32286)	0.012003961169726851
  (1, 14015)	0.030776330201822688
  (1, 24688)	0.03007414873354973
  (1, 13310)	0.04580120461164661
  (1, 7149)	0.05324258339399668
  (1, 17106)	0.02483715601133046
  (1, 16235)	0.053676262423229405
  (1, 17352)	0.03445429636821145
  :	:
  (77045, 22110)	0.22114721341128754
  (77045, 20576)	0.1848162797481827
  (77045, 8653)	0.2506983559972213
  (77045, 4696)	0.301964906145067
  (77045, 16269)	0.2944153792406509

In [25]:
# Print the sparse test matrix as (row, column) value entries.
# NOTE(review): this produces a long listing; `X_test_features.shape` would be
# a more compact sanity check.
print(X_test_features)

  (0, 0)	0.13930762063471092
  (0, 215)	0.1522151078709765
  (0, 233)	0.1527293049173748
  (0, 668)	0.13861408265523878
  (0, 773)	0.15612857283478657
  (0, 819)	0.14653390214382528
  (0, 1114)	0.1229165402037212
  (0, 1294)	0.15871119804616368
  (0, 2636)	0.17723804963014986
  (0, 4451)	0.1696441743430287
  (0, 5692)	0.19281264537104634
  (0, 7311)	0.23881000573147698
  (0, 8340)	0.11768930800927227
  (0, 9378)	0.09633827020908997
  (0, 11037)	0.22172254284964374
  (0, 11403)	0.19252090234526292
  (0, 12800)	0.12551197337158623
  (0, 15139)	0.13391410254577474
  (0, 15313)	0.15084061601174986
  (0, 17426)	0.12703162515491095
  (0, 19230)	0.23698309615304994
  (0, 19324)	0.1683052752922227
  (0, 19552)	0.12059986392683414
  (0, 19713)	0.1665249521804995
  (0, 20285)	0.0961630598200752
  :	:
  (19258, 20215)	0.25286436698900466
  (19258, 20899)	0.22122882059292076
  (19258, 21130)	0.24932058638665586
  (19258, 22790)	0.25272132154322835
  (19258, 29619)	0.2528166496212476
  (19258, 3228

In [26]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default hyperparameters on the
# TF-IDF training features.
# NOTE(review): warnings are globally ignored in the first cell, so a solver
# convergence warning (if any) would not be visible here. Consider inspecting
# `model.n_iter_` after fitting to confirm.
model = LogisticRegression()
model.fit(X_train_features, y_train)

In [27]:
# Accuracy on the same rows the model was fitted on (training accuracy).
prediction_train_data = model.predict(X_train_features)
accuracy_train_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_train_data,
)

In [28]:
# Report the training accuracy. The label spelling is corrected ("Accuarcy" ->
# "Accuracy") to match the "Test Accuracy" message printed at the end of the notebook.
print("Accuracy on train data: ", accuracy_train_data)

Accuarcy on train data:  0.9956131242861592


In [29]:
# Accuracy on the held-out test rows the model has not seen during fitting.
prediction_test_data = model.predict(X_test_features)
accuracy_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_test_data,
)

In [30]:
# Report the held-out accuracy. The label spelling is corrected ("Accuarcy" ->
# "Accuracy") to match the "Test Accuracy" message printed at the end of the notebook.
print("Accuracy on test data: ", accuracy_test_data)

Accuarcy on test data:  0.9946526840411172


In [31]:
# Classify one hand-written message end to end: vectorise it with the fitted
# TF-IDF transformer, predict, and translate the numeric label (1 = ham,
# anything else = spam) into a readable verdict.
input_user_mail = [
    "Congratulations! you won a baby. Text WON to 44255 to claim your prize.",
]
input_data_features = feature_extraction.transform(input_user_mail)
prediction = model.predict(input_data_features)

print("This is a ham mail" if prediction[0] == 1 else "This is a spam mail")

This is a spam mail


In [32]:
import pickle

# Persist the fitted classifier and the fitted TF-IDF vectorizer so they can be
# reloaded together later. The `with` blocks guarantee each file is flushed and
# closed even if pickling raises; the bare `open(...)` calls used before left
# the handles open.
with open("logistic_regression.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("feature_extraction.pkl", "wb") as vectorizer_file:
    pickle.dump(feature_extraction, vectorizer_file)

In [33]:
# NOTE(review): this message is printed unconditionally; the cell does not
# itself check that the pickle files were written.
print("Model and feature extraction saved successfully")

Model and feature extraction saved successfully


In [34]:
# Re-print the held-out accuracy stored earlier in `accuracy_test_data` as the final summary.
print("Test Accuracy: ", accuracy_test_data)

Test Accuracy:  0.9946526840411172
