In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = pd.read_csv("mail_data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data = df.where((pd.notnull(df)),'')

In [6]:
data.loc[data['Category'] == 'ham', 'Category'] = 1
data.loc[data['Category'] == 'spam', 'Category'] = 0

In [7]:
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [9]:
x_train = train_data['Message']
y_train = train_data['Category']
x_train.shape

(4457,)

In [10]:
x_test = test_data['Message']
y_test = test_data['Category']
y_test.shape

(1115,)

In [11]:
feature_extration = TfidfVectorizer(min_df=1, stop_words="english", lowercase='True')

In [12]:
x_train_features = feature_extration.fit_transform(x_train)
x_test_features = feature_extration.transform(x_test)

In [13]:
y_train = y_train.astype('int8')
y_test = y_test.astype('int8')

In [14]:
print(x_train_features)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

In [15]:
model = SGDClassifier()

In [16]:
model.fit(x_train_features,y_train)

SGDClassifier()

In [17]:
pred = model.predict(x_train_features)
mse = mean_squared_error(y_train, pred)
rmse = np.sqrt(mse)
rmse

0.014978857285595261

In [18]:
acc_scores = accuracy_score(y_train ,pred)
acc_scores

0.9997756338344178

In [19]:
scores = cross_val_score(model,x_train_features, y_train, scoring="accuracy", cv=10)

In [20]:
scores.mean()

0.9813795535849247

In [21]:
confusion_matrix(y_train, pred)

array([[ 598,    0],
       [   1, 3858]])

In [22]:
model.fit(x_test_features, y_test)

SGDClassifier()

In [23]:
pred_test = model.predict(x_test_features)
pred_test[:15]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int8)

In [24]:
x_test[:15]

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
2973    Sary just need Tim in the bollox &it hurt him ...
2991    Love isn't a decision, it's a feeling. If we c...
2942    My supervisor find 4 me one lor i thk his stud...
230                    Dear good morning now only i am up
1181                           I'm in chennai velachery:)
1912    Lol grr my mom is taking forever with my presc...
1992    No other Valentines huh? The proof is on your ...
5435                    I'm wif him now buying tix lar...
4805    Er, hello, things didn‘t quite go to plan – is...
401     FREE RINGTONE text FIRST to 87131 for a poly o...
Name: Message, dtype: object

In [25]:
y_test[:15]

3245    1
944     1
1044    1
2484    1
812     1
2973    1
2991    1
2942    1
230     1
1181    1
1912    1
1992    1
5435    1
4805    1
401     0
Name: Category, dtype: int8

In [26]:
acc_scores = accuracy_score(y_test ,pred_test)
acc_scores

1.0

In [27]:
scores = cross_val_score(model,x_test_features, y_test, scoring="accuracy", cv=10)

In [28]:
scores.mean()

0.9722329472329474

In [29]:
input_mail = input("enter the mail: ")

input_mail = [input_mail]

feature = feature_extration.transform(input_mail)

pred_real = model.predict(feature)


if (pred_real[0] == 0):
    print("Spam Mail")
elif (pred_real[0] == 1):
    print("Real Mail")

enter the mail: Someone has contacted our dating service and entered your phone because they fancy you! To find out who it is call from a landline 09111032124 . PoBox12n146tf150p
Spam Mail
