In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled messages (columns: Category, Message) from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# First look at the loaded frame: one row per message with its Category label.
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Count missing values per column; both columns must be complete before modelling.
df.isna().sum()

Category    0
Message     0
dtype: int64

### Label Encoding

Encode the target labels as integers: spam = 1, ham = 0.

In [5]:
# Encode the target as integers: ham -> 0, spam -> 1.
# An explicit mapping followed by astype('int64') guarantees an integer column without
# relying on replace() implicitly downcasting an object column (deprecated in pandas 2.2).
# Values that are already encoded pass through unchanged, so re-running this cell is safe;
# an unexpected non-numeric label makes astype() raise instead of silently reaching the model.
LABEL_CODES = {'ham': 0, 'spam': 1}
df['Category'] = (
    df['Category']
    .map(lambda label: LABEL_CODES.get(label, label))
    .astype('int64')
)

In [6]:
# Re-display the frame to confirm Category now holds the 0/1 codes instead of ham/spam.
df

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### Separating features and target

In [7]:
# Model input is the raw message text; the target is the encoded Category column.
X, Y = df['Message'], df['Category']

### Splitting the data into training and test sets

In [8]:
# Hold out 5% of the messages for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified although ham heavily outnumbers spam in the
# held-out labels — consider stratify=Y, but confirm first since it changes the reported scores.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=3,
)

In [9]:
# Shapes of the full data, the training split and the test split, one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(5572,)
(5293,)
(279,)


### Transforming the text data into feature vectors

In [10]:
# Turn each message into a TF-IDF weighted bag-of-words vector.
# English stop words are dropped and min_df=1 keeps every remaining term; lowercasing is
# left at the vectorizer's default. TfidfVectorizer is imported in the imports cell at the top.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english')

# Learn the vocabulary and IDF weights from the training messages only, then apply that
# fitted transformation to the test messages so nothing from the test set leaks into training.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [11]:
# Peek at the raw training messages; the index keeps the original row labels in shuffled order.
X_train

236     I have printed it oh. So  &lt;#&gt;  come upst...
4368    I like dis sweater fr mango but no more my siz...
4298    thesmszone.com lets you send free anonymous an...
5219    Pls she needs to dat slowly or she will vomit ...
203     Your account has been refilled successfully by...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 5293, dtype: object

In [12]:
# Print the sparse TF-IDF matrix: each output line is "(row, column)  weight" for one stored entry.
print(X_train_features)

  (0, 7639)	0.5718297894933473
  (0, 2039)	0.2861680331687694
  (0, 3483)	0.2781853804101716
  (0, 4535)	0.2781853804101716
  (0, 5221)	0.3306887187982115
  (0, 5758)	0.5718297894933473
  (1, 3992)	0.4092300817352799
  (1, 6591)	0.4236081538504681
  (1, 4639)	0.44387290832365106
  (1, 3175)	0.3431700015086182
  (1, 7079)	0.44387290832365106
  (1, 2513)	0.31197537199842806
  (1, 4386)	0.2096336615096896
  (2, 839)	0.3514421880405683
  (2, 5675)	0.324013276413347
  (2, 4752)	0.20124877966836952
  (2, 6400)	0.2805394800486066
  (2, 3854)	0.20909094885426538
  (2, 4754)	0.25149517862893395
  (2, 4663)	0.3514421880405683
  (2, 1046)	0.3514421880405683
  (2, 3188)	0.16579825831951736
  (2, 6398)	0.1722935031563946
  (2, 4360)	0.26675644261558307
  (2, 2033)	0.21555397525906075
  :	:
  (5290, 3492)	0.31506881480455223
  (5290, 5083)	0.26227639260755153
  (5291, 7524)	0.30455078249353734
  (5291, 2495)	0.37148059021526847
  (5291, 7855)	0.30455078249353734
  (5291, 5211)	0.3092023221405082
  (

### model - Logistic Regression

In [13]:
# Logistic regression classifier; no arguments are passed, so the library defaults apply.
model = LogisticRegression()

In [14]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X=X_train_features, y=Y_train)

### prediction on train data

In [15]:
# Score the model on the same messages it was trained on (baseline for spotting overfitting).
prediction_train = model.predict(X=X_train_features)
accuracy_train = accuracy_score(y_true=Y_train, y_pred=prediction_train)

In [16]:
# Report the fraction of training messages classified correctly.
print('Accuracy on training data : ', accuracy_train)

Accuracy on training data :  0.9710938976006046


### prediction on test data

In [17]:
# Score the model on the held-out messages it has never seen.
prediction_test = model.predict(X=X_test_features)
accuracy_test = accuracy_score(y_true=Y_test, y_pred=prediction_test)

In [18]:
# Report the fraction of held-out messages classified correctly.
print('Accuracy on test data : ', accuracy_test)

Accuracy on test data :  0.9605734767025089


In [19]:
# Predicted labels for the held-out messages (1 = spam, 0 = ham), for comparison with the true labels.
prediction_test

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [20]:
# True labels of the held-out messages as a plain array, in the same order as the predictions.
Y_test.to_numpy()

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0], dtype=int64)