In [1]:
#importing the dependencies 
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
#Data collection and pre-processing 
#load data from csv file to pandas data frame
raw_mail_data=pd.read_csv('mail_data.csv')

In [5]:
raw_mail_data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
#replace the null values with a null string 
mail_data=raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [7]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#no.of rows and columns in data frame 
raw_mail_data.shape

(5572, 2)

In [9]:
#label spam mail as 0,ham mail as 1.
mail_data.loc[mail_data['Category']=='spam','Category',]=0
mail_data.loc[mail_data['Category']=='ham','Category',]=1

In [10]:
#seprating the data as texts and label 
X=mail_data['Message'] 
Y=mail_data['Category']

In [11]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [12]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [13]:
#spliting the data into training and test data 
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=3)

In [15]:
X.shape

(5572,)

In [17]:
X_train.shape,X_test.shape

((4457,), (1115,))

In [18]:
Y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [27]:
#Feature Extraction 
#Transform the text data to feature vectors that can be used as input to the logistic regression 
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english', lowercase=True)
X_train_features=feature_extraction.fit_transform(X_train) 
X_test_features=feature_extraction.transform(X_test) 

#convert Y_train and Y_test as integers 
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [28]:
X_train

3075                  Don know. I did't msg him recently.
1787    Do you know why god created gap between your f...
1614                         Thnx dude. u guys out 2nite?
4304                                      Yup i'm free...
3266    44 7732584351, Do you want a New Nokia 3510i c...
                              ...                        
789     5 Free Top Polyphonic Tones call 087018728737,...
968     What do u want when i come back?.a beautiful n...
1667    Guess who spent all last night phasing in and ...
3321    Eh sorry leh... I din c ur msg. Not sad alread...
1688    Free Top ringtone -sub to weekly ringtone-get ...
Name: Message, Length: 4457, dtype: object

In [29]:
X_train_features

<4457x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 34775 stored elements in Compressed Sparse Row format>

In [30]:
X_test_features

<1115x7431 sparse matrix of type '<class 'numpy.float64'>'
	with 7687 stored elements in Compressed Sparse Row format>

In [31]:
#Training the model 
#logistical regression 
model=LogisticRegression() 

In [32]:
#training the logistical regression model with training data 
model.fit(X_train_features,Y_train)

In [33]:
#Evaluting the trained model 
#prediction on training data 
prediction_on_training_data=model.predict(X_train_features) 
accuracy_on_training_data=accuracy_score(Y_train,prediction_on_training_data)

In [34]:
print(accuracy_on_training_data)

0.9670181736594121


In [36]:
#prediction on test data 
prediction_on_test_data=model.predict(X_test_features) 
accuracy_on_test_data=accuracy_score(Y_test,prediction_on_test_data)

In [37]:
print(accuracy_on_test_data)

0.9659192825112107


In [39]:
#building a predictive System 
input_mail=["I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."]
#convert text to feature vector 
input_data_feature=feature_extraction.transform(input_mail)
#making predictions 
prediction=model.predict(input_data_feature)
print(prediction)
if prediction[0]==1:
    print("ham mail")
else:
    print("spam mail")

[1]
ham mail
