In [3]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Fix the typo here
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [5]:
raw_mail_data = pd.read_csv('spam.csv',encoding='latin')

In [6]:
raw_mail_data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [7]:
raw_mail_data.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [8]:
#replace the null values with a null string 
#creating a new data frame
mail_data = raw_mail_data.where((pd.notnull(raw_mail_data)),'')

In [9]:
#printing the first 5 rows of data frame 
mail_data.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
#checking the number of rows and columns in data frame
mail_data.shape

(5572, 5)

In [12]:
#rename the columns 
mail_data= mail_data.rename(columns={'v1':'Category','v2':'Message'})

In [13]:
mail_data.head()

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [15]:
#label spam mail as 0; ham mail as 1 

mail_data.loc[mail_data['Category']== 'spam', 'Category']=0
mail_data.loc[mail_data['Category']== 'ham', 'Category']=1

In [24]:
#seperating the data as texts and label
X = mail_data['Message']
Y = mail_data['Category']


In [25]:
print(X)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object


In [26]:
print(Y)

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


In [27]:
#splitting the data into training and test data 

In [28]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y, test_size=0.5, random_state=4)

In [29]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(2786,)
(2786,)


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Transform the text data to feature vectors that can be used as input to logistic regression
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Convert Y_train and Y_test values to integers
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [34]:
Y_train

1498    1
1812    1
1814    1
3919    0
357     0
       ..
3671    1
709     0
2487    1
174     1
1146    1
Name: Category, Length: 2786, dtype: int32

In [35]:
Y_test

4004    1
2276    1
4498    0
3755    1
111     1
       ..
352     1
876     1
3832    1
3229    1
726     1
Name: Category, Length: 2786, dtype: int32

In [36]:
print(X_train)

1498    Time n Smile r the two crucial things in our l...
1812    \Getting tickets 4 walsall tue 6 th march. My ...
1814    HI ITS JESS I DONT KNOW IF YOU ARE AT WORK BUT...
3919    FREE>Ringtone! Reply REAL or POLY eg REAL1 1. ...
357     Ur cash-balance is currently 500 pounds - to m...
                              ...                        
3671     came to look at the flat, seems ok, in his 50...
709     4mths half price Orange line rental & latest c...
2487    K ill drink.pa then what doing. I need srs mod...
174     Well, i'm gonna finish my bath now. Have a goo...
1146                            Babe ? I lost you ... :-(
Name: Message, Length: 2786, dtype: object


In [43]:
print(X_train_features)

  (0, 3589)	0.21622828424877352
  (0, 2455)	0.17688600126345402
  (0, 2214)	0.4015209732626
  (0, 3225)	0.39047958140366895
  (0, 3054)	0.16114417466665576
  (0, 5079)	0.17977199308436093
  (0, 1572)	0.24949425946729994
  (0, 4662)	0.5784482936075931
  (0, 5125)	0.3913157903455126
  (1, 5037)	0.30052585688475697
  (1, 911)	0.26045565355021877
  (1, 5266)	0.16690304781623538
  (1, 5477)	0.16168366336079898
  (1, 5219)	0.24182407002987533
  (1, 3805)	0.22247021595921976
  (1, 2708)	0.20989268651938206
  (1, 4410)	0.22468140633850187
  (1, 3270)	0.24996288985392023
  (1, 3248)	0.245666933731099
  (1, 5057)	0.24996288985392023
  (1, 5248)	0.30052585688475697
  (1, 5471)	0.30052585688475697
  (1, 5115)	0.245666933731099
  (1, 2348)	0.39879984564616705
  (2, 5670)	0.36622559528317505
  :	:
  (2782, 1576)	0.2026473606481941
  (2782, 1215)	0.21384843246667015
  (2782, 2478)	0.21201668998822182
  (2782, 3857)	0.16779292330872558
  (2782, 3073)	0.21201668998822182
  (2782, 2242)	0.30492192647012

In [44]:
model = LogisticRegression()

In [45]:
#training the logisticregression model with the training data 

model.fit(X_train_features,Y_train)

In [46]:
#prediction on training data 
from sklearn.metrics import accuracy_score

# Prediction on training data
Prediction_on_training_data = model.predict(X_train_features)

# Calculate accuracy on training data
accuracy_on_training_data = accuracy_score(Y_train, Prediction_on_training_data)


In [49]:
print('Accuracy on training data: ', accuracy_on_training_data)

Accuracy on training data:  0.9583632447954056


In [50]:
#prediction on test data 

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test,prediction_on_test_data)

In [51]:
print('Accuracy on test data: ', accuracy_on_test_data)

Accuracy on test data:  0.9472361809045227


In [53]:
input_mail = ["hello lucky how are you hope you're doing good and giving your best for your preparation"]

# Convert text features to vector
input_data_features = feature_extraction.transform(input_mail)

# Making prediction
prediction = model.predict(input_data_features)
print(prediction)

if prediction[0] == 1:
    print('Ham mail')
else:
    print('Spam mail')


[1]
Ham mail
