**Import packages**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

**Data processing**

In [2]:
# Read the CSV using the ISO-8859-1 encoding passed to the parser.
raw_mail_data = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')
# Keep every non-null cell and substitute an empty string for each missing one.
mail_data = raw_mail_data.where(raw_mail_data.notna(), '')
# Show (rows, columns) as the cell output.
mail_data.shape

(5572, 5)

In [3]:
# Preview the first five rows to check the column layout.
mail_data.head(n=5)

Unnamed: 0,category,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Encode the target column in place: spam -> 0, ham -> 1
is_spam = mail_data['category'] == 'spam'
is_ham = mail_data['category'] == 'ham'
mail_data.loc[is_spam, 'category'] = 0
mail_data.loc[is_ham, 'category'] = 1

In [5]:
# Separate the data into message text (x) and labels (y)
x, y = mail_data['message'], mail_data['category']

In [6]:
# Show the message texts and the labels, divided by a dotted line.
print(x, "...........", y, sep="\n")

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object
...........
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: category, Length: 5572, dtype: object


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)

**Feature extraction**

In [30]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be the boolean True: the string 'True' only worked by being
# truthy, and scikit-learn versions that validate parameters reject it.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training messages only ...
x_train_feature = feature_extraction.fit_transform(x_train)
# ... and reuse them for the test messages, so the test set never influences the fit.
x_test_feature = feature_extraction.transform(x_test)

In [22]:
# Cast the train and test labels to an integer dtype
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [27]:
# Train a linear support vector classifier on the TF-IDF training features.
# random_state is pinned (same seed value as the train/test split) so that any
# data shuffling done by the solver, and hence the fitted model, is repeatable.
model = LinearSVC(random_state=3)
model.fit(x_train_feature, y_train)


LinearSVC()

**Evaluation of the model**

In [28]:
# Score the model on the same data it was fitted on.
prediction_on_training_data = model.predict(x_train_feature)
accuracy_on_training_data = accuracy_score(
    y_true=y_train,
    y_pred=prediction_on_training_data,
)
print("Accuracy on training data: ", accuracy_on_training_data)

Accuracy on training data:  0.9995512676688355


In [32]:
# Score the model on the held-out test split.
prediction_on_test_data = model.predict(x_test_feature)
accuracy_on_test_data = accuracy_score(
    y_true=y_test,
    y_pred=prediction_on_test_data,
)
print("Accuracy on test data: ", accuracy_on_test_data)

Accuracy on test data:  0.9856502242152466


**Prediction on new mail**

In [45]:
# Classify one hand-written message with the fitted vectorizer and model.
new_data = ["WINNER! you have won "]
new_data_feature = feature_extraction.transform(new_data)
prediction = model.predict(new_data_feature)
print(prediction)
# Label encoding used in this notebook: 1 is ham; anything else is reported as spam.
print("Ham mail" if prediction[0] == 1 else "Spam mail")

[0]
Spam mail
