**IMPORTING MODULE**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

**LOADING THE DATASET**

In [2]:
data=pd.read_csv("/content/spam.csv",encoding = 'ISO-8859-1')
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


**DATA ANALYSIS**

In [3]:
data.tail(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [4]:
data.ndim

2

In [5]:
data.shape

(5572, 5)

In [6]:
data.size

27860

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [8]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


**DATA PREPROCESSING**

In [9]:
data.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [10]:
data.drop(columns=data[['Unnamed: 2','Unnamed: 3','Unnamed: 4']],axis=1,inplace=True)
print(data.head(5))

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [11]:
data.columns=['spam/ham','sms']
data

Unnamed: 0,spam/ham,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [12]:
data.shape

(5572, 2)

In [13]:
data.dtypes

spam/ham    object
sms         object
dtype: object

In [14]:
data.nunique()

spam/ham       2
sms         5169
dtype: int64

In [15]:
data.max()

spam/ham                                    spam
sms         ÌÏ wait 4 me in sch i finish ard 5..
dtype: object

In [16]:
data.min()

spam/ham                                     ham
sms          &lt;#&gt;  in mca. But not conform.
dtype: object

In [17]:
data['spam/ham'].value_counts()

ham     4825
spam     747
Name: spam/ham, dtype: int64

In [18]:
data['spam/ham'] = data['spam/ham'].map({'spam':0, 'ham':1})
data

Unnamed: 0,spam/ham,sms
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


**SPLITTING THE DATASET INTO TEST AND TRAIN DATA**

In [19]:
x=data['sms']
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: sms, Length: 5572, dtype: object

In [20]:
y=data['spam/ham']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: spam/ham, Length: 5572, dtype: int64

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,random_state=3)

In [22]:
print("The shape of the training set=(",x_train.shape,y_train.shape,")")
print("The Shape of the testing set=(",x_test.shape,y_test.shape,")")

The shape of the training set=( (4457,) (4457,) )
The Shape of the testing set=( (1115,) (1115,) )


**DEVELOPMENT OF MODEL**

In [23]:
feature=TfidfVectorizer(min_df=1,stop_words="english",lowercase=True)
feature

In [24]:
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [25]:
xtrain=feature.fit_transform(x_train)
xtest=feature.transform(x_test)
xtrain,xtest

(<4457x7510 sparse matrix of type '<class 'numpy.float64'>'
 	with 34758 stored elements in Compressed Sparse Row format>,
 <1115x7510 sparse matrix of type '<class 'numpy.float64'>'
 	with 7766 stored elements in Compressed Sparse Row format>)

**LOGISITIC REGRESSION**

In [26]:
model= LogisticRegression()
model.fit(xtrain, y_train)

In [27]:
model.score(xtrain,y_train)*100

96.61207089970833

In [28]:
model.score(xtest,y_test)*100

96.23318385650225

**PREDICITION MAKING**

In [29]:
from sklearn.metrics import accuracy_score
y_pred=model.predict(xtest)
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [31]:
inp = ["Rofl. Its true to its name"]
inp=feature.transform(inp)
predict=model.predict(inp)
print(predict)

[1]


In [30]:
print("Mean Squared error : %.2f" % np.mean(y_pred-y_test)**2)

Mean Squared error : 0.00
