In [56]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [57]:
# Load the raw SMS dataset; the file is decoded as latin-1.
df = pd.read_csv(
    "spam.csv",
    encoding="latin-1",
)

In [58]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [59]:
# (rows, columns) of the raw frame as loaded from the CSV.
df.shape

(5572, 5)

In [60]:
# Keep only the label (v1) and message text (v2) columns; the "Unnamed" columns are dropped.
df = df.loc[:, ['v1', 'v2']]

In [61]:
# Display the two-column frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [62]:
# Null check: does the label column (v1) contain any missing values?
df['v1'].isna().any()

np.False_

In [63]:
# Null check: does the message column (v2) contain any missing values?
df['v2'].isna().any()

np.False_

In [64]:
# Check for empty (or whitespace-only) strings

In [65]:
# Count labels that are empty once surrounding whitespace is stripped.
df['v1'].str.strip().eq('').sum()

np.int64(0)

In [66]:
# Count messages that are empty once surrounding whitespace is stripped.
df['v2'].str.strip().eq('').sum()

np.int64(0)

In [67]:
# Give the columns descriptive names: v1 -> category (label), v2 -> Message (text).
column_names = {'v1': 'category', 'v2': 'Message'}
df = df.rename(columns=column_names)

In [68]:
# Confirm the rename by previewing the first five rows.
df.head(n=5)

Unnamed: 0,category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [69]:
# (rows, columns) after column selection and renaming.
df.shape

(5572, 2)

In [70]:
# List the distinct label values present in the category column.
df['category'].unique()

array(['ham', 'spam'], dtype=object)

In [71]:
# Label encoding

In [79]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
y = df['category'].map(label_codes)

In [80]:
# Shape of the encoded target vector.
y.shape

(5572,)

In [81]:
# The raw message text is the model input.
x = df.loc[:, 'Message']

In [82]:
# Display the message Series (input text).
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [83]:
# Display the encoded label Series (0 = ham, 1 = spam).
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: category, Length: 5572, dtype: int64

In [86]:
# Split into training and test sets: 20% of the messages are held out for testing,
# and the fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [91]:
# Print the shape of each split, in the order: x_train, y_train, x_test, y_test.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)


(4457,)
(4457,)
(1115,)
(1115,)


In [92]:
#Feature Extraction - convert text to vector

In [96]:
# TF-IDF vectorizer that turns raw message text into numeric feature vectors.
# TfidfVectorizer is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
#   min_df=1             -> keep every term that appears in at least one document
#   stop_words='english' -> use scikit-learn's built-in English stop-word list
#   lowercase=True       -> lowercase the text before tokenizing
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [112]:
# Fit the vectorizer on the training messages only and transform them in one step.
x_train_featured = feature_extraction.fit_transform(x_train)

# The test data is only transformed with the already-fitted vectorizer: nothing is learned
# from the test set, so it stays unseen and the evaluation is not inflated by leakage.
x_test_featured = feature_extraction.transform(x_test)

In [113]:
# Inspect the training feature matrix (a sparse matrix: rows = messages, columns = terms).
x_train_featured

<4457x7364 sparse matrix of type '<class 'numpy.float64'>'
	with 34797 stored elements in Compressed Sparse Row format>

In [114]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [116]:
# Train the classifier on the TF-IDF features of the training messages.
model.fit(X=x_train_featured, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [117]:
#Evaluating the model

In [118]:
# Predict labels (0 = ham, 1 = spam) for the held-out test messages.
y_predicted = model.predict(X=x_test_featured)

In [123]:
# Show the predicted labels for the test set (numpy abbreviates the array).
print(y_predicted)

[0 0 0 ... 0 0 0]


In [126]:
# Checking accuracy score: the fraction of test messages whose predicted label
# matches the true label.
# accuracy_score's signature is (y_true, y_pred), so the ground truth goes first.
# Accuracy itself is symmetric, but keeping the documented order avoids silently
# wrong results if this call is later swapped for an asymmetric metric
# (precision, recall, confusion matrix, ...).
accuracy = accuracy_score(y_test, y_predicted)

In [127]:
# Report the test-set accuracy.
print(accuracy)

0.9560538116591928
