In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the SMS spam collection; fields in the file are tab-separated.
df = pd.read_csv("smsspamcollection.tsv", sep="\t")

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# Class balance: number of messages per label.
df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
# Split the data into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the ham/spam label.
X, y = df["message"], df["label"]

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [14]:
%%html
<a href="Countvectorizer.ipynb">Count Vectorizer'a Git</a>

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer that turns text into token counts; default settings are used.
count_vect = CountVectorizer()

In [16]:
# Inspect the full message Series (train and test rows together).
X 

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5572, dtype: object

In [17]:
# Inspect the training portion of the messages; the index is shuffled by the split.
X_train

3235                                  Yup ü not comin :-(
945     I sent my scores to sophas and i had to do sec...
5319                         Kothi print out marandratha.
5528    Its just the effect of irritation. Just ignore it
247                        I asked you to call him now ok
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: message, Length: 3733, dtype: object

In [18]:
# fit_transform builds the vocabulary from the training messages and turns
# each message into a vector of token counts in a single call. It is the
# one-step form of:
#   count_vect.fit(X_train)
#   X_train_counts = count_vect.transform(X_train)
X_train_counts = count_vect.fit_transform(X_train)

In [19]:
# Show the count matrix produced by the vectorizer.
X_train_counts

<3733x7082 sparse matrix of type '<class 'numpy.int64'>'
	with 49992 stored elements in Compressed Sparse Row format>

In [21]:
# Number of training messages.
X_train.shape

(3733,)

In [22]:
# One row per training message, one column per vocabulary term.
X_train_counts.shape

(3733, 7082)

## TfidfTransformer

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
# Transformer that reweights a count matrix into TF-IDF values (default settings).
tfidftransformer = TfidfTransformer()

In [25]:
# Fit on the training counts and convert them to TF-IDF weights.
X_train_tfidf = tfidftransformer.fit_transform(X_train_counts)

In [26]:
# The TF-IDF step reweights values only; the shape matches the count matrix.
X_train_tfidf.shape

(3733, 7082)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
# TfidfVectorizer combines CountVectorizer and TfidfTransformer in one estimator.
vectorizer = TfidfVectorizer()

In [29]:
# Recompute the TF-IDF matrix straight from the raw text.
# NOTE: this overwrites the X_train_tfidf built earlier with TfidfTransformer.
X_train_tfidf = vectorizer.fit_transform(X_train)

In [30]:
from sklearn.svm import LinearSVC

In [31]:
# Linear support vector classifier. LinearSVC uses a random number generator
# while fitting, so seed it (same seed as the train/test split) to make the
# fitted model reproducible across runs.
clf = LinearSVC(random_state=42)

In [32]:
# Train the classifier on the TF-IDF features of the training messages.
clf.fit(X_train_tfidf, y_train)

In [33]:
from sklearn.pipeline import Pipeline

In [36]:
# Chain vectorization and classification so raw text can be passed directly
# to fit/predict. LinearSVC uses a random number generator while fitting, so
# seed it (same seed as the train/test split) to make the reported metrics
# reproducible across runs.
text_clf = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC(random_state=42)),
    ]
)

In [37]:
# Fitting the pipeline first runs the TF-IDF step on the raw training text,
# then trains LinearSVC on the resulting features.
text_clf.fit(X_train, y_train)

In [38]:
# Predict labels for the held-out messages; the pipeline vectorizes them itself.
predictions = text_clf.predict(X_test)

In [39]:
# Inspect the held-out messages that were just classified.
X_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
4944    Check mail.i have mailed varma and kept copy t...
3313    I know you are serving. I mean what are you do...
3652         Want to send me a virtual hug?... I need one
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
4758    hey, looks like I was wrong and one of the kap...
Name: message, Length: 1839, dtype: object

In [40]:
from sklearn.metrics import confusion_matrix,classification_report

In [41]:
# Rows are the true labels and columns the predicted labels (ham, then spam).
print(confusion_matrix(y_test, predictions))

[[1586    7]
 [  12  234]]


In [43]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1593
        spam       0.97      0.95      0.96       246

    accuracy                           0.99      1839
   macro avg       0.98      0.97      0.98      1839
weighted avg       0.99      0.99      0.99      1839



In [44]:
from sklearn import metrics

In [46]:
# Overall fraction of test messages that were labelled correctly.
metrics.accuracy_score(y_test, predictions)

0.989668297988037

In [47]:
# Spot check: classify a single everyday message. predict expects an iterable
# of documents, hence the one-element list.
text_clf.predict(["how are you doing today"])

array(['ham'], dtype=object)

In [50]:
# Spot check: classify a single promotional-style message.
text_clf.predict(["congratulations you are the winner. Winner no is 12412 congratulations. Text won to free entry 4555"])

array(['spam'], dtype=object)