SMS Spam Collection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the SMS dataset; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [3]:
# Peek at the first rows to see which columns the dataset provides.
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Count missing values per column before modelling.
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
# Class balance check: number of messages carrying each label.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Features are the raw message text; the target is the ham/spam label.
X, y = df['message'], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Size of the training split.
X_train.shape

(3900,)

In [9]:
# Size of the held-out test split.
X_test.shape

(1672,)

In [10]:
# Training labels; the length should match X_train.
y_train.shape

(3900,)

In [11]:
# Number of non-null training messages; equals the split size when nothing is missing.
X_train.count()

3900

In [13]:
# Preview the training messages (pandas truncates the display).
X_train

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
2592         Still work going on:)it is very small house.
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: message, Length: 3900, dtype: object

In [14]:
# Preview the test messages (pandas truncates the display).
X_test

3245    Squeeeeeze!! This is christmas hug.. If u lik ...
944     And also I've sorta blown him off a couple tim...
1044    Mmm thats better now i got a roast down me! i...
2484        Mm have some kanji dont eat anything heavy ok
812     So there's a ring that comes with the guys cos...
                              ...                        
2505    Hello, my boytoy! I made it home and my consta...
2525    FREE entry into our £250 weekly comp just send...
4975    Aiyo u so poor thing... Then u dun wan 2 eat? ...
650     You have won ?1,000 cash or a ?2,000 prize! To...
4463    Sorry I flaked last night, shit's seriously go...
Name: message, Length: 1672, dtype: object

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline

In [16]:
# Step 1 of the manual two-step TF-IDF route: a count vectorizer with default settings.
vect_count =  CountVectorizer()

In [17]:
# Fit on the training messages only and build their count matrix.
X_train_count = vect_count.fit_transform(X_train)

In [18]:
# Step 2 of the manual route: a TF-IDF transformer that is applied to the count matrix.
tfid_vect = TfidfTransformer()

In [19]:
# Fit on the training counts and convert them to TF-IDF weights.
X_train_tf = tfid_vect.fit_transform(X_train_count)

In [20]:
# Inspect the resulting sparse matrix: one row per training message.
X_train_tf

<3900x7263 sparse matrix of type '<class 'numpy.float64'>'
	with 52150 stored elements in Compressed Sparse Row format>

In [21]:
# One-step alternative: TfidfVectorizer is fitted on the raw text directly.
vect_tfidf = TfidfVectorizer()

In [22]:
# Fit on the raw training messages and build their TF-IDF matrix.
X_train_tfidf = vect_tfidf.fit_transform(X_train)

In [23]:
# Same shape and stored-element count as X_train_tf from the two-step route.
X_train_tfidf

<3900x7263 sparse matrix of type '<class 'numpy.float64'>'
	with 52150 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.svm import LinearSVC

In [25]:
# Linear support-vector classifier with default hyperparameters.
# NOTE(review): clf is not referenced again in the visible cells; the pipeline
# built later is the model that gets evaluated.
clf = LinearSVC()

In [26]:
# Train directly on the pre-computed TF-IDF matrix.
clf.fit(X_train_tfidf,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [27]:
# Bundle vectorisation and classification so raw text can be passed straight to
# fit/predict. random_state is fixed because LinearSVC's solver uses a random
# number generator, so an unseeded fit is not exactly repeatable across runs.
Text_clf = Pipeline([
    ('tfid_vect', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

In [28]:
# Fit the whole pipeline on the raw training messages.
Text_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfid_vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
           

In [29]:
# Predict labels for the held-out test messages.
prediction = Text_clf.predict(X_test)

In [30]:
# Quick look at the predicted labels.
prediction

array(['ham', 'ham', 'ham', ..., 'ham', 'spam', 'ham'], dtype=object)

In [31]:
# Spot-check the pipeline on a hand-written everyday message.
Text_clf.predict(['how are you today ?'])

array(['ham'], dtype=object)

In [37]:
# Spot-check the pipeline on a hand-written prize-style message.
Text_clf.predict(['Won!!!!! !!!!! LINK'])

array(['spam'], dtype=object)

In [63]:
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [69]:
# Raw confusion matrix. Each row sums to the per-class support reported by
# classification_report, so rows are actual classes and columns are predictions.
print(confusion_matrix(y_test,prediction))

[[1445    3]
 [  10  214]]


In [71]:
# Labelled confusion matrix: rows are actual classes, columns are predicted classes.
# Passing `labels` explicitly pins the matrix order to the names used for the
# index and columns instead of relying on scikit-learn's implicit label order,
# and sharing one list keeps the row and column spellings consistent.
class_labels = ['ham', 'spam']
conf_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, prediction, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)

In [72]:
# Display the labelled confusion matrix.
conf_df

Unnamed: 0,ham,spam
ham,1445,3
Spam,10,214


In [65]:
# Per-class precision, recall, F1 and support on the test set.
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99      0.98      0.98      1672
weighted avg       0.99      0.99      0.99      1672



In [73]:
# Overall share of test messages classified correctly.
print(accuracy_score(y_test, prediction))

0.9922248803827751


# Predictions on hand-written example messages

In [75]:
# Hand-written everyday greeting.
Text_clf.predict(['How are you doing today???'])

array(['ham'], dtype=object)

In [82]:
# Hand-written prize announcement.
Text_clf.predict(['Congrats!!! you have won the prize'])

array(['spam'], dtype=object)