# SVM classifier

Importing the dataset

In [1]:
from functools import lru_cache

import gensim
import numpy as np
import pandas as pd
from nltk.corpus import stopwords


# Load the SMS corpus; the file is resolved relative to the working directory.
dataset = pd.read_csv("sms_spam.csv")

# Preview the first rows, then report the (rows, columns) shape.
print(dataset.head())
print("Shape:", dataset.shape, "\n")

   type                                               text
0   ham  Hope you are having a good week. Just checking in
1   ham                            K..give back my thanks.
2   ham        Am also doing in cbe only. But have to pay.
3  spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4  spam  okmail: Dear Dave this is your final notice to...
Shape: (5559, 2) 



Preprocessing function

In [2]:
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, built once and reused."""
    return frozenset(stopwords.words("english"))


def transformText(text):
    """Normalise one message into a space-separated string of word stems.

    The steps run in this order: lowercase, collapse whitespace, drop NLTK
    English stop words, strip punctuation, strip digits, drop tokens shorter
    than three characters, collapse whitespace again, and stem.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned text as returned by gensim's ``stem_text``.
    """
    preprocessing = gensim.parsing.preprocessing
    # The stop-word set is identical for every message, so it is cached
    # instead of being rebuilt on each call.
    stops = _english_stopwords()

    text = text.lower()
    text = preprocessing.strip_multiple_whitespaces(text)

    # Stop words are matched on whitespace-separated tokens *before*
    # punctuation is stripped, so a token with punctuation attached
    # (e.g. "in.") is not treated as a stop word at this point.
    text = " ".join(word for word in text.split() if word not in stops)

    text = preprocessing.strip_punctuation(text)
    text = preprocessing.strip_numeric(text)
    text = preprocessing.strip_short(text, minsize=3)
    # Collapse any runs of whitespace left behind by the stripping steps.
    text = preprocessing.strip_multiple_whitespaces(text)

    return preprocessing.stem_text(text)

Preprocessing

In [3]:
# Clean every message with transformText. The untouched messages are kept in
# 'text_raw' and are always used as the input, so re-running this cell never
# preprocesses text that has already been preprocessed.
if 'text_raw' not in dataset.columns:
    dataset['text_raw'] = dataset['text']
dataset['text'] = dataset['text_raw'].map(transformText)
print(dataset['text'].head())

0                                 hope good week check
1                                      give back thank
2                                    also cbe onli pai
3    complimentari star ibiza holidai cash need urg...
4    okmail dear dave final notic collect tenerif h...
Name: text, dtype: object


Creating the training and test sets

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the messages for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['type'],
    test_size=0.33,
    random_state=10,
)

print("Training Sample Size:", len(X_train), ' ', "Test Sample Size:", len(X_test))

Training Sample Size: 3724   Test Sample Size: 1835


Creating a TF-IDF model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Fit the count vectorizer on the training messages and turn them into a
# term-count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Fit the TF-IDF transformer on those training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print('Dimension of TF-IDF vector :', X_train_tfidf.shape)

Dimension of TF-IDF vector : (3724, 5056)


Creating the classifier

In [6]:
# Build a support-vector classifier with a linear kernel.
from sklearn import svm
clf = svm.SVC(kernel='linear')


# Train the classifier: fit() receives the TF-IDF features of the training
# set (X_train_tfidf) and the matching training labels (y_train).
# The call is left as the cell's last expression.
clf.fit(X_train_tfidf, y_train)

Folding the test set into the training feature space

In [7]:
# Vectorize the test messages with the objects already fitted on the training
# set. Only transform() is called here (never fit), so nothing is re-learned
# from the test data.
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

Performing the actual prediction

In [8]:
# Predict a label for every test message from its TF-IDF features.
predicted = clf.predict(X_new_tfidf)

Showing the results

In [9]:
# Show the predicted labels.
print(predicted)
# Accuracy: the element-wise comparison yields booleans, and their mean is the
# fraction of test messages whose prediction equals the true label.
print(np.mean(predicted==y_test))

['ham' 'ham' 'ham' ... 'spam' 'ham' 'spam']
0.9841961852861035


In [10]:
# Print the classification report comparing the true test labels (y_test)
# with the predictions.
from sklearn import metrics
print(metrics.classification_report(y_test,predicted))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1583
        spam       0.97      0.91      0.94       252

    accuracy                           0.98      1835
   macro avg       0.98      0.95      0.97      1835
weighted avg       0.98      0.98      0.98      1835

