# Grid Search Hyperparameter Calibration

Importing the dataset

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import gensim
import numpy as np


# Load the labelled SMS corpus: one row per message, with the class label in
# `type` and the raw message in `text`.
dataset = pd.read_csv("sms_spam.csv")

# Show the first five rows, then the (rows, columns) shape of the frame.
print(dataset.head(5))
print("Shape:", dataset.shape, "\n")

   type                                               text
0   ham  Hope you are having a good week. Just checking in
1   ham                            K..give back my thanks.
2   ham        Am also doing in cbe only. But have to pay.
3  spam  complimentary 4 STAR Ibiza Holiday or Â£10,000 ...
4  spam  okmail: Dear Dave this is your final notice to...
Shape: (5559, 2) 



Preprocessing function

In [2]:
# Cache for the stop-word list so the NLTK corpus is read once, not once per row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def transformText(text):
    """Normalise one raw message into a string of stemmed tokens.

    The steps run in this order: lowercase, collapse whitespace, drop English
    stop words, strip punctuation, strip digits, drop tokens shorter than
    three characters, collapse whitespace again, and stem.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned and stemmed text.
    """
    preprocessing = gensim.parsing.preprocessing
    stops = _english_stopwords()

    # Convert text to lowercase and collapse runs of whitespace
    text = preprocessing.strip_multiple_whitespaces(text.lower())

    # Stop words are matched on whitespace-separated tokens *before* the
    # punctuation is stripped, so a token with attached punctuation
    # (e.g. "you.") is not recognised as a stop word here.
    text = " ".join(word for word in text.split() if word not in stops)

    # Remove the punctuation, then the numerics
    text = preprocessing.strip_punctuation(text)
    text = preprocessing.strip_numeric(text)

    # Remove all the words with < 3 characters
    text = preprocessing.strip_short(text, minsize=3)

    # The removals above can leave gaps, so collapse whitespace once more
    text = preprocessing.strip_multiple_whitespaces(text)

    # Stemming
    return preprocessing.stem_text(text)

Preprocessing

In [3]:
# Run transformText over every message and store the result back in `text`.
# NOTE: the column is overwritten in place, so re-running this cell would
# preprocess the already-cleaned text a second time; re-run the loading cell
# first if this cell needs to be executed again.
dataset['text'] = dataset['text'].apply(transformText)
print(dataset['text'].head(5))

0                                 hope good week check
1                                      give back thank
2                                    also cbe onli pai
3    complimentari star ibiza holidai cash need urg...
4    okmail dear dave final notic collect tenerif h...
Name: text, dtype: object


Creating training and test set

In [4]:
## Hold out 33% of the messages as a test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['type'],
    test_size=0.33,
    random_state=10,  # fixed seed so the same split is produced on every run
)

print("Training Sample Size:", len(X_train), ' ', "Test Sample Size:", len(X_test))

Training Sample Size: 3724   Test Sample Size: 1835


Creating a tf-idf model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Learn the vocabulary from the training messages and count term occurrences
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

## Re-weight the raw counts into their TF-IDF representation.
## Both fitted objects are kept because the test set is transformed with them later.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print('Dimension of TF-IDF vector :', X_train_tfidf.shape)

Dimension of TF-IDF vector : (3724, 5056)


Creating the classifier

In [6]:
from time import time

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Base estimator left at its default hyperparameters; the grid search tunes it
svc = svm.SVC()

# List the hyperparameter names the model exposes
print(svc.get_params().keys())



dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])


Create a dictionary with possible values for some parameters

In [7]:

# Candidate values per hyperparameter. The grid search tries every combination,
# i.e. 4 (C) x 2 (gamma) x 2 (kernel) = 16 candidates.
# NOTE(review): scikit-learn documents `gamma` as a coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so the two gamma values presumably
# produce duplicate fits for kernel='linear' - confirm before widening the grid.
parameters = dict(
    C=[1, 10, 100, 1000],
    gamma=[0.001, 0.0001],
    kernel=['rbf', 'linear'],
)

Instantiating and running Grid Search

In [8]:
# Set up the search over the `parameters` grid with `svc` as the base model.
# n_jobs=-1 runs the fits in parallel; verbose=10 logs the start and end of each fit.
grid_search = GridSearchCV(estimator=svc, param_grid=parameters, n_jobs=-1, verbose=10)

print("Performing grid search...")
print("parameters:")
print(parameters)

# Time the whole search (all candidates and folds)
t0 = time()
grid_search.fit(X_train_tfidf, y_train)
print("done in %0.3fs" % (time() - t0,))
print()

Performing grid search...
parameters:
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf', 'linear']}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 3/5; 2/16] START C=1, gamma=0.001, kernel=linear............................[CV 1/5; 2/16] START C=1, gamma=0.001, kernel=linear............................[CV 2/5; 1/16] START C=1, gamma=0.001, kernel=rbf...............................
[CV 2/5; 2/16] START C=1, gamma=0.001, kernel=linear............................
[CV 5/5; 1/16] START C=1, gamma=0.001, kernel=rbf...............................


[CV 1/5; 1/16] START C=1, gamma=0.001, kernel=rbf...............................
[CV 4/5; 1/16] START C=1, gamma=0.001, kernel=rbf...............................
[CV 3/5; 1/16] START C=1, gamma=0.001, kernel=rbf...............................
[CV 2/5; 2/16] END C=1, gamma=0.001, kernel=linear;, score=0.972 total time=   0.3s
[CV 3/5; 2/16] END C=1, gamma=0.001, kernel=linear;, score=0.969 total time=   0.4s
[CV 

Printing the best score and configuration

In [9]:
# Report the best score found by the search
print("Best score: %0.3f" % (grid_search.best_score_,))

# Report only the hyperparameters that were part of the grid, in alphabetical
# order, reading their values from the winning estimator.
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.977
Best parameters set:
	C: 1000
	gamma: 0.001
	kernel: 'rbf'


Instantiating and using the best model

In [10]:
from sklearn import metrics

# Take the winning model from the grid search. GridSearchCV was created with
# its default refit=True, so best_estimator_ has already been refitted on the
# full training data (X_train_tfidf, y_train); fitting it again here would
# only repeat that work.
clf = grid_search.best_estimator_

# Vectorise the test messages with the vocabulary and IDF weights learned on
# the training set (transform only - nothing is re-fitted on test data).
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Predict the label of every held-out message
predicted = clf.predict(X_new_tfidf)

# Confusion table (rows: true label, columns: predicted label), followed by
# per-class precision / recall / F1.
print(pd.crosstab(y_test, predicted))
print(metrics.classification_report(y_test, predicted))


col_0   ham  spam
type             
ham    1573    10
spam     20   232
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1583
        spam       0.96      0.92      0.94       252

    accuracy                           0.98      1835
   macro avg       0.97      0.96      0.96      1835
weighted avg       0.98      0.98      0.98      1835

