# Support Vector Machine

In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pprint import pprint

In [2]:
df_train = pd.read_csv('./../dataset/sentiment_analysis_train.csv')
df_test = pd.read_csv('./../dataset/sentiment_analysis_test.csv')

In [3]:
df_train.shape, df_test.shape

((60672, 3), (58544, 3))

In [4]:
vectorizer = TfidfVectorizer()

features_train = df_train['features']
features_train = vectorizer.fit_transform(features_train.values.astype('U')) # to convert to unicode
labels_train = df_train['labels']

features_test = df_test['features']
features_test = vectorizer.transform(features_test.values.astype('U')) # to convert to unicode
labels_test = df_test['labels']

# Cross-Validation for Hyperparameter tuning

First, we can see what hyperparameters the model has:

In [5]:
svc_0 = svm.SVC(random_state=8)

print('Parameters currently in use:\n')
pprint(svc_0.get_params())

Parameters currently in use:

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 8,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}



We'll tune the following ones:

- C: Penalty parameter C of the error term.
- kernel: Specifies the kernel type to be used in the algorithm.
- gamma: Kernel coefficient.
- degree: Degree of the polynomial kernel function.

### Randomized Search Cross Validation

We first need to define the grid:

In [6]:
# C
C = [.0001, .001, .01]

# gamma
gamma = [.0001, .001, .01, .1, 1, 10, 100]

# degree
degree = [1, 2, 3, 4, 5]

# kernel
kernel = ['linear', 'rbf', 'poly']

# probability
probability = [True]

# Create the random grid
random_grid = {'C': C,
              'kernel': kernel,
              'gamma': gamma,
              'degree': degree,
              'probability': probability
             }

pprint(random_grid)

{'C': [0.0001, 0.001, 0.01],
 'degree': [1, 2, 3, 4, 5],
 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
 'kernel': ['linear', 'rbf', 'poly'],
 'probability': [True]}


In [None]:
# First create the base model to tune
svc = svm.SVC(random_state=8)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)