In [1]:
import pickle 

from pprint import pprint
import pandas as pd
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:
# Directory holding the pre-computed TF-IDF train/test splits.
DATA_PATH = "../input/bbc-tfidf/tfidf/"

# Article table with `label` read as a categorical column.
# NOTE(review): `df` is not referenced by any other cell visible in this notebook.
df = pd.read_csv('../input/bbc-tfidf/contents.csv', dtype={'label': 'category'})


def _load_pickle(filename):
    """Return the object unpickled from ``DATA_PATH + filename``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing.
    Only point this at files produced by a trusted pipeline.
    """
    with open(DATA_PATH + filename, 'rb') as data:
        return pickle.load(data)


# Same four artifacts as before, loaded through one helper instead of four
# copy-pasted `with open(...)` blocks.
features_train = _load_pickle('features_train.pickle')
labels_train = _load_pickle('labels_train.pickle')
features_test = _load_pickle('features_test.pickle')
labels_test = _load_pickle('labels_test.pickle')

In [3]:
# Sanity check: number of samples in the train and test feature sets.
n_train, n_test = len(features_train), len(features_test)
print(n_train, n_test)

1891 334


# Default parameters
Text data has a high-dimensional feature space and is usually linearly separable.  
LinearSVC should therefore give good results.  
cf. Thorsten Joachims, "Text Categorization with Support Vector Machines: Learning with Many Relevant Features"  
and https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

In [4]:
# Baseline 1: linear SVM with library defaults (only the seed is fixed).
# `fit` returns the estimator itself, so construction and training are chained.
base_model = svm.LinearSVC(random_state=8).fit(features_train, labels_train)

# Accuracy on the test split; left as the last expression so the cell shows it.
base_predictions = base_model.predict(features_test)
accuracy_score(labels_test, base_predictions)

0.9550898203592815

In [5]:
# Baseline 2: kernel SVC with library defaults (only the seed is fixed).
# This rebinds `base_model`, so from here on it refers to the SVC.
base_model = svm.SVC(random_state=8).fit(features_train, labels_train)

# Accuracy on the test split; left as the last expression so the cell shows it.
base_predictions = base_model.predict(features_test)
accuracy_score(labels_test, base_predictions)

0.9610778443113772

In [6]:
# List every hyperparameter of the current `base_model`, including the ones
# that were left at their default values.
default_params = base_model.get_params()
pprint(default_params)

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 8,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


# Random Search

In [7]:
# Search space for the random search: one list of candidate values per
# SVC hyperparameter.

# C weights how strongly samples that violate the margin contribute to the
# overall error. Large C -> narrower margin (misclassifications are barely
# tolerated); small C -> wider margin.
C = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]

# gamma: kernel coefficient candidates, spanning 1e-5 .. 100.
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 20, 50, 100]

# degree: polynomial degrees 1 through 5.
degree = list(range(1, 6))

# Kernels to choose from.
kernel = ['linear', 'rbf', 'poly']

# Single-valued entry: every sampled candidate is built with probability=True.
probability = [True]

random_grid = dict(
    C=C,
    kernel=kernel,
    gamma=gamma,
    degree=degree,
    probability=probability,
)

pprint(random_grid)

{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
 'degree': [1, 2, 3, 4, 5],
 'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 50, 100],
 'kernel': ['linear', 'rbf', 'poly'],
 'probability': [True]}


In [17]:
# Estimator whose hyperparameters are tuned (seed fixed for repeatability).
svc = svm.SVC(random_state=8)

# Search settings: 10 sampled candidates x 3 CV folds = 30 fits, scored by
# accuracy, run on all available cores, with a fixed sampling seed.
search_settings = dict(
    param_distributions=random_grid,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=8,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(estimator=svc, **search_settings)

# Run the search; `fit` is the last expression so the cell shows the fitted
# search object.
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   56.9s finished


RandomizedSearchCV(cv=3, estimator=SVC(random_state=8), n_jobs=-1,
                   param_distributions={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                                        'degree': [1, 2, 3, 4, 5],
                                        'gamma': [1e-05, 0.0001, 0.001, 0.01,
                                                  0.1, 1, 10, 20, 50, 100],
                                        'kernel': ['linear', 'rbf', 'poly'],
                                        'probability': [True]},
                   random_state=8, scoring='accuracy', verbose=1)

In [18]:
# Report the winning hyperparameter combination and its score from the search.
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_, end="\n\n")  # extra newline = blank separator line
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

The best hyperparameters from Random Search are:
{'probability': True, 'kernel': 'poly', 'gamma': 50, 'degree': 2, 'C': 1}

The mean accuracy of a model with these hyperparameters is:
0.9592810605488893


In [19]:
# Keep a handle on the estimator the random search selected as best.
best_svc = random_search.best_estimator_
# Bare expression: the notebook displays the estimator's repr as the cell output.
best_svc

SVC(C=1, degree=2, gamma=50, kernel='poly', probability=True, random_state=8)

# Fit best model

In [20]:
# No explicit `fit` is needed here. `best_estimator_` only exists when the
# search refits the winning configuration (the default, refit=True), so
# `best_svc` has already been trained on the full training set with the same
# fixed seed. Calling `best_svc.fit(features_train, labels_train)` again would
# only repeat that training, which is costly with probability=True.
best_svc

SVC(C=1, degree=2, gamma=50, kernel='poly', probability=True, random_state=8)

# Performance analysis

In [21]:
# Accuracy of the tuned model on the data it was trained on.
print("The training accuracy is: ")
train_pred = best_svc.predict(features_train)
print(accuracy_score(labels_train, train_pred))

The training accuracy is: 
1.0


In [22]:
# Predictions of the tuned model on the test features; stored once so the
# later evaluation cells can reuse them.
svc_pred = best_svc.predict(features_test)

In [24]:
# Accuracy of the tuned model on the test split, using the stored predictions.
print("The test accuracy is: ")
test_accuracy = accuracy_score(labels_test, svc_pred)
print(test_accuracy)

The test accuracy is: 
0.9670658682634731


In [25]:
# Per-class precision / recall / F1 on the test split.
print("Classification report")
report_text = classification_report(labels_test, svc_pred)
print(report_text)


Classification report
              precision    recall  f1-score   support

           0       0.93      0.97      0.95        76
           1       0.98      1.00      0.99        58
           2       0.97      0.92      0.94        63
           3       0.97      0.97      0.97        77
           4       1.00      0.97      0.98        60

    accuracy                           0.97       334
   macro avg       0.97      0.97      0.97       334
weighted avg       0.97      0.97      0.97       334



# Saving the model

In [26]:
# Persist the tuned classifier next to the notebook so it can be reloaded later.
with open('best_svc.pickle', mode='wb') as model_file:
    pickle.dump(best_svc, model_file)