In [1]:
import pickle 

from pprint import pprint
import pandas as pd
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:
DATA_PATH = "../input/bbc-tfidf/tfidf/"
df = pd.read_csv('../input/bbc-tfidf/contents.csv', dtype={'label': 'category'})


def load_pickle(filename):
    """Unpickle and return the object stored in ``DATA_PATH + filename``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    that you produced yourself or otherwise trust.
    """
    with open(DATA_PATH + filename, 'rb') as data:
        return pickle.load(data)


# Pre-computed train/test features and labels (one pickle file per object).
features_train = load_pickle('features_train.pickle')
labels_train = load_pickle('labels_train.pickle')
features_test = load_pickle('features_test.pickle')
labels_test = load_pickle('labels_test.pickle')

In [3]:
# Sizes of the training and test feature sets
n_train, n_test = len(features_train), len(features_test)
print(n_train, n_test)

# Default parameters
Text data has a high-dimensional feature space and is therefore usually linearly separable.  
A linear SVM (LinearSVC) should therefore give good results.  
cf. Thorsten Joachims, "Text Categorization with Support Vector Machines: Learning with Many Relevant Features"  
and https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

In [4]:
# Baseline 1: linear SVM with default hyperparameters (fit() returns the estimator itself)
base_model = svm.LinearSVC(random_state=8).fit(features_train, labels_train)
base_pred = base_model.predict(features_test)
accuracy_score(labels_test, base_pred)

In [5]:
# Baseline 2: kernel SVC with default hyperparameters; test-set accuracy is the cell output
base_model = svm.SVC(random_state=8).fit(features_train, labels_train)
base_pred = base_model.predict(features_test)
accuracy_score(labels_test, base_pred)

In [6]:
# Hyperparameters currently set on the baseline model
base_params = base_model.get_params()
pprint(base_params)

# Random Search

In [7]:
# Search space for the randomized hyperparameter search.

# C: how strongly samples inside the margin contribute to the overall error.
#   large C -> narrower margin, fewer training errors tolerated
#   small C -> wider margin, more training errors tolerated
C = [1e-4, 1e-3, 1e-2, 0.1, 1, 10]

# gamma: kernel coefficient (used by the non-linear kernels)
gamma = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 20, 50, 100]

# degree: degree of the polynomial kernel
degree = list(range(1, 6))

# Kernels to try
kernel = ['linear', 'rbf', 'poly']

# Single-valued entry: every candidate is fitted with probability estimates enabled
probability = [True]

random_grid = dict(
    C=C,
    kernel=kernel,
    gamma=gamma,
    degree=degree,
    probability=probability,
)

pprint(random_grid)

In [17]:
# Estimator whose hyperparameters are tuned
svc = svm.SVC(random_state=8)

# Randomized search: 10 sampled candidates from random_grid, each scored by
# accuracy with 3-fold cross-validation, using all available cores (n_jobs=-1).
search_options = {
    'n_iter': 10,
    'scoring': 'accuracy',
    'cv': 3,
    'verbose': 1,
    'random_state': 8,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(estimator=svc,
                                   param_distributions=random_grid,
                                   **search_options)

# Run the search on the training data
random_search.fit(features_train, labels_train)

In [18]:
# Report the winning hyperparameters and their mean cross-validated accuracy
print("The best hyperparameters from Random Search are:",
      random_search.best_params_,
      "",
      "The mean accuracy of a model with these hyperparameters is:",
      random_search.best_score_,
      sep="\n")

In [19]:
# Best estimator found by the search. The search was created with the default
# refit=True, so this model is already trained on the full training set.
best_svc = random_search.best_estimator_
best_svc

# Fit best model

In [20]:
# No extra fit is needed here: RandomizedSearchCV uses refit=True by default, so
# `best_estimator_` has already been retrained on the full training set with the
# best hyperparameters. Fitting again would only repeat that work (slow, since the
# search space sets probability=True) and produce an equivalent model.
best_svc = random_search.best_estimator_
best_svc

# Performance analysis

In [21]:
# Accuracy on the data the model was trained on
print("The training accuracy is: ")
train_pred = best_svc.predict(features_train)
train_accuracy = accuracy_score(labels_train, train_pred)
print(train_accuracy)

In [22]:
# Predictions of the tuned model on the held-out test features (reused by the evaluation cells below)
svc_pred = best_svc.predict(features_test)

In [24]:
# Accuracy of the test-set predictions
print("The test accuracy is: ")
test_accuracy = accuracy_score(labels_test, svc_pred)
print(test_accuracy)

In [25]:
# Per-class metrics for the test-set predictions
print("Classification report")
report = classification_report(labels_test, svc_pred)
print(report)


# Save the model

In [26]:
# Persist the tuned classifier to disk so it can be reloaded without re-running the search
with open('best_svc.pickle', 'wb') as model_file:
    pickle.dump(best_svc, model_file)