# support vector machine classifier model

- assume training data in filepath
    + `data/processed/train.csv`
- assume testing data in filepath
    + `data/processed/test.csv`
- assume both have same structure:
    + first column is query 
    + second column is category
- assume input strings are 'clean':
    + in lower case
    + punctuation removed (stop words included)
    + words separated by spaces (no padding)
- output:
    + `models/supportvectormachine.pckl`

In [15]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC #(setting multi_class=”crammer_singer”)

split_training_data_filepath = '../data/processed/train.csv'
split_testing_data_filepath  = '../data/processed/test.csv'
model_filepath = '../models/supportvectormachine.pckl'

In [2]:
# fetch training data only:
df_train = pd.read_csv(split_training_data_filepath, index_col=0)
# do not want the class label to be numerical nor ordinal. 
# df_train['category'] = pd.Categorical(df_train['category'])
df_train['category'] = pd.Categorical(df_train['category'])
display(df_train.sample(3))

Unnamed: 0,query,category
421399,coding for schools,1277
408687,sepa direct debit,381
221795,hose pipe reel,1130


In [3]:
tfidf = TfidfVectorizer(
#     stop_words=stop_words,
    stop_words = {'english'},
    strip_accents= 'ascii',
    ngram_range=(1,1), # consider unigrams/bigrams/trigrams?
    min_df = 4,
    max_df = 0.80,
    binary=True, # count term occurance in each query only once
)

# column transformer
all_transforms = make_column_transformer(
    (tfidf, ['query'])
)

# classify using random forest (resistant to overfitting):
clf_svc = LinearSVC(
    dual=False,
    tol=1e-3,
#     multi_class='crammer_singer',
    multi_class='ovr',
#     verbose=3,
    max_iter=2500,
)
pipe = make_pipeline(all_transforms, clf_svc)

In [4]:
%%time
# scores = cross_val_score(pipe, df_train['query'], df_train['category'], cv=10, scoring='accuracy')
# scores = cross_val_score(clf_svc, X, y, cv=5, scoring='f1_macro')
# scores = cross_val_score(clf_svc, X, df_train['category'], cv=2, scoring='accuracy')
# cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
# print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


In [5]:
%%time
X = tfidf.fit_transform(df_train['query'])
display(X.shape)

(485458, 22366)

CPU times: user 2.65 s, sys: 33.8 ms, total: 2.69 s
Wall time: 2.69 s


`X` is a:
```
<546140x23695 sparse matrix of type '<class 'numpy.float64'>'
	with 1625786 stored elements in Compressed Sparse Row format>
```

In [6]:
%%time
# train model
clf_svc.fit(X, df_train['category'])
with open(model_filepath, 'wb') as filepath:
    pickle.dump(clf_svc, filepath)
print('trained svm model and exported to', model_filepath)

CPU times: user 2h 31min 25s, sys: 21min 5s, total: 2h 52min 30s
Wall time: 21min 48s


LinearSVC(dual=False, max_iter=200, tol=0.001)

In [7]:
%%time
# check model
df_test = pd.read_csv(split_testing_data_filepath, index_col=0)
X_test = tfidf.transform(df_test['query'])
print('read in test data', X_test.shape)
y_predicted = clf_svc.predict(X_test)
print('computed', len(y_predicted), 'predictions of test data')

read in test data (121365, 22366)
computed 121365 predictions of test data
CPU times: user 2 s, sys: 755 ms, total: 2.75 s
Wall time: 2.84 s


In [9]:
print('number of correct test predictions:', sum(y_predicted == df_test['category']))
print('number of incorrect predictions:', sum(y_predicted != df_test['category']))
print('ratio of correct test predictions:', round(sum(y_predicted == df_test['category'])/len(df_test),3))
print('')

number of correct test predictions: 75024
number of incorrect predictions: 46341
ratio of correct test predictions: 0.618

