In [25]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC #(setting multi_class=”crammer_singer”)

split_training_data_filepath = '../data/processed/train.csv'
split_testing_data_filepath  = '../data/processed/test.csv'
model_filepath = '../models/supportvectormachine.pckl'

outputmodel_filepath = "../models/final_model_object.pckl"

In [18]:
# fetch training data only:
df_train = pd.read_csv(split_training_data_filepath, index_col=0)
# do not want the class label to be numerical nor ordinal. 
# df_train['category'] = pd.Categorical(df_train['category'])
df_train['category'] = df_train['category'].astype(str)
display(df_train.sample(3))

Unnamed: 0,query,category
546291,jobs in property,1049
401166,personalised stones,419
571441,employment law redundancy,1327


In [19]:
tfidf = TfidfVectorizer(
    stop_words = {'english'},
    strip_accents= 'ascii',
    ngram_range=(1,1), # consider unigrams/bigrams/trigrams?
    min_df = 8,
    max_df = 0.80,
    binary=True, # count term occurance in each query only once
)

# column transformer
all_transforms = make_column_transformer(
    (tfidf, ['query'])
)

# classify using random forest (resistant to overfitting):
clf_svc = LinearSVC(
    dual=False,
    tol=1e-3,
    multi_class='ovr',
    class_weight='balanced'
    max_iter=1000,
)
pipe = make_pipeline(all_transforms, clf_svc)

In [20]:
%%time
# kfolds = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=42)
# scores = cross_val_score(
#     pipe, 
#     df_train['query'], 
#     df_train['category'], 
#     scoring='accuracy', 
#     cv=kfolds, 
#     n_jobs=-1, 
#     error_score='raise')

# scores = cross_val_score(pipe, df_train['query'], df_train['category'], cv=2, scoring='accuracy')
# scores = cross_val_score(clf_svc, X, y, cv=5, scoring='f1_macro')
# scores = cross_val_score(clf_svc, X, df_train['category'], cv=2, scoring='accuracy')
# cross_val_score(pipe, df_train['query'], df_train['category'], cv=5, scoring='accuracy')
print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

accuracy: nan (+/- nan)
CPU times: user 279 µs, sys: 79 µs, total: 358 µs
Wall time: 324 µs


In [21]:
%%time
X = tfidf.fit_transform(df_train['query'])
display(X.shape)

(485458, 14446)

CPU times: user 2.63 s, sys: 34.8 ms, total: 2.66 s
Wall time: 2.66 s


In [15]:
%%time
# train model
# pipe.fit(X, df_train['category'])

clf_svc.fit(X, df_train['category'])

with open(model_filepath, 'wb') as filepath:
    pickle.dump(clf_svc, filepath)
print('trained svm model and exported to', model_filepath)

trained svm model and exported to ../models/supportvectormachine.pckl
CPU times: user 2h 38min 8s, sys: 21min 32s, total: 2h 59min 40s
Wall time: 22min 41s


In [16]:
%%time
# check model
df_test = pd.read_csv(split_testing_data_filepath, index_col=0)
X_test = tfidf.transform(df_test['query'])
print('read in test data', X_test.shape)
y_predicted = clf_svc.predict(X_test)
print('computed', len(y_predicted), 'predictions of test data')

read in test data (121365, 14446)
computed 121365 predictions of test data
CPU times: user 1.86 s, sys: 693 ms, total: 2.55 s
Wall time: 2.55 s


In [17]:
print('number of correct test predictions:', sum(y_predicted == df_test['category']))
print('number of incorrect predictions:', sum(y_predicted != df_test['category']))
print('ratio of correct test predictions:', round(sum(y_predicted == df_test['category'])/len(df_test),3))
print('')

number of correct test predictions: 73139
number of incorrect predictions: 48226
ratio of correct test predictions: 0.603



In [22]:
%%time
scores = cross_val_score(clf_svc, X, df_train['category'], cv=3, scoring='accuracy')
# this is plenty stable:
print("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

accuracy: 0.59 (+/- 0.00)


In [None]:
with open(outputmodel_filepath, 'wb') as filepath:
    pickle.dump(clf_svc, filepath)
print('trained final svm model and exported to', outputmodel_filepath)


In [None]:
%%time
parameters = {
    'max_iter':[300,1000,3000],
    'C': [0.5,0,7,1]
}
parameter_search = GridSearchCV(clf_svc,parameters)

In [26]:
with open(outputmodel_filepath, 'wb') as filepath:
    pickle.dump(clf_svc, filepath)
print('trained final svm model and exported to', outputmodel_filepath)

trained final svm model and exported to ../models/final_model_object.pckl


In [None]:
score(X, y[, sample_weight])