### This notebook contains the cross-validation experiments on the MHA pairs.

In [1]:
# adding dir to the Python path

import os
import sys
# Put the parent of the current working directory on the import path so that
# modules living next to the notebook folder can be imported
# (presumably this is the project root - verify against the repo layout).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Only append once, so re-running the cell does not duplicate the entry.
    sys.path.append(nb_dir)

In [2]:
from scripts.utilities import convert_to_pairwise, reader
import numpy as np
from sklearn import svm, model_selection
import pickle

In [3]:
# Load the 'mha' data set from 'output' through the project reader.
# NOTE(review): the exact type/shape of each returned item is defined by
# scripts.utilities.reader - confirm there.
vectors, ids, labels, pairs, bow_tags = reader('output', 'mha')

# Lookup from each id to its label; ids and labels are paired positionally.
id_label_dict = {item_id: item_label for item_id, item_label in zip(ids, labels)}

## Cross-validation Experiments

The CV splitter is `StratifiedKFold`, which keeps the class distribution approximately the same in every fold.
I use 5-fold cross-validation and a simple linear SVM model. For each feature set I report the mean accuracy over the folds and, as a rough 95% interval, ± two standard deviations of the fold accuracies.

In [4]:
# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds are
# the same on every run.
cv = model_selection.StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=3,
)

# Support vector classifier with a linear kernel and C = 0.1.
clf = svm.SVC(C=0.1, kernel='linear')

def accuracy_95ci(X, y):
    """Cross-validate the module-level ``clf`` on ``(X, y)`` and print a summary.

    The scores are obtained from ``model_selection.cross_val_score`` with the
    module-level splitter ``cv``. The printed line contains the mean of the
    fold scores and, after "+/-", twice their standard deviation.

    Args:
        X: feature matrix, passed unchanged to ``cross_val_score``.
        y: target labels, passed unchanged to ``cross_val_score``.

    Returns:
        None. The summary is only printed.
    """
    fold_scores = model_selection.cross_val_score(clf, X, y, cv=cv)
    mean_accuracy = fold_scores.mean()
    # Two standard deviations of the per-fold scores.
    # NOTE(review): this measures the spread across folds and is only a rough
    # stand-in for a 95% confidence interval of the mean.
    two_sigma = fold_scores.std() * 2
    print("Accuracy: {:.3f} (+/- {:.2f})".format(mean_accuracy, two_sigma))

In [5]:
# Every column of `vectors`; the tag bag-of-words matrix is left out

vectors_pairwise = convert_to_pairwise(vectors, ids, id_label_dict, pairs)
accuracy_95ci(*vectors_pairwise)

Accuracy: 0.643 (+/- 0.03)


In [6]:
# Length feature on its own (column 3 of `vectors`)

length_features = vectors[:, 3:4]
length_pairwise = convert_to_pairwise(length_features, ids, id_label_dict, pairs)
accuracy_95ci(*length_pairwise)

Accuracy: 0.622 (+/- 0.02)


In [7]:
# `vectors` with the length column (index 3) removed; tag bag-of-words left out

vectors_without_length = np.delete(vectors, 3, axis=1)
without_length_pairwise = convert_to_pairwise(vectors_without_length, ids, id_label_dict, pairs)
accuracy_95ci(*without_length_pairwise)

Accuracy: 0.617 (+/- 0.07)


In [8]:
# Character features on their own (columns 1-2 of `vectors`)

character_features = vectors[:, 1:3]
character_pairwise = convert_to_pairwise(character_features, ids, id_label_dict, pairs)
accuracy_95ci(*character_pairwise)

Accuracy: 0.527 (+/- 0.06)


In [9]:
# Relationship features on their own (columns 5-6 of `vectors`)

relationship_features = vectors[:, 5:7]
relationship_pairwise = convert_to_pairwise(relationship_features, ids, id_label_dict, pairs)
accuracy_95ci(*relationship_pairwise)

Accuracy: 0.490 (+/- 0.03)


In [10]:
# Summary features on their own (columns 7-26 of `vectors`)

summary_features = vectors[:, 7:27]
summary_pairwise = convert_to_pairwise(summary_features, ids, id_label_dict, pairs)
accuracy_95ci(*summary_pairwise)

Accuracy: 0.605 (+/- 0.07)


In [11]:
# Tag features on their own (columns 27-32 of `vectors`)

tag_features = vectors[:, 27:33]
tag_pairwise = convert_to_pairwise(tag_features, ids, id_label_dict, pairs)
accuracy_95ci(*tag_pairwise)

Accuracy: 0.575 (+/- 0.08)


In [12]:
# Title features on their own (column 33 to the end of `vectors`)

title_features = vectors[:, 33:]
title_pairwise = convert_to_pairwise(title_features, ids, id_label_dict, pairs)
accuracy_95ci(*title_pairwise)

Accuracy: 0.474 (+/- 0.04)


In [13]:
# Category feature on its own (column 0 of `vectors`)

category_features = vectors[:, 0:1]
category_pairwise = convert_to_pairwise(category_features, ids, id_label_dict, pairs)
accuracy_95ci(*category_pairwise)

Accuracy: 0.528 (+/- 0.05)


In [14]:
# Rating feature on its own (column 4 of `vectors`).
# Rating is controlled for, so the accuracy should match a random baseline.

rating_features = vectors[:, 4:5]
rating_pairwise = convert_to_pairwise(rating_features, ids, id_label_dict, pairs)
accuracy_95ci(*rating_pairwise)

Accuracy: 0.500 (+/- 0.00)


In [15]:
# Tag bag-of-words features on their own

bow_pairwise = convert_to_pairwise(bow_tags, ids, id_label_dict, pairs)
accuracy_95ci(*bow_pairwise)

Accuracy: 0.655 (+/- 0.08)


In [16]:
# ALL features: every column of `vectors` together with the tag bag-of-words

all_features_pairwise = convert_to_pairwise(
    vectors,
    ids,
    id_label_dict,
    pairs,
    other_vectors=bow_tags,
)
accuracy_95ci(*all_features_pairwise)

Accuracy: 0.672 (+/- 0.05)


In [17]:
# ALL features except length: drop column 3 of `vectors`, keep the tag bag-of-words

vectors_no_length = np.delete(vectors, 3, axis=1)
all_no_length_pairwise = convert_to_pairwise(
    vectors_no_length,
    ids,
    id_label_dict,
    pairs,
    other_vectors=bow_tags,
)
accuracy_95ci(*all_no_length_pairwise)

Accuracy: 0.666 (+/- 0.04)


In [18]:
# All features except length (column 3 dropped), tag bag-of-words included;
# discard all pairs with score difference < 20 (via min_diff=20)

vectors_minus_length = np.delete(vectors, 3, axis=1)
min_diff_pairwise = convert_to_pairwise(
    vectors_minus_length,
    ids,
    id_label_dict,
    pairs,
    other_vectors=bow_tags,
    min_diff=20,
)
accuracy_95ci(*min_diff_pairwise)

Accuracy: 0.683 (+/- 0.05)


In [19]:
# All tag features: the tag bag-of-words plus the tag columns (27-32) of `vectors`

tag_columns = vectors[:, 27:33]
all_tags_pairwise = convert_to_pairwise(
    bow_tags,
    ids,
    id_label_dict,
    pairs,
    other_vectors=tag_columns,
)
accuracy_95ci(*all_tags_pairwise)

Accuracy: 0.640 (+/- 0.05)
