In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin, ClassifierMixin, BaseEstimator
from sklearn.preprocessing import FunctionTransformer, label_binarize
from sklearn.pipeline import Pipeline
from sklearn.svm import *
from sklearn.linear_model import *
from sklearn.naive_bayes import *
from sklearn.ensemble import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics import accuracy_score
from scipy import sparse
import numpy as np
from collections import OrderedDict
import warnings
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# `SVM` with `NB` features

This notebook implements the NB-SVM classifier from [this paper](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf) that has become a classic (and competitive) baseline.

## toy dataset

Working example borrowed from [Stanford IR online text](https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html)

| docID | text                                | label     |
|-------|-------------------------------------|-----------|
| 1     | chinese beijing chinese             | CHINA     |
| 2     | chinese chinese shanghai            | CHINA     |
| 3     | chinese macao                       | CHINA     |
| 4     | tokyo japan chinese                 | NOT_CHINA |
| 5     | chinese chinese chinese tokyo japan | ???       |


In [2]:
doc_one = "chinese beijing chinese"
label_one = 1

doc_two = "chinese chinese shanghai"
label_two = 1

doc_three = "chinese macao"
label_three = 1

doc_four = "tokyo japan chinese"
label_four = 0

all_docs = [doc_one, doc_two, doc_three, doc_four]
all_labels = [label_one, label_two, label_three, label_four]

## traditional `bernoulli` `vectorizer`

The paper found binary features to be more effective than raw counts.

In [3]:
vectorizer_bernoulli = CountVectorizer(
    binary=True
)
X_bernoulli = vectorizer_bernoulli.fit_transform(all_docs)

In [4]:
vectorizer_bernoulli.vocabulary_

{'beijing': 0, 'chinese': 1, 'japan': 2, 'macao': 3, 'shanghai': 4, 'tokyo': 5}

In [5]:
X_bernoulli.toarray()

array([[1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1]], dtype=int64)

## `NB` transformer

This is a custom `transformer` that will use the Naive Bayes conditional probabilities as feature values.  See [the paper](https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf) for details.

The key thing to recognize is that this method is built off of the assumption of a `binary` (only two labels) problem.  It can be expanded to `multiclass` utilizing the `one-v-all` approach, but this also requires a **different** transformation for **each** label, thus the inclusion of `label_of_interest` as an argument to the `NBTransformer`.

In [6]:
class NBTransformer(TransformerMixin):
    """
    Feature transformation to utilize NaiveBayes conditional probabilities.
    From https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf
    """
    def __init__(self, all_labels, label_of_interest, smoothing_factor=1):
        """
        :param all_labels: <list> of all labels in training set
        :param label_of_interest: label considered `positive`
        :param smoothing_factor: smoothing value to be applied to transformation
        """
        self.labels = all_labels
        self.label_of_interest = label_of_interest
        self.smoothing_factor = smoothing_factor
        self._r = None
        
    def _get_positive_rows(self):
        """
        Finds all rows of data considered to be the `label_of_interest`
        :return: <list> of indexes
        """
        # ensure labels as np array
        try:
            as_array = np.array(self.labels)
        except:
            as_array = self.labels
        # get indices of label_of_interest
        idxs = np.where(as_array == self.label_of_interest)[0]
        return idxs  
    
    def _get_negative_rows(self):
        """
        Finds all rows of data considered NOT to be the `label_of_interest`
        :return: <list> of indexes
        """
        # ensure labels as np array
        try:
            as_array = np.array(self.labels)
        except:
            as_array = self.labels
        # get indices of label_of_interest
        idxs = np.where(as_array != self.label_of_interest)[0]
        return idxs
    
    def _get_p_not_p(self, feature_matrix):
        """
        Builds the *unnormalized* `p` and `not_p` vector (representing the conditional probabilities)
        :param feature_matrix: sparse representation of features
        :param alpha: smoothing factor
        :return: <tuple> (p, not_p)
        """
        # find indices of p and not_p
        positive_rows = self._get_positive_rows()
        negative_rows = self._get_negative_rows()
        # get summed vectors
        positive_summed = np.zeros((1, feature_matrix.get_shape()[1]))
        negative_summed = np.zeros((1, feature_matrix.get_shape()[1]))
        for i in range(feature_matrix.get_shape()[0]):
            if i in positive_rows:
                positive_summed += feature_matrix.getrow(i).toarray()
            else:
                negative_summed += feature_matrix.getrow(i).toarray()
        # smooth
        p = positive_summed + self.smoothing_factor
        not_p = negative_summed + self.smoothing_factor
        return p, not_p

    def _get_r(self, un_normalized_p, un_normalized_not_p):
        """
        Builds the `r` vector representing the `log` ratio of *normalized* `p` to `not_p`
        :param un_normalized_p: unnormalized conditional probability vector for `p` (output of `._get_p_not_p()`)
        :param un_normalized_not_p: unnormalized conditional probability vector for ` not_p` (output of `._get_p_not_p()`)
        :return: `r`
        """
        # normalize
        p = un_normalized_p / np.sum(un_normalized_p)
        not_p = un_normalized_not_p / np.sum(un_normalized_not_p)
        # calculate r
        r = np.log(p/not_p)
        return r

#     def fit_transform(self, X, *_):
#         """
#         Applies the transformation of features
#         :param X: original `sparse feature matrix`
#         :return transformed `sparse feature matrix`
#         """
#         if not np.any(self._r):
#             # this is being called during training, so must build self._r
#             # get positive rows
#             pos_idxs = self._get_positive_rows()
#             # get negative rows
#             neg_idxs = self._get_negative_rows()
#             # get p, not_p
#             _p, _not_p = self._get_p_not_p(X)
#             # get r
#             self._r = self._get_r(_p, _not_p)
#         return sparse.csr_matrix(X.multiply(self._r))
    
    def transform(self, X, *_):
        """
        Applies the transformation of features
        :param X: original `sparse feature matrix`
        :return transformed `sparse feature matrix`
        """
        if not np.any(self._r):
            # this is being called during training, so must build self._r
            # get positive rows
            pos_idxs = self._get_positive_rows()
            # get negative rows
            neg_idxs = self._get_negative_rows()
            # get p, not_p
            _p, _not_p = self._get_p_not_p(X)
            # get r
            self._r = self._get_r(_p, _not_p)
        return sparse.csr_matrix(X.multiply(self._r))
    
    def fit(self, *_):
        return self

In [7]:
nb_transformer = NBTransformer(all_labels, 1)

### original feature space

In [8]:
original_features = vectorizer_bernoulli.fit_transform(all_docs)
original_features.toarray()

array([[1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 1]], dtype=int64)

### transformed feature space

In [9]:
nb_transformed = nb_transformer.transform(original_features)
nb_transformed.toarray()

array([[ 0.40546511,  0.40546511,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.        ,  0.40546511,  0.        ,  0.        ,  0.40546511,
         0.        ],
       [ 0.        ,  0.40546511,  0.        ,  0.40546511,  0.        ,
         0.        ],
       [ 0.        ,  0.40546511, -0.98082925,  0.        ,  0.        ,
        -0.98082925]])

## Train

The paper found `squared loss` and `l2` penalizing to be most effective.

In [10]:
svm_clf = LinearSVC(            
    loss='squared_hinge',
    penalty='l2',
    random_state = 1,         # to ensure reproducible results
    class_weight='balanced',
    dual=False
)

In [11]:
svm_clf.get_params()

{'C': 1.0,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': 1,
 'tol': 0.0001,
 'verbose': 0}

In [12]:
svm_clf.fit(nb_transformed, np.array(all_labels))

LinearSVC(C=1.0, class_weight='balanced', dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
     verbose=0)

Next we interpolate the weights using the following equation: <br>
    $w' = (1 - \beta)\bar{w} + \beta w$ where $\bar{w}$ is the mean magnitude of all the weights

In [13]:
svm_clf.coef_

array([[ 0.17147871,  0.21355567,  0.7278366 ,  0.17147871,  0.17147871,
         0.7278366 ]])

In [14]:
interpolation_factor = 0.25

In [15]:
mean_magnitude = np.sum(svm_clf.coef_)/svm_clf.coef_.shape[1]
mean_magnitude_vector = np.full(svm_clf.coef_.shape, mean_magnitude)
mean_magnitude_vector

array([[ 0.36394416,  0.36394416,  0.36394416,  0.36394416,  0.36394416,
         0.36394416]])

In [16]:
svm_clf.coef_ = (1 - interpolation_factor) * mean_magnitude_vector + \
                                interpolation_factor * svm_clf.coef_
svm_clf.coef_

array([[ 0.3158278 ,  0.32634704,  0.45491727,  0.3158278 ,  0.3158278 ,
         0.45491727]])

## Test

In [17]:
test_doc = "chinese chinese chinese tokyo japan"

We first must `vectorize` the test item into the original feature space.

In [18]:
test_features_original = vectorizer_bernoulli.transform([test_doc])

Then we use the previously learned transformer to apply the NB-transformation.

In [19]:
test_features = nb_transformer.transform(test_features_original)
test_features.toarray()

array([[ 0.        ,  0.40546511, -0.98082925,  0.        ,  0.        ,
        -0.98082925]])

The model predicts the test document to be in the `not China` class with relative confidence.  

Note: This is opposite the prediction made in the Stanford IR worked example because they use `multinomial NB` and the presence of "China" three times outweighed the presence of the other words.

In [20]:
svm_clf.predict(test_features), svm_clf.decision_function(test_features)

(array([0]), array([-0.2333769]))

## custom `classifier`

As explained above, this transformation is built for `binary` classification.  The `multiclass` version requires the application of `one-v-all` to a traditional `SVM` classifier.  This is built below.

In [21]:
class NB_plus_Classifier(BaseEstimator, ClassifierMixin):
    """
    Implementation of this paper (https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf) 
    using one-v-rest for multiclass datasets
    """
    def __init__(self, 
                 list_of_classes=[0,1],
                 clf=LinearSVC(     
                    loss='squared_hinge',
                    penalty='l2',
                    random_state = 1,         # to ensure reproducible results
                    class_weight='balanced'
                    ),
                 interpolation_factor=0.25
                ):
        """
        :param list_of_classes: <list> of all possible classes
        :param clf: instance of a `sklearn` `Classifier`
        :param interpolation_factor: amount to interpolate NB and Linear Classifier
        """
        self.list_of_classes = list_of_classes
        self.multiclass, self.ovr_classifiers = self._build_ovr_classifiers(list_of_classes, clf)
        self.interpolation_factor = interpolation_factor if interpolation_factor else 1.0
        self.nb_transformers = {}
        
    def _build_ovr_classifiers(self, list_of_all_possible_classes, classifier_instance):
        """
        Builds a classifier for each class in the 'one-v-all' approach
        """
        all_clfs = OrderedDict()
        for i in list_of_all_possible_classes:
            all_clfs[i] = classifier_instance
        if len(all_clfs) == 2:
            # is just a binary problem!
            return False, all_clfs[list_of_all_possible_classes[0]]
        elif len(all_clfs) > 2:
            return True, all_clfs
        else:
            raise Exception("only one label was provided!")
        
    def _binarize_labels(self, y):
        distinct_labels = list(set(y))
        all_labels = label_binarize(y, distinct_labels).transpose()
        labels_dict = {}
        for i in range(len(distinct_labels)):
            labels_dict[distinct_labels[i]] = all_labels[i]
        return labels_dict
        
    def fit(self, X, y):
        if not self.multiclass:
            # transform X
            # considering last label in self.list_of_classes as the `positive`
            self.nb_transformers = NBTransformer(y, self.list_of_classes[-1])
            X_transformed = self.nb_transformers.transform(X)
            # just handle like a "vanilla" sklearn classifier
            self.ovr_classifiers.fit(X_transformed, y)
            # update weights with interpolation
            try:
                self.ovr_classifiers.coef_ = self._interpolate(self.ovr_classifiers.coef_)
            except:
                warnings.warn(
                    "the classifier you instantiated with does not have an attribute `.coef_`"
                    "interpolation will not occur"
                )
        else:
            # handle with 'one-v-rest' approach
            # binarize labels
            labels_dict = self._binarize_labels(y)
            if labels_dict.keys() != self.ovr_classifiers.keys():
                raise Exception(
                    "mismatch in labels during fit() and during class instantiation; {} != {}".format(
                        labels_dict.keys(), self.ovr_classifiers.keys()
                    )
                  )
            for l, clf in self.ovr_classifiers.items():
                # transform X for this particular label
                self.nb_transformers[l] = NBTransformer(y, l)
                X_transformed = self.nb_transformers[l].transform(X)
                # fit individual classifier with transformed data
                clf.fit(X_transformed, labels_dict[l])
                # update weights with interpolation
                try:
                    self.ovr_classifiers.coef_ = self._interpolate(self.ovr_classifiers.coef_)
                except:
                    if l == list(self.ovr_classifiers.keys())[0]:
                        warnings.warn(
                            "the classifier you instantiated with does not have an attribute `.coef_`"
                            "interpolation will not occur"
                        )
            
    def _interpolate(self, coeffs):
        # calculate mean magnitude
        mean_magnitude = np.sum(coeffs)/coeffs.shape[1]
        mean_magnitude_vector = np.full(coeffs.shape, mean_magnitude)
        # build interpolated coeffs
        interpolated_coeffs = (1 - self.interpolation_factor) * mean_magnitude_vector + \
                                self.interpolation_factor * coeffs
        return interpolated_coeffs
        
    def decision_function(self, X):
        if not self.multiclass:
            # transform X
            X_transformed = self.nb_transformers.transform(X)
            try:
                final_distance = self.ovr_classifiers.decision_function(X_transformed)
                return final_distance
            except:
                raise Exception(
                    "the classifier you instantiated with does not have a method for `decision_function()`"
                )
        else:
            # handle with 'one-v-rest' approach
            all_distances = None
            for l, clf in self.ovr_classifiers.items():
                # transform X
                X_transformed = self.nb_transformers[l].transform(X)
                try:
                    distance = clf.decision_function(X_transformed)
                except:
                    raise Exception(
                        "the classifier you instantiated with does not have a method for `decision_function()`"
                    )
                if np.any(all_distances):
                    all_distances = np.vstack((all_distances, distance))
                else:
                    all_distances = distance
            return np.array(all_distances)
        
    def predict(self, X):
        if not self.multiclass:
            # transform X
            X_transformed = self.nb_transformers.transform(X)
            return self.ovr_classifiers.predict(X_transformed)
        else:
            # get all boundary distances
            all_distances = self.decision_function(X)
            # return most positive (or least negative) margin
            return np.argmax(all_distances, axis=0)

## `binary` test

The above custom `classifier` can still be used in the `binary` case.

In [22]:
binary = NB_plus_Classifier()
binary.fit(vectorizer_bernoulli.fit_transform(all_docs), np.array(all_labels))

In [23]:
test_features = vectorizer_bernoulli.transform([test_doc])

In [24]:
binary.decision_function(test_features), binary.predict(test_features)

(array([-0.23337768]), array([0]))

## `multiclass` test

Here is a simple toy dataset to test the `multiclass` case.

In [25]:
obvious_docs = [
    "cat",
    "cat",
    "cat",
    "pencil",
    "pencil",
    "pencil",
    "car",
    "car",
    "car",
]

obvious_labels = [
    0,
    0,
    0,
    1,
    1,
    1,
    2,
    2,
    2
]

test_docs = [
    "cat",
    "pen",
    "pencil",
    "car"
]

### traditional `bernoulli` `vectorizer`

In [26]:
multiclass_bernoulli_vectorizer = CountVectorizer(
    binary=True
)

multiclass_original_features = multiclass_bernoulli_vectorizer.fit_transform(obvious_docs)

In [27]:
multiclass_original_features.toarray()

array([[0, 1, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       [1, 0, 0]], dtype=int64)

### Train

In [28]:
multiclass = NB_plus_Classifier([0,1,2])
multiclass.fit(multiclass_bernoulli_vectorizer.fit_transform(obvious_docs), obvious_labels)


the classifier you instantiated with does not have an attribute `.coef_`interpolation will not occur



### Test

In [29]:
multiclass_test_features = multiclass_bernoulli_vectorizer.transform(test_docs)
multiclass_test_features.toarray()

array([[0, 1, 0],
       [0, 0, 0],
       [0, 0, 1],
       [1, 0, 0]])

The classifier generates predictions for each class.  Below, each row is a classifier trained for a different label.  The values in the vector represent the margin of each data point from the decision boundary.

Note: Since the second data point ("pen") was never seen in the training vocabulary, the classifiers all generate the same prediction, none of which are positive (implying a low level of confidence)

In [30]:
for i, row in zip([0,1,2], multiclass.decision_function(multiclass_test_features)):
    print(i, row)

0 [ 0.33105273 -0.46413668 -0.89943101 -1.23883467]
1 [-0.89943229 -0.46413668  0.33105039 -1.23883467]
2 [-0.89943229 -0.46413668 -0.89943101  0.95106629]


We then select the "most confident" classifier for each data point, where "most confident" is the one with the largest positive margin.

In [31]:
multiclass.predict(multiclass_test_features)

array([0, 0, 1, 2])

# Experiment on `20Newsgroups`

## `alt.atheism` v. `talk.religion.misc`

In [32]:
# categories = None
categories = [
    'alt.atheism', 'talk.religion.misc',
]

In [33]:
remove = (
    'headers', 
#     'footers', 
#     'quotes'
)

## Data

In [34]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

idx2class_lookup = dict((i,c) for i,c in enumerate(data_train.target_names))
idx2class_lookup

{0: 'alt.atheism', 1: 'talk.religion.misc'}

In [35]:
# get label distribution
num_pos = list(data_train.target).count(1)
num_neg = list(data_train.target).count(0)
num_pos, num_neg

(377, 480)

In [36]:
# determine intercept (log(N+/N-))
intercept = np.log(num_pos/num_neg)
intercept

-0.24154091645392586

## Unigrams

### Vectorize

In [37]:
newsgroups_vectorizer_uni = CountVectorizer(
    binary=True
)

newsgroups_vectorizer_tfidf = TfidfVectorizer()

In [38]:
newsgroups_train_features_uni = newsgroups_vectorizer_uni.fit_transform(data_train.data)
newsgroups_train_features_uni

<857x17440 sparse matrix of type '<class 'numpy.int64'>'
	with 136539 stored elements in Compressed Sparse Row format>

In [39]:
newsgroups_test_features_uni = newsgroups_vectorizer_uni.transform(data_test.data)
newsgroups_test_features_uni

<570x17440 sparse matrix of type '<class 'numpy.int64'>'
	with 86232 stored elements in Compressed Sparse Row format>

### transformed - nb

In [40]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    random_state=1,
    dual=False
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

In [41]:
newsgroups_nb_plus_clfs = OrderedDict()
for clf_name, clf in all_classifiers.items():
    newsgroups_nb_plus_clfs[clf_name] = NB_plus_Classifier(
        list(map(lambda x: x[0], idx2class_lookup.items())),
        clf
)

In [42]:
# transformed - nb
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("training {} - nb transformed".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training svm_from_paper - nb transformed
training sgd - nb transformed
training ridge - nb transformed



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training perceptron - nb transformed
training passive_aggressive - nb transformed
training adaboost - nb transformed



the classifier you instantiated with does not have an attribute `.coef_`interpolation will not occur



In [43]:
newsgroups_nb_transformed_uni_results = OrderedDict()
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("testing transformed {}".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_nb_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_nb_transformed_uni_results

testing transformed svm_from_paper
testing transformed sgd
testing transformed ridge
testing transformed perceptron
testing transformed passive_aggressive
testing transformed adaboost


OrderedDict([('svm_from_paper', 0.76315789473684215),
             ('sgd', 0.81929824561403508),
             ('ridge', 0.78596491228070176),
             ('perceptron', 0.83157894736842108),
             ('passive_aggressive', 0.8403508771929824),
             ('adaboost', 0.75964912280701757)])

### no transformation

In [44]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    dual=False,
    C=0.1,
    random_state=1
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01,
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01,
)

In [45]:
# no transformation count
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - no transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - no transformation
training baseline sgd - no transformation
training baseline ridge - no transformation
training baseline perceptron - no transformation
training baseline passive_aggressive - no transformation
training baseline random_forest - no transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline adaboost - no transformation
training baseline bernoulli_nb - no transformation
training baseline multinomial_nb - no transformation


In [46]:
newsgroups_no_transformation_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_no_transformation_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_no_transformation_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.77719298245614032),
             ('sgd', 0.756140350877193),
             ('ridge', 0.77017543859649118),
             ('perceptron', 0.76666666666666672),
             ('passive_aggressive', 0.77017543859649118),
             ('random_forest', 0.77894736842105261),
             ('adaboost', 0.75964912280701757),
             ('bernoulli_nb', 0.83157894736842108),
             ('multinomial_nb', 0.83684210526315794)])

### transformation - tfidf

In [47]:
newsgroups_train_features_uni = newsgroups_vectorizer_tfidf.fit_transform(data_train.data)
newsgroups_test_features_uni = newsgroups_vectorizer_tfidf.transform(data_test.data)

In [48]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    C=0.1,
    random_state=1,
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01
)

In [49]:
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - tfidf transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - tfidf transformation
training baseline sgd - tfidf transformation
training baseline ridge - tfidf transformation
training baseline perceptron - tfidf transformation
training baseline passive_aggressive - tfidf transformation
training baseline random_forest - tfidf transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline adaboost - tfidf transformation
training baseline bernoulli_nb - tfidf transformation
training baseline multinomial_nb - tfidf transformation


In [50]:
newsgroups_tfidf_transformed_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_tfidf_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_tfidf_transformed_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.79649122807017547),
             ('sgd', 0.81228070175438594),
             ('ridge', 0.81403508771929822),
             ('perceptron', 0.7929824561403509),
             ('passive_aggressive', 0.82105263157894737),
             ('random_forest', 0.75438596491228072),
             ('adaboost', 0.74210526315789471),
             ('bernoulli_nb', 0.83157894736842108),
             ('multinomial_nb', 0.83859649122807023)])

In [51]:
groups = [
    ("no_transform", newsgroups_no_transformation_uni_results),
    ("tfidf_transform", newsgroups_tfidf_transformed_uni_results), 
    ("nb_transform", newsgroups_nb_transformed_uni_results)
]

clfs = list(all_classifiers.keys())

traces = OrderedDict()
for i in range(len(groups)):
    transformation = groups[i][1]
    transformation_name = groups[i][0]
    scores = []
    for label in clfs:
        try:
            score = transformation[label]
        except:
            score = 0.0
        scores.append(score)
    traces["trace_{}".format(transformation_name)] = go.Bar(
        x=clfs,
        y=scores,
        name=transformation_name,
    )

data_ = [v for k, v in traces.items()]
layout_ = go.Layout(
    barmode='group',
    title='Transformation comparison: AthR (2.9)',
    yaxis=dict(
        range=[0, 1]
    )
)

fig_ = go.Figure(data=data_, layout=layout_)
iplot(fig_)    

## `comp.graphics` v. `comp.windows.x`

In [52]:
# categories = None
categories = [
    'comp.graphics', 'comp.windows.x',
#     'rec.sport.baseball', 'sci.crypt',
#     'alt.atheism', 'talk.religion.misc', 'comp.graphics', sci.space',
]

In [53]:
remove = (
    'headers', 
#     'footers', 
#     'quotes'
)

## Data

In [54]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

idx2class_lookup = dict((i,c) for i,c in enumerate(data_train.target_names))
idx2class_lookup

{0: 'comp.graphics', 1: 'comp.windows.x'}

In [55]:
# get label distribution
num_pos = list(data_train.target).count(1)
num_neg = list(data_train.target).count(0)
num_pos, num_neg

(593, 584)

In [56]:
# determine intercept (log(N+/N-))
intercept = np.log(num_pos/num_neg)
intercept

0.0152934161694984

## Unigrams

### Vectorize

In [57]:
newsgroups_vectorizer_uni = CountVectorizer(
    binary=True
)

newsgroups_vectorizer_tfidf = TfidfVectorizer()

In [58]:
newsgroups_train_features_uni = newsgroups_vectorizer_uni.fit_transform(data_train.data)
newsgroups_train_features_uni

<1177x21163 sparse matrix of type '<class 'numpy.int64'>'
	with 136868 stored elements in Compressed Sparse Row format>

In [59]:
newsgroups_test_features_uni = newsgroups_vectorizer_uni.transform(data_test.data)
newsgroups_test_features_uni

<784x21163 sparse matrix of type '<class 'numpy.int64'>'
	with 89865 stored elements in Compressed Sparse Row format>

### transformed - nb

In [60]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    random_state=1,
    dual=False
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

In [61]:
newsgroups_nb_plus_clfs = OrderedDict()
for clf_name, clf in all_classifiers.items():
    newsgroups_nb_plus_clfs[clf_name] = NB_plus_Classifier(
        list(map(lambda x: x[0], idx2class_lookup.items())),
        clf
)

In [62]:
# transformed - nb
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("training {} - nb transformed".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training svm_from_paper - nb transformed
training sgd - nb transformed
training ridge - nb transformed



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training perceptron - nb transformed
training passive_aggressive - nb transformed
training adaboost - nb transformed



the classifier you instantiated with does not have an attribute `.coef_`interpolation will not occur



In [63]:
newsgroups_nb_transformed_uni_results = OrderedDict()
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("testing transformed {}".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_nb_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_nb_transformed_uni_results

testing transformed svm_from_paper
testing transformed sgd
testing transformed ridge
testing transformed perceptron
testing transformed passive_aggressive
testing transformed adaboost


OrderedDict([('svm_from_paper', 0.82908163265306123),
             ('sgd', 0.85076530612244894),
             ('ridge', 0.85969387755102045),
             ('perceptron', 0.85331632653061229),
             ('passive_aggressive', 0.8660714285714286),
             ('adaboost', 0.81122448979591832)])

### no transformation

In [64]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    dual=False,
    C=0.1,
    random_state=1
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01,
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01,
)

In [65]:
# no transformation count
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - no transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - no transformation
training baseline sgd - no transformation
training baseline ridge - no transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline perceptron - no transformation
training baseline passive_aggressive - no transformation
training baseline random_forest - no transformation
training baseline adaboost - no transformation
training baseline bernoulli_nb - no transformation
training baseline multinomial_nb - no transformation


In [66]:
newsgroups_no_transformation_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_no_transformation_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_no_transformation_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.82397959183673475),
             ('sgd', 0.8482142857142857),
             ('ridge', 0.83418367346938771),
             ('perceptron', 0.83290816326530615),
             ('passive_aggressive', 0.82908163265306123),
             ('random_forest', 0.81377551020408168),
             ('adaboost', 0.81122448979591832),
             ('bernoulli_nb', 0.81760204081632648),
             ('multinomial_nb', 0.86352040816326525)])

### transformation - tfidf

In [67]:
newsgroups_train_features_uni = newsgroups_vectorizer_tfidf.fit_transform(data_train.data)
newsgroups_test_features_uni = newsgroups_vectorizer_tfidf.transform(data_test.data)

In [68]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    C=0.1,
    random_state=1,
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01
)

In [69]:
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - tfidf transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - tfidf transformation
training baseline sgd - tfidf transformation
training baseline ridge - tfidf transformation
training baseline perceptron - tfidf transformation
training baseline passive_aggressive - tfidf transformation
training baseline random_forest - tfidf transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline adaboost - tfidf transformation
training baseline bernoulli_nb - tfidf transformation
training baseline multinomial_nb - tfidf transformation


In [70]:
newsgroups_tfidf_transformed_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_tfidf_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_tfidf_transformed_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.8482142857142857),
             ('sgd', 0.8571428571428571),
             ('ridge', 0.85459183673469385),
             ('perceptron', 0.84311224489795922),
             ('passive_aggressive', 0.85459183673469385),
             ('random_forest', 0.80867346938775508),
             ('adaboost', 0.78061224489795922),
             ('bernoulli_nb', 0.81760204081632648),
             ('multinomial_nb', 0.86734693877551017)])

In [71]:
groups = [
    ("no_transform", newsgroups_no_transformation_uni_results),
    ("tfidf_transform", newsgroups_tfidf_transformed_uni_results), 
    ("nb_transform", newsgroups_nb_transformed_uni_results)
]

clfs = list(all_classifiers.keys())

traces = OrderedDict()
for i in range(len(groups)):
    transformation = groups[i][1]
    transformation_name = groups[i][0]
    scores = []
    for label in clfs:
        try:
            score = transformation[label]
        except:
            score = 0.0
        scores.append(score)
    traces["trace_{}".format(transformation_name)] = go.Bar(
        x=clfs,
        y=scores,
        name=transformation_name,
    )

data_ = [v for k, v in traces.items()]
layout_ = go.Layout(
    barmode='group',
    title='Transformation comparison: XGraph (1.8)',
    yaxis=dict(
        range=[0, 1]
    )
)

fig_ = go.Figure(data=data_, layout=layout_)
iplot(fig_)    

## `rec.sport.baseball` v. `sci.crypt`

In [72]:
# categories = None
categories = [
    'rec.sport.baseball', 'sci.crypt',
#     'alt.atheism', 'talk.religion.misc', 'comp.graphics', sci.space',
]

In [73]:
remove = (
    'headers', 
#     'footers', 
#     'quotes'
)

## Data

In [74]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

idx2class_lookup = dict((i,c) for i,c in enumerate(data_train.target_names))
idx2class_lookup

{0: 'rec.sport.baseball', 1: 'sci.crypt'}

In [75]:
# get label distribution
num_pos = list(data_train.target).count(1)
num_neg = list(data_train.target).count(0)
num_pos, num_neg

(595, 597)

In [76]:
# determine intercept (log(N+/N-))
intercept = np.log(num_pos/num_neg)
intercept

-0.0033557078469723042

## Unigrams

### Vectorize

In [77]:
newsgroups_vectorizer_uni = CountVectorizer(
    binary=True
)

newsgroups_vectorizer_tfidf = TfidfVectorizer()

In [78]:
newsgroups_train_features_uni = newsgroups_vectorizer_uni.fit_transform(data_train.data)
newsgroups_train_features_uni

<1192x21197 sparse matrix of type '<class 'numpy.int64'>'
	with 176313 stored elements in Compressed Sparse Row format>

In [79]:
newsgroups_test_features_uni = newsgroups_vectorizer_uni.transform(data_test.data)
newsgroups_test_features_uni

<793x21197 sparse matrix of type '<class 'numpy.int64'>'
	with 97031 stored elements in Compressed Sparse Row format>

### transformed - nb

In [80]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    random_state=1,
    dual=False
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

In [81]:
newsgroups_nb_plus_clfs = OrderedDict()
for clf_name, clf in all_classifiers.items():
    newsgroups_nb_plus_clfs[clf_name] = NB_plus_Classifier(
        list(map(lambda x: x[0], idx2class_lookup.items())),
        clf
)

In [82]:
# transformed - nb
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("training {} - nb transformed".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training svm_from_paper - nb transformed
training sgd - nb transformed
training ridge - nb transformed



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training perceptron - nb transformed
training passive_aggressive - nb transformed
training adaboost - nb transformed



the classifier you instantiated with does not have an attribute `.coef_`interpolation will not occur



In [83]:
newsgroups_nb_transformed_uni_results = OrderedDict()
for clf_name, clf in newsgroups_nb_plus_clfs.items():
    print("testing transformed {}".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_nb_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_nb_transformed_uni_results

testing transformed svm_from_paper
testing transformed sgd
testing transformed ridge
testing transformed perceptron
testing transformed passive_aggressive
testing transformed adaboost


OrderedDict([('svm_from_paper', 0.84489281210592682),
             ('sgd', 0.96973518284993698),
             ('ridge', 0.97856242118537196),
             ('perceptron', 0.97982345523329129),
             ('passive_aggressive', 0.98486759142496849),
             ('adaboost', 0.9319041614123581)])

### no transformation

In [84]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    dual=False,
    C=0.1,
    random_state=1
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01,
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01,
)

In [85]:
# no transformation count
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - no transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - no transformation
training baseline sgd - no transformation
training baseline ridge - no transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline perceptron - no transformation
training baseline passive_aggressive - no transformation
training baseline random_forest - no transformation
training baseline adaboost - no transformation
training baseline bernoulli_nb - no transformation
training baseline multinomial_nb - no transformation


In [86]:
newsgroups_no_transformation_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_no_transformation_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_no_transformation_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.95838587641866335),
             ('sgd', 0.95586380832282469),
             ('ridge', 0.94703656998738961),
             ('perceptron', 0.95208070617906682),
             ('passive_aggressive', 0.95964691046658257),
             ('random_forest', 0.94955863808322827),
             ('adaboost', 0.9319041614123581),
             ('bernoulli_nb', 0.94703656998738961),
             ('multinomial_nb', 0.98612862547288782)])

### transformation - tfidf

In [87]:
newsgroups_train_features_uni = newsgroups_vectorizer_tfidf.fit_transform(data_train.data)
newsgroups_test_features_uni = newsgroups_vectorizer_tfidf.transform(data_test.data)

In [88]:
all_classifiers = OrderedDict()

all_classifiers["svm_from_paper"] = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    C=0.1,
    random_state=1,
)

all_classifiers["sgd"] = SGDClassifier(
    n_iter=50,
    penalty='elasticnet',
    random_state=1,
)

all_classifiers["ridge"] = RidgeClassifier(
    tol=1e-2,
    solver='lsqr',
    random_state=1,
)

all_classifiers["perceptron"] = Perceptron(
    n_iter=50,
    random_state=1,
)

all_classifiers["passive_aggressive"] = PassiveAggressiveClassifier(
    n_iter=50,
    random_state=1,
)

all_classifiers["random_forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["adaboost"] = AdaBoostClassifier(
    n_estimators=100,
    random_state=1,
)

all_classifiers["bernoulli_nb"] = BernoulliNB(
    alpha=0.01
)

all_classifiers["multinomial_nb"] = MultinomialNB(
    alpha=0.01
)

In [89]:
for clf_name, clf in all_classifiers.items():
    print("training baseline {} - tfidf transformation".format(clf_name))
    clf.fit(newsgroups_train_features_uni, data_train.target)

training baseline svm_from_paper - tfidf transformation
training baseline sgd - tfidf transformation
training baseline ridge - tfidf transformation
training baseline perceptron - tfidf transformation
training baseline passive_aggressive - tfidf transformation
training baseline random_forest - tfidf transformation



In Ridge, only 'sag' solver can currently fit the intercept when X is sparse. Solver has been automatically changed into 'sag'.



training baseline adaboost - tfidf transformation
training baseline bernoulli_nb - tfidf transformation
training baseline multinomial_nb - tfidf transformation


In [90]:
newsgroups_tfidf_transformed_uni_results = OrderedDict()
for clf_name, clf in all_classifiers.items():
    print("testing baseline {} - no transformation".format(clf_name))
    preds = clf.predict(newsgroups_test_features_uni)
    newsgroups_tfidf_transformed_uni_results[clf_name] = accuracy_score(data_test.target, preds)
newsgroups_tfidf_transformed_uni_results

testing baseline svm_from_paper - no transformation
testing baseline sgd - no transformation
testing baseline ridge - no transformation
testing baseline perceptron - no transformation
testing baseline passive_aggressive - no transformation
testing baseline random_forest - no transformation
testing baseline adaboost - no transformation
testing baseline bernoulli_nb - no transformation
testing baseline multinomial_nb - no transformation


OrderedDict([('svm_from_paper', 0.95208070617906682),
             ('sgd', 0.96343001261034045),
             ('ridge', 0.9709962168978562),
             ('perceptron', 0.95586380832282469),
             ('passive_aggressive', 0.97477931904161408),
             ('random_forest', 0.94955863808322827),
             ('adaboost', 0.91298865069356872),
             ('bernoulli_nb', 0.94703656998738961),
             ('multinomial_nb', 0.97982345523329129)])

In [91]:
groups = [
    ("no_transform", newsgroups_no_transformation_uni_results),
    ("tfidf_transform", newsgroups_tfidf_transformed_uni_results), 
    ("nb_transform", newsgroups_nb_transformed_uni_results)
]

clfs = list(all_classifiers.keys())

traces = OrderedDict()
for i in range(len(groups)):
    transformation = groups[i][1]
    transformation_name = groups[i][0]
    scores = []
    for label in clfs:
        try:
            score = transformation[label]
        except:
            score = 0.0
        scores.append(score)
    traces["trace_{}".format(transformation_name)] = go.Bar(
        x=clfs,
        y=scores,
        name=transformation_name,
    )

data_ = [v for k, v in traces.items()]
layout_ = go.Layout(
    barmode='group',
    title='Transformation comparison: BbCrypt (0.5)',
    yaxis=dict(
        range=[0, 1]
    )
)

fig_ = go.Figure(data=data_, layout=layout_)
iplot(fig_)    