In [0]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import check_classification_targets
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import KFold

pairwise_distances
Compute the distance matrix from a vector array X and optional Y

sklearn.utils.multiclass.type_of_target(y)
The returned type is the most specific type that can be inferred. For example:
binary is more specific but compatible with multiclass.
multiclass of integers is more specific but compatible with continuous.
multilabel-indicator is more specific but compatible with multiclass-multioutput.

In [3]:
# Hyper-parameters and training data.

# Distance metric handed to pairwise_distances when predicting.
metric = 'euclidean'

# Newsgroup categories the classifier is trained and evaluated on.
cats = [
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'rec.motorcycles',
    'misc.forsale',
]

# Training split restricted to `cats`, with headers, footers and quotes removed.
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

# TF-IDF features of the training documents and their target labels.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Validate that y is usable as a classification target.
check_classification_targets(y)
n_samples, n_features = X.shape

# Encode the labels as consecutive integer indices; `classes` keeps the
# original label belonging to each index.
le = LabelEncoder()
y_indices = le.fit_transform(y)
classes = le.classes_
n_classes = classes.size

# Number of samples belonging to each class (stored as floats).
n_cluster = np.bincount(y_indices, minlength=n_classes).astype(np.float64)

# One row per class: the mean feature vector of that class's samples.
centroids = np.empty((n_classes, n_features), dtype=np.float64)
for current_class in range(n_classes):
    # Boolean mask selecting the samples of the current class.
    center_mask = y_indices == current_class
    centroids[current_class] = X[center_mask].mean(axis=0)

In [0]:
def get_vectorizer_array(query):
    """Vectorize a single raw text query with the module-level ``vectorizer``.

    The query is wrapped in a one-element list, passed through
    ``vectorizer.transform`` and the result is converted with ``.toarray()``.
    """
    single_document_batch = [query]
    encoded = vectorizer.transform(single_document_batch)
    return encoded.toarray()
def pred(X):
    """Return the label of the nearest centroid for every row of ``X``.

    Distances between the rows of ``X`` and the rows of the module-level
    ``centroids`` are computed with ``pairwise_distances`` using the
    module-level ``metric``. For each row, the index of the smallest distance
    is looked up in the module-level ``classes`` array.
    """
    distances = pairwise_distances(X, centroids, metric=metric)
    nearest_centroid_index = distances.argmin(axis=1)
    return classes[nearest_centroid_index]

In [11]:
# Held-out test split: same categories and the same header/footer/quote
# stripping as the training data.
newsgroups_test = fetch_20newsgroups(subset='test', categories=cats, remove=('headers', 'footers', 'quotes'))
x_testdata = newsgroups_test.data
y_test = newsgroups_test.target
# [document, label] pairs, kept so individual test cases can still be inspected.
testdata = [[a_, b_] for a_, b_ in zip(x_testdata, y_test)]

# Vectorize and classify the whole test set in one call instead of
# transforming and densifying one document at a time. The TF-IDF matrix stays
# sparse; pairwise_distances accepts sparse input for the 'euclidean' metric.
predictions = pred(vectorizer.transform(x_testdata))
# Compare labels numerically rather than through their string representations.
correct = int(np.sum(predictions == np.asarray(y_test)))

# Accuracy as a percentage.
result = str(correct / len(testdata) * 100) + " %"

print("Accuracy before K-Folding: %s" % result)

Accuracy before K-Folding: 70.80168776371309 %


In [12]:
# K-fold cross-validation of the nearest-centroid classifier on the full corpus.
# NOTE: this cell rebinds X and y to the raw documents / labels and, on every
# fold, refits the shared `vectorizer` and overwrites `centroids` and `classes`,
# because `pred` and `get_vectorizer_array` read them as module-level names.
newsgroups = fetch_20newsgroups(subset='all', categories=cats, remove=('headers', 'footers', 'quotes'))
X = np.asarray(newsgroups.data)
y = np.asarray(newsgroups.target)
print(X.shape)
K = 5
kf = KFold(n_splits=K)
for train_index, test_index in kf.split(X):
    # Fit the TF-IDF vocabulary on the training fold only; the test fold is
    # kept as raw text and vectorized below with that fold's vocabulary.
    X_train, X_test = vectorizer.fit_transform(X[train_index]), X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    n_samples, n_features = X_train.shape
    le = LabelEncoder()
    y_indices = le.fit_transform(y_train)
    classes = le.classes_
    n_classes = classes.size

    # One row per class: the mean feature vector of that class's training samples.
    centroids = np.empty((n_classes, n_features), dtype=np.float64)
    # Number of training samples in each class.
    n_cluster = np.zeros(n_classes)
    for current_class in range(n_classes):
        # Boolean mask selecting the training samples of the current class.
        center_mask = y_indices == current_class
        n_cluster[current_class] = np.sum(center_mask)
        centroids[current_class] = X_train[center_mask].mean(axis=0)

    # [document, label] pairs of the current test fold.
    testdata = [[a_, b_] for a_, b_ in zip(X_test, y_test)]

    # Classify the whole test fold in one sparse batch instead of transforming
    # and densifying one document at a time; compare labels numerically.
    predictions = pred(vectorizer.transform(X_test))
    correct = int(np.sum(predictions == y_test))

    # Accuracy of this fold as a percentage.
    result = str(correct / len(testdata) * 100) + "%"

    print("Accuracy after %d-Folding: %s" %( K, result))

(5932,)
Accuracy after 5-Folding: 71.44060657118787%
Accuracy after 5-Folding: 72.03032855939342%
Accuracy after 5-Folding: 73.10286677908938%
Accuracy after 5-Folding: 72.76559865092749%
Accuracy after 5-Folding: 72.93423271500843%


Another Example

In [13]:
# NearestCentroid is imported from the public `sklearn.neighbors` package: the
# `sklearn.neighbors.nearest_centroid` submodule path was deprecated and is
# no longer importable in newer scikit-learn releases.
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# NOTE: unlike the cells above, no `categories` or `remove` arguments are
# passed here, so this example uses whatever fetch_20newsgroups returns by
# default for each subset.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Raw text -> token counts -> TF-IDF weights -> nearest-centroid classifier.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', NearestCentroid()),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(metrics.classification_report(y_test, predicted))



              precision    recall  f1-score   support

           0       0.75      0.49      0.60       319
           1       0.44      0.76      0.56       389
           2       0.75      0.68      0.71       394
           3       0.71      0.59      0.65       392
           4       0.81      0.71      0.76       385
           5       0.83      0.66      0.74       395
           6       0.49      0.88      0.63       390
           7       0.86      0.76      0.80       396
           8       0.91      0.86      0.89       398
           9       0.85      0.79      0.82       397
          10       0.95      0.80      0.87       399
          11       0.94      0.66      0.78       396
          12       0.40      0.70      0.51       393
          13       0.84      0.49      0.62       396
          14       0.89      0.72      0.80       394
          15       0.55      0.73      0.63       398
          16       0.68      0.76      0.71       364
          17       0.97    