In [0]:
import sklearn.neighbors
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn import model_selection

In [0]:
# Restrict the corpus to six of the newsgroups.
cats = [
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'rec.motorcycles',
    'misc.forsale',
]
# subset='all' requests the train and test partitions together.
newsgroups = fetch_20newsgroups(categories=cats, subset='all')

# Vectorize the raw posts with TF-IDF; the dataset's integer labels are the targets.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

In [0]:
# Nearest-neighbour classifier that looks at a single neighbour (k=1).
weights = 'uniform'
clf = sklearn.neighbors.KNeighborsClassifier(weights=weights, n_neighbors=1)

In [0]:
def cross_validation(data, target, classifier, cv=5):
    """Run k-fold cross-validation of ``classifier`` on ``data``/``target``.

    Thin wrapper around ``sklearn.model_selection.cross_val_score`` that
    takes the data first and the estimator last.

    Args:
        data: Feature matrix handed to ``cross_val_score`` as ``X``.
        target: Labels handed to ``cross_val_score`` as ``y``.
        classifier: Estimator to evaluate.
        cv: Cross-validation setting forwarded unchanged (defaults to 5).

    Returns:
        Whatever ``cross_val_score`` returns (per-fold scores).
    """
    score_folds = sklearn.model_selection.cross_val_score
    fold_scores = score_folds(classifier, data, target, cv=cv)
    return fold_scores

def test_classifier(X, y, clf, test_size=0.4, y_names=None):
    # train-test split
    print('test size is: %2.0f%%' % (test_size * 100))
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=test_size)
    clf.fit(X_train, y_train)
    y_predicted = clf.predict(X_test)
    print ("accuracy before cross-validation is %0.2f" % accuracy_score(y_test, y_predicted))
    scores = cross_validation(X, y, clf, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print('Classification report:')
    print(sklearn.metrics.classification_report(y_test, y_predicted, target_names=y_names))

In [13]:
# Hold out 20% of the documents and label the report rows with the newsgroup names.
test_classifier(
    X,
    y,
    clf,
    test_size=0.2,
    y_names=newsgroups.target_names,
)


test size is: 20%
accuracy before cross-validation is 0.90
Accuracy: 0.90 (+/- 0.02)
Classification report:
                  precision    recall  f1-score   support

    misc.forsale       0.88      0.77      0.83       195
 rec.motorcycles       0.95      0.94      0.95       215
rec.sport.hockey       0.92      0.98      0.95       197
       sci.crypt       0.85      0.96      0.90       205
 sci.electronics       0.91      0.84      0.87       178
       sci.space       0.90      0.91      0.91       197

        accuracy                           0.90      1187
       macro avg       0.90      0.90      0.90      1187
    weighted avg       0.90      0.90      0.90      1187



Another example, this time using the entire dataset (all newsgroups) with its predefined train/test split

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

# Training partition: raw post texts and their integer labels.
newsgroups_train = fetch_20newsgroups(subset='train')
X_train, y_train = newsgroups_train.data, newsgroups_train.target

# Test partition, loaded the same way.
newsgroups_test = fetch_20newsgroups(subset='test')
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Raw text -> token counts -> TF-IDF weights -> k-nearest-neighbours classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])
text_clf.fit(X_train, y_train)

# Score the fitted pipeline on the test posts.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.43      0.76      0.55       319
           1       0.50      0.61      0.55       389
           2       0.56      0.57      0.57       394
           3       0.53      0.58      0.56       392
           4       0.59      0.56      0.57       385
           5       0.69      0.60      0.64       395
           6       0.58      0.45      0.51       390
           7       0.75      0.69      0.72       396
           8       0.84      0.81      0.82       398
           9       0.77      0.72      0.74       397
          10       0.85      0.84      0.84       399
          11       0.76      0.84      0.80       396
          12       0.70      0.50      0.58       393
          13       0.82      0.49      0.62       396
          14       0.79      0.76      0.78       394
          15       0.75      0.76      0.76       398
          16       0.70      0.73      0.72       364
          17       0.62    