In [1]:
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 Newsgroups corpus; later cells read
# `twenty_train.data` (documents) and `twenty_train.target` (labels).
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes baseline: token counts over the (1, 2) n-gram range, re-weighted
# by TF-IDF, then a multinomial naive Bayes classifier with alpha = 0.01.
# `fit` returns the fitted pipeline, so construction and training are chained.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', MultinomialNB(alpha=0.01)),
    ]
).fit(twenty_train.data, twenty_train.target)

In [3]:
import numpy as np

# Score the pipeline currently bound to `text_clf` on the held-out test split.
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
)
predicted = text_clf.predict(twenty_test.data)

# Fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.8344397238449283

In [4]:
from sklearn.model_selection import cross_validate
from pandas import DataFrame

# Metrics reported for every fold; this list is reused by the later
# cross-validation cells, so its name and order are kept as-is.
scoring = [
    'accuracy',
    'precision_micro',
    'precision_macro',
    'f1_micro',
    'f1_macro',
    'recall_micro',
    'recall_macro',
]

# 5-fold cross-validation on the training split of whatever pipeline
# `text_clf` currently refers to (the naive Bayes one when run in order).
scores = cross_validate(
    text_clf,
    twenty_train.data,
    twenty_train.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Remove the timing entries so only the per-fold test metrics are tabulated.
del scores['fit_time'], scores['score_time']
DataFrame(scores)

Unnamed: 0,test_accuracy,test_precision_micro,test_precision_macro,test_f1_micro,test_f1_macro,test_recall_micro,test_recall_macro
0,0.912737,0.912737,0.915551,0.912737,0.911285,0.912737,0.90989
1,0.9188,0.9188,0.922393,0.9188,0.917796,0.9188,0.916215
2,0.913428,0.913428,0.916081,0.913428,0.912767,0.913428,0.911468
3,0.907522,0.907522,0.908116,0.907522,0.905947,0.907522,0.905193
4,0.918847,0.918847,0.923991,0.918847,0.919455,0.918847,0.917235


In [5]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with stochastic gradient descent on TF-IDF features.
# SGDClassifier shuffles the training data while fitting, so without a fixed
# seed every run (and every cross-validation fold fitted from this pipeline)
# gives slightly different scores. The seed is pinned to make the notebook
# reproducible under Restart & Run All; a re-run may therefore differ slightly
# from output recorded by an earlier, unseeded run.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the held-out test split.
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)



0.8507700477960701

In [6]:
# 5-fold cross-validation on the training split of whatever pipeline
# `text_clf` currently refers to (the SGD one when cells are run in order).
scores = cross_validate(
    text_clf,
    twenty_train.data,
    twenty_train.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Remove the timing entries so only the per-fold test metrics are tabulated.
del scores['fit_time'], scores['score_time']
DataFrame(scores)



Unnamed: 0,test_accuracy,test_precision_micro,test_precision_macro,test_f1_micro,test_f1_macro,test_recall_micro,test_recall_macro
0,0.922433,0.922433,0.924013,0.922433,0.920947,0.922433,0.919524
1,0.921889,0.921889,0.922882,0.921889,0.920343,0.921889,0.919144
2,0.922703,0.922703,0.922956,0.922703,0.921351,0.922703,0.920417
3,0.916372,0.916372,0.915954,0.916372,0.914375,0.916372,0.913778
4,0.927273,0.927273,0.930068,0.927273,0.92699,0.927273,0.925482


In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings on TF-IDF weighted token counts.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression()),
    ]
)
text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the held-out test split.
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
(predicted == twenty_test.target).mean()

0.8279341476367499

In [8]:
# 5-fold cross-validation on the training split of whatever pipeline
# `text_clf` currently refers to (logistic regression when run in order).
scores = cross_validate(
    text_clf,
    twenty_train.data,
    twenty_train.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Remove the timing entries so only the per-fold test metrics are tabulated.
del scores['fit_time'], scores['score_time']
DataFrame(scores)

Unnamed: 0,test_accuracy,test_precision_micro,test_precision_macro,test_f1_micro,test_f1_macro,test_recall_micro,test_recall_macro
0,0.891141,0.891141,0.895744,0.891141,0.887674,0.891141,0.885326
1,0.889232,0.889232,0.895564,0.889232,0.886514,0.889232,0.883812
2,0.886042,0.886042,0.88967,0.886042,0.883401,0.886042,0.880874
3,0.897345,0.897345,0.900421,0.897345,0.893378,0.897345,0.8913
4,0.896231,0.896231,0.903932,0.896231,0.893332,0.896231,0.890157


In [9]:
from sklearn.svm import LinearSVC

# Linear support vector classifier on TF-IDF weighted token counts.
# When LinearSVC solves the dual problem, its coordinate-descent solver
# shuffles the data using `random_state`; the seed is pinned so the reported
# accuracy (and any cross-validation of this pipeline) is reproducible. A
# re-run may differ slightly from output recorded by an earlier, unseeded run.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the held-out test split.
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8531598513011153

In [10]:
# 5-fold cross-validation on the training split of whatever pipeline
# `text_clf` currently refers to (the LinearSVC one when run in order).
scores = cross_validate(
    text_clf,
    twenty_train.data,
    twenty_train.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Remove the timing entries so only the per-fold test metrics are tabulated.
del scores['fit_time'], scores['score_time']
DataFrame(scores)

Unnamed: 0,test_accuracy,test_precision_micro,test_precision_macro,test_f1_micro,test_f1_macro,test_recall_micro,test_recall_macro
0,0.918466,0.918466,0.920199,0.918466,0.917537,0.918466,0.916203
1,0.929391,0.929391,0.930344,0.929391,0.928479,0.929391,0.9276
2,0.924028,0.924028,0.92443,0.924028,0.923213,0.924028,0.922575
3,0.923894,0.923894,0.923607,0.923894,0.922353,0.923894,0.921715
4,0.929047,0.929047,0.932648,0.929047,0.929663,0.929047,0.928141


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on TF-IDF weighted token counts.
# The tree learner permutes candidate features at each split, so ties between
# equally good splits are broken differently from run to run unless the seed
# is fixed. It is pinned so the reported accuracy (and any cross-validation of
# this pipeline) is reproducible; a re-run may differ slightly from output
# recorded by an earlier, unseeded run.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier(random_state=42)),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# Accuracy on the held-out test split.
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.5507169410515136

In [12]:
# 5-fold cross-validation on the training split of whatever pipeline
# `text_clf` currently refers to (the decision tree one when run in order).
scores = cross_validate(
    text_clf,
    twenty_train.data,
    twenty_train.target,
    scoring=scoring,
    cv=5,
    return_train_score=False,
)

# Remove the timing entries so only the per-fold test metrics are tabulated.
del scores['fit_time'], scores['score_time']
DataFrame(scores)

Unnamed: 0,test_accuracy,test_precision_micro,test_precision_macro,test_f1_micro,test_f1_macro,test_recall_micro,test_recall_macro
0,0.623623,0.623623,0.625865,0.623623,0.621861,0.623623,0.620161
1,0.634598,0.634598,0.63357,0.634598,0.631555,0.634598,0.631244
2,0.624117,0.624117,0.625161,0.624117,0.620618,0.624117,0.619329
3,0.630088,0.630088,0.627073,0.630088,0.625576,0.630088,0.625125
4,0.639024,0.639024,0.635087,0.639024,0.63309,0.639024,0.634365
