In [11]:
from sklearn import datasets, model_selection, feature_extraction, naive_bayes, svm, pipeline, metrics
import nltk

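Define the list of stop words and tokenize each one with NLTK, so that the stop words match the tokens the vectorizer will see (e.g. `n't`, `'ll`).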

In [12]:
# Run every NLTK English stop word through the NLTK tokenizer and collect the
# resulting pieces in a set, so each distinct token is kept exactly once.
stop_words = {
    token
    for stop_word in nltk.corpus.stopwords.words('english')
    for token in nltk.word_tokenize(stop_word)
}
# Peek at a few entries; a set is unordered, so which 15 are shown is arbitrary.
list(stop_words)[:15]

['all',
 'while',
 "n't",
 'having',
 'the',
 'doing',
 'wouldn',
 'itself',
 'if',
 'over',
 'hers',
 'than',
 "'ll",
 'we',
 'it']

In [13]:
# 20 Newsgroups corpus, downloaded from http://qwone.com/~jason/20Newsgroups/
# load_files uses each sub-folder name of the given directory as a category label.
news_data = datasets.load_files(
    "../large_files/20_newsgroups",
    shuffle=True,
    encoding='ISO-8859-1',
)
# Category names, in the order of the integer ids stored in news_data.target.
news_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [14]:
# Hold out 33% of the documents for evaluation. A fixed random_state makes the
# split (and therefore every score computed from it) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    news_data.data,
    news_data.target,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Text-classification pipeline: n-gram counts -> TF-IDF weighting -> linear SVM.
model = pipeline.Pipeline([
    ('counts', feature_extraction.text.CountVectorizer(
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the term must occur in at least 2 documents
        ngram_range=(1, 2),  # unigrams and bigrams
        # scikit-learn validates stop_words as 'english', a list, or None, so
        # the set is converted to a (sorted, hence deterministic) list.
        stop_words=sorted(stop_words)
    )),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    #('naivebayes', naive_bayes.MultinomialNB()),
    ('svm', svm.LinearSVC()), # much better than NB
])

In [16]:
# Pipeline.fit returns the fitted pipeline itself, so training on the training
# split and predicting labels for the held-out split can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

Visualizing with Pandas

In [17]:
import pandas as pd

# Side-by-side table of predicted vs. true label ids, plus a per-row flag
# telling whether the prediction matched.
df = pd.DataFrame({"pred": y_pred, "test": y_test}).assign(
    same=lambda frame: frame["pred"] == frame["test"]
)
df.head(20)

Unnamed: 0,pred,test,same
0,17,17,True
1,18,18,True
2,18,16,False
3,10,10,True
4,13,13,True
5,6,6,True
6,11,11,True
7,4,4,True
8,1,1,True
9,6,6,True


Comparing with NumPy

In [18]:
import numpy as np

# Accuracy: the fraction of test documents whose predicted label equals the
# true label (mean of the element-wise boolean comparison).
np.equal(y_pred, y_test).mean()

0.9418181818181818

In [19]:
# Per-class precision / recall / F1 / support, labelled with the newsgroup
# names instead of the integer class ids. The report is a plain-text table.
tsv = metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=news_data.target_names,
)
print(tsv)

                          precision    recall  f1-score   support

             alt.atheism       0.78      0.81      0.80       305
           comp.graphics       0.96      0.97      0.96       321
 comp.os.ms-windows.misc       0.95      0.97      0.96       307
comp.sys.ibm.pc.hardware       0.97      0.97      0.97       332
   comp.sys.mac.hardware       0.99      0.97      0.98       341
          comp.windows.x       0.98      0.98      0.98       344
            misc.forsale       0.97      0.97      0.97       357
               rec.autos       0.99      0.99      0.99       336
         rec.motorcycles       0.98      0.99      0.99       328
      rec.sport.baseball       1.00      1.00      1.00       311
        rec.sport.hockey       1.00      0.99      1.00       343
               sci.crypt       1.00      0.99      0.99       349
         sci.electronics       0.97      0.97      0.97       333
                 sci.med       0.98      0.98      0.98       333
         

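With `GridSearchCV` it is possible to find the parameter combination that yields the best accuracy — in this case trying 2 possible values of `ngram_range` in 'counts' and 2 possible values of `alpha` in 'naivebayes'. Note that `naivebayes__alpha` is only a valid parameter for a pipeline whose final step is named 'naivebayes' (i.e. the `MultinomialNB` classifier rather than the SVM).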

In [None]:
# The grid below tunes 'naivebayes__alpha', but `model` ends with the 'svm'
# step (its 'naivebayes' step is commented out), so searching `model` directly
# fails with an invalid-parameter error. Search a Naive Bayes variant instead:
# the same feature-extraction steps followed by a MultinomialNB step named
# 'naivebayes'. GridSearchCV clones the estimator it is given, so the
# already-fitted steps of `model` are left untouched.
nb_model = pipeline.Pipeline(
    model.steps[:-1] + [('naivebayes', naive_bayes.MultinomialNB())]
)
grid_search_model = model_selection.GridSearchCV(
    nb_model,
    {
        'counts__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
        'naivebayes__alpha': (0.1, 3.0)  # smoothing strength of MultinomialNB
    },
    n_jobs=-1  # use all available CPU cores
)
grid_search_model.fit(X_train, y_train)

In [None]:
# cv_results_ is a dict of parallel arrays (one entry per parameter
# combination); a DataFrame sorted by rank is far easier to read than the
# printed dict. Relies on `pd` imported in the earlier pandas cell.
cv_results = pd.DataFrame(grid_search_model.cv_results_)
cv_results[
    ["params", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values("rank_test_score")