# This code is based on http://scikit-learn.org/dev/tutorial/text_analytics/working_with_text_data.html
# It requires scikit-learn 0.18.dev0, which can be installed with the following command in a terminal:
## pip install git+git://github.com/scikit-learn/scikit-learn.git

### Importing required packages

In [78]:
import cPickle as pickle
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [80]:
# Global variables
# Newsgroup categories requested from fetch_20newsgroups by the data helpers.
CATEGORIES = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
# Hyper-parameter grid handed to GridSearchCV. Keys follow the
# "<pipeline step name>__<parameter>" form, matching the step names
# ('vect', 'tfidf', 'clf') used by the pipelines in this file.
PARAMETERS = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

### Get data functions

In [52]:
def get_train_data():
    """Return the training split of the 20 newsgroups data set.

    The split (restricted to ``CATEGORIES``) is cached on disk as
    ``twenty_train.p`` in the current working directory. When that cache can
    be read it is returned as-is; otherwise the data is fetched with
    ``fetch_20newsgroups`` and the cache file is (re)written.

    Returns:
        The object returned by ``fetch_20newsgroups(subset='train', ...)``,
        or the object previously pickled into the cache file.
    """
    cache_path = "twenty_train.p"
    try:
        # pickle.load needs an open binary file object; passing the file name
        # always raised, so the cache was never actually used.
        # NOTE: unpickling can execute arbitrary code -- only keep cache files
        # that this script wrote itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except (IOError, OSError, EOFError, pickle.UnpicklingError):
        # Cache missing or unreadable: fall through and rebuild it.
        pass

    twenty_train = fetch_20newsgroups(subset='train', categories=CATEGORIES, shuffle=True, random_state=42)
    # The context manager guarantees the cache file is flushed and closed.
    with open(cache_path, "wb") as cache_file:
        pickle.dump(twenty_train, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    return twenty_train

In [59]:
def get_test_data():
    """Return the test split of the 20 newsgroups data set.

    The split (restricted to ``CATEGORIES``) is cached on disk as
    ``twenty_test.p`` in the current working directory. When that cache can
    be read it is returned as-is; otherwise the data is fetched with
    ``fetch_20newsgroups`` and the cache file is (re)written.

    Returns:
        The object returned by ``fetch_20newsgroups(subset='test', ...)``,
        or the object previously pickled into the cache file.
    """
    cache_path = "twenty_test.p"
    try:
        # pickle.load needs an open binary file object; passing the file name
        # always raised, so the cache was never actually used.
        # NOTE: unpickling can execute arbitrary code -- only keep cache files
        # that this script wrote itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)
    except (IOError, OSError, EOFError, pickle.UnpicklingError):
        # Cache missing or unreadable: fall through and rebuild it.
        pass

    twenty_test = fetch_20newsgroups(subset='test',
         categories=CATEGORIES, shuffle=True, random_state=42)
    # The context manager guarantees the cache file is flushed and closed.
    with open(cache_path, "wb") as cache_file:
        pickle.dump(twenty_test, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    return twenty_test

### Naive Bayes (multinomial) 

In [68]:
def pipeline_multinomialNB():
    """Train a multinomial naive Bayes text classifier and predict the test split.

    Returns:
        A ``(text_clf, predicted, twenty_test)`` tuple: the fitted pipeline,
        its predictions for ``twenty_test.data``, and the test data object.
    """
    # Load both splits first.
    twenty_train = get_train_data()
    twenty_test = get_test_data()

    # vectorizer => tf-idf transformer => classifier
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
    # Build and fit in one expression; the fitted pipeline is what we keep.
    text_clf = Pipeline(steps).fit(twenty_train.data, twenty_train.target)

    predicted = text_clf.predict(twenty_test.data)
    return text_clf, predicted, twenty_test

### Linear Support Vector Machine (SVM)

In [69]:
def pipeline_svm():
    """Train a linear SVM text classifier (SGD, hinge loss) and predict the test split.

    Returns:
        A ``(text_clf, predicted, twenty_test)`` tuple: the fitted pipeline,
        its predictions for ``twenty_test.data``, and the test data object.
    """
    # Load both splits first.
    twenty_train = get_train_data()
    twenty_test = get_test_data()

    # NOTE(review): `n_iter` is the parameter name in the scikit-learn release
    # this script targets; later releases appear to use `max_iter` instead --
    # confirm before upgrading.
    svm = SGDClassifier(loss='hinge', penalty='l2',
                        alpha=1e-3, n_iter=5, random_state=42)
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', svm),
    ]
    text_clf = Pipeline(steps)

    # Fit in place, then predict the held-out documents.
    text_clf.fit(twenty_train.data, twenty_train.target)
    predicted = text_clf.predict(twenty_test.data)

    return text_clf, predicted, twenty_test

### Grid search (to get optimal parameters)

In [73]:
def grid_search(text_clf):
    """Grid-search ``PARAMETERS`` for a pipeline and print the best setting.

    The search is fitted on the first 400 training documents only. The best
    parameter combination is printed one ``name: value`` line per parameter
    (sorted by name), followed by its cross-validated score.

    Args:
        text_clf: estimator pipeline whose steps are named ``vect``,
            ``tfidf`` and ``clf`` (the prefixes used by the keys of
            ``PARAMETERS``).

    Returns:
        None; the results are printed.
    """
    # Only the training split is needed here; the test split is not loaded.
    twenty_train = get_train_data()

    gs_clf = GridSearchCV(text_clf, PARAMETERS, n_jobs=-1)
    gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

    # best_params_ / best_score_ are the supported way to read the winning
    # combination; the older grid_scores_ attribute was deprecated and later
    # removed from scikit-learn.
    best_parameters = gs_clf.best_params_
    for param_name in sorted(best_parameters):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(gs_clf.best_score_)

### Evaluate accuracy

In [56]:
def eval_accuracy(predicted, twenty_test):
    """Print a classification report and confusion matrix; return the accuracy.

    Args:
        predicted: predicted class indices, compared element-wise with
            ``twenty_test.target``.
        twenty_test: object exposing ``target`` (true class indices) and
            ``target_names`` (class labels used in the report).

    Returns:
        The mean of ``predicted == twenty_test.target``, i.e. the fraction of
        predictions that match the true labels. Previously this value was
        computed and discarded.
    """
    accuracy = np.mean(predicted == twenty_test.target)

    print(metrics.classification_report(twenty_test.target, predicted,
                                        target_names=twenty_test.target_names))

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(metrics.confusion_matrix(twenty_test.target, predicted))

    return accuracy

# Running the application

In [87]:
# Run for multinomial naive bayes
# Fits the naive Bayes pipeline and keeps the fitted pipeline, its test-set
# predictions and the test data object.
text_clf_mNB, predicted_mNB, twenty_test_mNB = pipeline_multinomialNB()

In [71]:
# Print the classification report and confusion matrix for the naive Bayes predictions.
eval_accuracy(predicted_mNB, twenty_test_mNB)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502

[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]


In [88]:
# Grid-search PARAMETERS for the naive Bayes pipeline and print the best combination and its score.
grid_search(text_clf_mNB)

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)
0.93




In [91]:
# Run for SVM
# Fits the linear SVM pipeline and keeps the fitted pipeline, its test-set
# predictions and the test data object.
text_clf_svm, predicted_svm, twenty_test_svm = pipeline_svm()

In [85]:
# Print the classification report and confusion matrix for the SVM predictions.
eval_accuracy(predicted_svm, twenty_test_svm)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

[[258  11  15  35]
 [  4 379   3   3]
 [  5  33 355   3]
 [  5  10   4 379]]


In [90]:
# Grid-search PARAMETERS for the SVM pipeline and print the best combination and its score.
grid_search(text_clf_svm)

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
0.9


