In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
#from sklearn.model_selection import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

#Models
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD

In [2]:
path = 'data'
dataset = load_files(path)

In [3]:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                         dataset.target,
                                                         test_size=0.25)

In [4]:
# Build a vectorizer/classifier pipeline

In [78]:
text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB())
    ])

In [75]:
# Build a grid search to find best parameters

In [79]:
parameters = {'vect__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
             'tfidf__use_idf': (True, False),
             'clf__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [80]:
_ = gs_clf.fit(docs_train, y_train)

In [81]:
# Get the best parameters

In [82]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key = lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))

clf__alpha: 0.001
tfidf__use_idf: False
vect__ngram_range: (1, 1)


In [83]:
# Generate predictions

In [84]:
y_predicted = gs_clf.predict(docs_test)

In [85]:
# Model Metrics

In [86]:
print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))

             precision    recall  f1-score   support

          1       0.81      0.93      0.86        27
          2       0.31      0.40      0.35        10
          3       1.00      0.20      0.33         5
          4       0.91      0.71      0.80        14

avg / total       0.76      0.71      0.71        56



In [66]:
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

[[25  1  0  1]
 [ 6  4  0  0]
 [ 0  4  1  0]
 [ 0  4  0 10]]
