## Extracting features from text files

### Bags of Words

In [19]:
from sklearn.datasets import load_files

# Only these four newsgroups are loaded from the on-disk corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# No `encoding` is given, so (per the load_files docs) the documents stay as
# raw bytes; decoding is left to the vectorizer downstream.
twenty_train = load_files(
    "./data/twenty_newsgroups/20news-bydate-train",
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Sanity check: class names, then document and file counts (should match).
print(twenty_train.target_names)
print(len(twenty_train.data))
print(len(twenty_train.filenames))

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
2257
2257


In [20]:
# Map the first ten integer labels back to their newsgroup names.
for label_idx in twenty_train.target[:10]:
    print(twenty_train.target_names[label_idx])

comp.graphics
alt.atheism
sci.med
sci.med
alt.atheism
comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
sci.med


### Tokenizing text with scikit-learn

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the vocabulary and the document-term count matrix in one pass.
# decode_error='ignore' tells the vectorizer not to raise on bytes it cannot
# decode.
cv = CountVectorizer(decode_error='ignore')
x_train_counts = cv.fit_transform(twenty_train.data)

# (n_documents, n_vocabulary_terms)
print(x_train_counts.shape)

(2257, 35787)


In [26]:
# Column index assigned to the token 'algorithm' in the fitted vocabulary.
cv.vocabulary_.get('algorithm')

4690

### From occurrences to frequencies

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer
# With use_idf=False the transformer produces term frequencies only
# (no inverse-document-frequency reweighting).
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(x_train_counts)

x_train_tf = tf_transformer.transform(x_train_counts)
x_train_tf.shape

(2257, 35787)

In [34]:
# Default settings (idf enabled); fit and transform the counts in one call.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

# Same shape as the count matrix: only the values are reweighted.
x_train_tfidf.shape

(2257, 35787)

## Training a classifier

In [40]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier trained on the tf-idf features.
clf = MultinomialNB()
clf.fit(x_train_tfidf, twenty_train.target)

# New documents go through the already-fitted vectorizer and transformer
# (transform only, no refit) so they share the training feature space.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
x_new_counts = cv.transform(docs_new)
x_new_tfidf = tfidf_transformer.transform(x_new_counts)

predicted = clf.predict(x_new_tfidf)

# Report each document next to the name of its predicted newsgroup.
for doc, category in zip(docs_new, predicted):
    print("{} => {}".format(doc, twenty_train.target_names[category]))


God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


In [39]:
# Show the repr of the count matrix produced by cv.transform(docs_new).
x_new_counts

<2x35787 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

## Building a pipeline

In [45]:
from sklearn.pipeline import Pipeline

# Bundle vectorizer -> tf-idf -> classifier so raw documents can be passed
# straight to fit() / predict().
text_clf = Pipeline([
    ('vect', CountVectorizer(decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# The pipeline takes the raw documents, not a precomputed count matrix.
text_clf.fit(twenty_train.data, twenty_train.target)



AttributeError: lower not found

## Evaluation of the performance on the test set

In [46]:
import numpy as np

# Load the held-out split with the same categories and shuffle seed as the
# training split.
twenty_test = load_files(
    "./data/twenty_newsgroups/20news-bydate-test",
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label matches.
np.mean(predicted == twenty_test.target)



0.8348868175765646

In [53]:
from sklearn.linear_model import SGDClassifier

# Support vector machine (SVM): same feature steps as before, with the
# classifier swapped for SGDClassifier using hinge loss and an L2 penalty.
text_clf = Pipeline([
    ("vect", CountVectorizer(decode_error='ignore')),
    ("tfidf", TfidfTransformer()),
    (
        "clf",
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Accuracy on the held-out documents.
np.mean(predicted == twenty_test.target)

0.9081225033288948

In [56]:
from sklearn import metrics

# Per-class precision / recall / F1 for the most recent predictions.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

# Confusion matrix computed from the same true and predicted labels.
print(metrics.confusion_matrix(twenty_test.target, predicted))

                        precision    recall  f1-score   support

           alt.atheism       0.96      0.79      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.89      0.95      0.92       398

           avg / total       0.91      0.91      0.91      1502

[[251  11  17  40]
 [  1 382   3   3]
 [  4  37 351   4]
 [  5  11   2 380]]


In [57]:
# Peek at the integer-encoded labels of the test documents.
twenty_test.target

array([3, 1, 1, ..., 2, 3, 3])

## Parameter tuning using grid search

In [60]:
from sklearn.model_selection import GridSearchCV

# Search grid. Keys follow the "<pipeline step>__<parameter>" convention, so
# each entry targets one step of `text_clf` (2 x 2 x 2 = 8 candidates).
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# n_jobs=-1 makes the grid search detect how many cores are installed and
# use them all.
# NOTE: the `iid` argument is intentionally not passed. It was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError;
# current releases always average fold scores without weighting, which is
# what iid=False used to request.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

# Fit on the first 400 training documents only.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

# After fitting, the search object predicts like a regular estimator.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]


'alt.atheism'

In [67]:
# Best mean cross-validated score found by the search.
print(gs_clf.best_score_)

# Winning value for every tuned parameter, listed alphabetically.
for param_name in sorted(parameters):
    print("{}: {}".format(param_name, gs_clf.best_params_[param_name]))

# The full per-candidate breakdown is available as gs_clf.cv_results_.

0.9200570401625253
clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 2)
