## Mission

* Write a text classification pipeline using a custom preprocessor and CharNGramAnalyzer using data from Wikipedia articles as training set.

* Evaluate the performance on some held out test set.

* [solution](https://github.com/scikit-learn/scikit-learn/blob/master/doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py)

In [20]:
from sklearn.datasets import load_files

training_set_path = "./data/languages/paragraphs"
test_set_paht = "./data/languages/short_paragraphs"
training_set = load_files( training_set_path )
test_set = load_files( test_set_paht )

print( len( training_set.data ) )
print( training_set.target_names)

print( len( test_set.data ) )
print( test_set.target_names )

985
['ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru']
8805
['ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru']


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

## Naive Bayes Classicifier
text_clf_nb = Pipeline([
    ( 'vect', CountVectorizer(analyzer='char', decode_error='ignore') ),
    ( 'tfidf', TfidfTransformer() ),
    ( 'clf', MultinomialNB() )
])

text_clf_nb.fit( training_set.data, training_set.target )

## SVM Classicifier
text_clf_svm = Pipeline([
    ( 'vect', CountVectorizer(analyzer='char', decode_error='ignore') ),
    ( 'tfidf', TfidfTransformer() ),
    ( 'clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None) )
])

text_clf_svm.fit( training_set.data, training_set.target )

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [23]:
import numpy as np
from sklearn import metrics

def report( clf, test_set ):
    predicted = clf.predict( test_set.data )
    rst1 = np.mean( predicted == test_set.target )
    print( rst1 )
    rst2 = metrics.classification_report( test_set.target, predicted, target_names=test_set.target_names )
    print( rst2 )

report( text_clf_nb, test_set )
report( text_clf_svm, test_set )



0.49074389551391256
             precision    recall  f1-score   support

         ar       1.00      1.00      1.00       332
         de       0.34      0.94      0.50      1075
         en       0.28      0.77      0.42      1085
         es       0.75      0.34      0.47      1056
         fr       0.73      0.62      0.67      1054
         it       0.00      0.00      0.00      1019
         ja       0.00      0.00      0.00       580
         nl       1.00      0.02      0.04       590
         pl       1.00      0.17      0.28      1056
         pt       1.00      0.98      0.99       958

avg / total       0.59      0.49      0.43      8805

0.705167518455423
             precision    recall  f1-score   support

         ar       1.00      1.00      1.00       332
         de       0.66      0.79      0.72      1075
         en       0.65      0.51      0.57      1085
         es       0.58      0.62      0.60      1056
         fr       0.79      0.68      0.73      1054
    

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [25]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'vect__ngram_range': [(1,1), (1,2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
    'clf__max_iter': (5, 10),
}

gs_clf = GridSearchCV( text_clf_svm, parameters, cv=5, iid=False, n_jobs=-1 )

gs_clf = gs_clf.fit( training_set.data, training_set.target )

print( gs_clf.best_score_ )

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.9979687056943263
clf__alpha: 0.001
clf__max_iter: 10
tfidf__use_idf: True
vect__ngram_range: (1, 2)
