# Using eli_5 with sklearn

## Loading Data

In [0]:
from sklearn.datasets import fetch_20newsgroups

In [0]:
categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

In [0]:
twenty_train = fetch_20newsgroups(
    subset='train', categories = categories,
    shuffle=True,
    random_state=42
)

In [0]:
twenty_test = fetch_20newsgroups(
    subset='test', categories = categories,
    shuffle=True,
    random_state=42
)

## Make Pipeline

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline

In [0]:
vec = CountVectorizer()
clf = LogisticRegressionCV(verbose=True, n_jobs=-1, solver='lbfgs')
pipe = make_pipeline(vec, clf)

In [7]:
pipe.fit(twenty_train.data, twenty_train.target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logisticregressioncv',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class

## Test predictions

In [0]:
from sklearn import metrics

In [0]:
def print_report(pipe):
    y_test = twenty_test.target
    y_pred = pipe.predict(twenty_test.data)
    report = metrics.classification_report(y_test, y_pred, target_names=twenty_test.target_names)
    print(report)
    print("Accuracy: - %0.2f" %(metrics.accuracy_score(y_test, y_pred)))

In [10]:
print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.91      0.81      0.85       319
         comp.graphics       0.86      0.94      0.90       389
               sci.med       0.92      0.81      0.86       396
soc.religion.christian       0.88      0.98      0.92       398

              accuracy                           0.89      1502
             macro avg       0.89      0.89      0.89      1502
          weighted avg       0.89      0.89      0.89      1502

Accuracy: - 0.89


# Checking what eli5 wants to say

In [0]:
! pip install eli5

In [12]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import eli5

In [15]:
eli5.show_weights(clf, vec=vec, top=10, target_names=twenty_test.target_names)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.567,keith,,
+0.482,atheism,,
+0.472,mathew,,
+0.472,bible,,
+0.428,atheists,,
+0.416,okcforum,,
+0.392,writes,,
+0.375,benedikt,,
… 10203 more positive …,… 10203 more positive …,,
… 25576 more negative …,… 25576 more negative …,,

Weight?,Feature
+0.567,keith
+0.482,atheism
+0.472,mathew
+0.472,bible
+0.428,atheists
+0.416,okcforum
+0.392,writes
+0.375,benedikt
… 10203 more positive …,… 10203 more positive …
… 25576 more negative …,… 25576 more negative …

Weight?,Feature
+1.084,graphics
+0.827,<BIAS>
+0.508,images
+0.496,software
+0.495,file
+0.488,image
+0.487,files
+0.454,package
+0.450,card
… 14441 more positive …,… 14441 more positive …

Weight?,Feature
+0.589,information
+0.580,pitt
+0.579,doctor
+0.486,disease
+0.478,msg
+0.471,treatment
+0.417,health
+0.409,radford
… 13608 more positive …,… 13608 more positive …
… 22171 more negative …,… 22171 more negative …

Weight?,Feature
+1.244,<BIAS>
+0.706,rutgers
+0.644,church
+0.631,christians
+0.585,christian
+0.543,god
+0.510,christ
… 10292 more positive …,… 10292 more positive …
… 25487 more negative …,… 25487 more negative …
-0.562,posting


In [0]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec, target_names=twenty_test.target_names)

# Trying with Tf-idf

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
vec = TfidfVectorizer()
clf = LogisticRegressionCV()
pipe = make_pipeline(vec, clf)

In [0]:
pipe.fit(twenty_train.data, twenty_train.target)

In [21]:
print_report(pipe)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.85      0.90       319
         comp.graphics       0.90      0.97      0.93       389
               sci.med       0.95      0.91      0.93       396
soc.religion.christian       0.91      0.96      0.93       398

              accuracy                           0.93      1502
             macro avg       0.93      0.92      0.92      1502
          weighted avg       0.93      0.93      0.92      1502

Accuracy: - 0.93


In [0]:
eli5.show_prediction(clf, twenty_test.data[0], vec=vec, target_names=twenty_test.target_names, targets=['sci.med'])