In [None]:
from __future__ import print_function

import os

import lime
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the SMS spam data set: each line is "<label>\t<message text>", where the
# label is expected to be 'ham' or 'spam'.
class_names = ['ham', 'spam']
# Map each label string to its integer class id (its index in class_names).
label_ids = {name: i for i, name in enumerate(class_names)}

X = []  # raw message texts
y = []  # integer class ids, aligned one-to-one with X
# NOTE(review): the file is decoded with the platform default encoding --
# confirm the data file's encoding and pass it explicitly if needed.
with open('data/SMSSpamCollection') as f:
    for line in f:
        # Split on the FIRST tab only, so a tab inside the message body does
        # not truncate the text; drop the trailing newline from the message.
        label, sep, text = line.rstrip('\n').partition('\t')
        if not sep or label not in label_ids:
            # Skip the whole row: appending the text without a label would
            # leave X and y with different lengths and shift every later label.
            print('ERROR: label not found')
            continue
        X.append(text)
        y.append(label_ids[label])

In [None]:
# Hold out 20% of the messages for evaluation. The split is seeded so that a
# fixed index into X_test (used later in the notebook) refers to the same
# message on every Restart & Run All.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# TF-IDF features with the original casing of each token kept (lowercase=False).
vectorizer = TfidfVectorizer(lowercase=False)
# Learn the vocabulary / IDF weights from the training messages only, then
# project the held-out messages into that same feature space.
train_vectors, test_vectors = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Train a 500-tree random forest on the TF-IDF vectors. The forest is seeded
# so that the reported score and predicted probabilities are reproducible.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(train_vectors, y_train)

In [None]:
# Predict labels for the held-out messages and report the binary F1 score
# (the cell's last expression, so the notebook displays it).
pred = rf.predict(test_vectors)
sklearn.metrics.f1_score(y_true=y_test, y_pred=pred, average='binary')

In [None]:
from sklearn.pipeline import make_pipeline
# Chain the already-fitted vectorizer and classifier into one estimator, so
# that `c.predict_proba` can be called directly on a list of raw text strings.
c = make_pipeline(vectorizer, rf)

In [None]:
# Sanity check: the pipeline takes a list of raw messages and returns one row
# of class probabilities per message.
first_message = [X_test[0]]
print(c.predict_proba(first_message))

In [None]:
from lime.lime_text import LimeTextExplainer
# Text explainer that labels its output with the human-readable class names.
# LIME explains a prediction by randomly perturbing the input text, so the
# explainer is seeded to make the explanation weights reproducible across runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# Explain the classifier's prediction for a single held-out message.
idx = 83
message = X_test[idx]

# Ask LIME for the 10 features that matter most to the prediction.
exp = explainer.explain_instance(message, c.predict_proba, num_features=10)

# Column 1 of predict_proba is the 'spam' class (class id 1).
spam_probability = c.predict_proba([message])[0, 1]
true_class = class_names[y_test[idx]]

print('Document id: %d' % idx)
print('Probability(spam) =', spam_probability)
print('True class: %s' % true_class)

In [None]:
# Display the explanation as a plain Python list (LIME documents the entries
# as (feature, weight) tuples).
exp.as_list()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Draw the explanation as a matplotlib figure; the figure handle is kept in
# `fig` for further use.
fig = exp.as_pyplot_figure()

In [None]:
# Render the explanation inside the notebook. Per the LIME docs, text=True
# also shows the message text with the explained words highlighted.
exp.show_in_notebook(text=True)

In [None]:
# Save the explanation as an HTML file. The output directory is created first
# so this cell also works when 'tmp/' does not exist yet (e.g. a fresh
# checkout); without it the write fails.
out_path = 'tmp/oi.html'
out_dir = os.path.dirname(out_path)
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
exp.save_to_file(out_path)