# Experiments with the LIME interpretation model
Data was downloaded [on kaggle](https://www.kaggle.com/c/word2vec-nlp-tutorial/download/labeledTrainData.tsv.zip). 


In [35]:
import pandas as pd
df = pd.read_csv('./labeledTrainData.tsv', sep='\t')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


## Text preprocessing

Here I simply replace rare words with an "oov" (out-of-vocabulary) tag. This step is crucial as rare words (tipically less than 10 occurences) do not appear often enough for the model to learn pattern with it, and they increase the vector space representation dimension dramatically with most text vectorization method (such as bag-of-words or TF-IDF).

In [107]:
from collections import defaultdict
from sklearn.model_selection import train_test_split

texts = [t.lower() for t in df['review'].values]
y = df['sentiment'].values

# 1) Compute word frequencies
all_tokens = [token for t in texts for token in t.split()] 
frequencies = defaultdict(int)
for w in all_tokens:
    frequencies[w] += 1
print('Vocabulary size before rare words tagging:', len(set(all_tokens)))

# 2) Replace words occuring less than 20 times in the corpus with an out-of-vocabulary tag 
texts = [' '.join([w if frequencies[w] > 20 else '<oov>' 
                   for w in t.split()])
         for t in texts]
print('Vocabulary size after rare words tagging:', len(set([token for t in texts for token in t.split()])))

# 3) Split train and test sets (should be done before rare words tagging in real world applications)
texts_train, texts_test, y_train, y_test = train_test_split(texts, y)

Vocabulary size before rare words tagging: 257663
Vocabulary size after rare words tagging: 15351


## Keras tests

In [1]:
from keras.preprocessing.text import Tokenizer
#from sklearn. import TransformerMixin

class sklearn_tokenizer(Tokenizer, TransformerMixin):
    def __init__(self, **kwargs):
        super().__init__(kwargs)
        self.fit = super().fit_on_texts
        self.transform = super().texts_to_sequences
        
tokenizer = Tokenizer()
tokenizer.fit_on_texts(['bla bli', 'bli blu ble'])
tokenizer.texts_to_sequences(['blu bli', 'bli blu'])



Using Theano backend.


NameError: name 'TransformerMixin' is not defined

In [5]:
all_tokens = [token for t in texts for token in t.split()] 
words = set(all_tokens)
print('Lexicon size:', len(words))
words_indices = dict((w, i+1) for i, w in enumerate(words))
indices_words = dict((i+1, w) for i, w in enumerate(words))

maxlen = 128
max_features = len(words) + 1
n_samples = len(texts)
batch_size = 32
x = np.zeros((n_samples, maxlen), dtype=np.int64)

for i, text in enumerate(np.append(texts_train, texts_test)):
    for t, word in enumerate(text.split()[-maxlen:]):
        x[i, (maxlen-1-t)] = words_indices[word]

x_train = x[:len(texts_train)]
x_test = x[len(texts_train):]

Lexicon size: 4315


In [8]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Bidirectional
from keras.layers import LSTM
from keras import regularizers
from keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    model = Sequential()
    model.add(Embedding(max_features, 128))
    model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
    model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

sklearn_lstm = KerasClassifier(build_fn=create_model, epochs=1, batch_size=batch_size, verbose=0)

In [9]:
sklearn_lstm.fit(x_train[:1000, :], y_train[:1000])

<keras.callbacks.History at 0x116721390>

In [6]:
model = create_model()
print('Train...')
model.fit(x_train[:1000, :], y_train[:1000],
          batch_size=batch_size,
          epochs=1,
          validation_data=(x_test[:100, :], y_test[:100]), 
          verbose=2, );

#score, acc = model.evaluate(x_test, y_test,
#                            batch_size=batch_size)
#print('Test score:', score)
#print('Test accuracy:', acc)

Using Theano backend.


Train...
Train on 1000 samples, validate on 100 samples
Epoch 1/1
20s - loss: 0.6924 - acc: 0.5320 - val_loss: 0.7116 - val_acc: 0.4200


In [60]:
sklearn_clf.fit(x_train[:1000, :], y_train[:1000])

<keras.callbacks.History at 0x143277550>

In [62]:
y_pred = sklearn_clf.predict(x_test[:1000, :])

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test[:1000])

0.502

## CNN

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

# Define networks parameters
embedding_dims = 64
batch_size = 32
filters = 40
kernel_size = 3
hidden_dims = 16
epochs = 20

def create_model():
    model = Sequential()
    model.add(Embedding(len(words)+1,
                        embedding_dims,
                        input_length=maxlen))
    model.add(Dropout(0.5))

    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1,))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    return model

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_test, y_test))


model = create_model()
train_and_evaluate_model(model, x_train, y_train, x_test, y_test)


In [21]:
import spacy
nlp = spacy.load('en', create_pipeline=create_pipeline)

corpus = nlp(df.loc[0, 'review'])

corpus.vector

## LSTM

In [None]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

def create_model():
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(Bidirectional(LSTM(64)))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    return model

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=1,
          validation_data=[X_test, y_test],
         verbose=2)