In [0]:
from tensorflow import keras
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the IMDB reviews (each review is a sequence of integer word ids) with
# their labels, plus the word -> rank table that accompanies the dataset.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
word_index = keras.datasets.imdb.get_word_index()

# Shift every rank up by 3 so that ids 0-2 stay free for the special tokens
# registered right after.
word_to_num = dict((word, rank + 3) for word, rank in word_index.items())
word_to_num.update({"<PAD>": 0, "<START>": 1, "<UNKNOWN>": 2})

# Inverse table (id -> word), used to turn encoded reviews back into text.
num_to_word = dict(zip(word_to_num.values(), word_to_num.keys()))

def _decode_review(encoded_review):
    """Turn one sequence of word ids back into a space-separated string.

    Ids that have no entry in ``num_to_word`` are skipped.  Membership is
    tested directly rather than comparing against ``len(num_to_word)``: the
    id space is not contiguous (word ranks are shifted by 3 while only ids
    0, 1 and 2 are assigned to special tokens, so with 1-based ranks id 3 has
    no entry).  The table length therefore equals the largest valid id, and a
    ``<`` comparison would silently drop that word while still letting a
    stray id 3 through to raise ``KeyError``.
    """
    return ' '.join(num_to_word[i] for i in encoded_review if i in num_to_word)


# Replace the integer-encoded reviews with plain text so they can be fed to a
# text vectorizer.
X_train = [_decode_review(review) for review in X_train]
X_test = [_decode_review(review) for review in X_test]

# Text-classification baseline: TF-IDF features over unigrams and bigrams
# feeding a logistic-regression classifier with default settings.
tfidf_step = ('tfidf', TfidfVectorizer(ngram_range=(1, 2)))
lr_step = ('lr', LogisticRegression())
model = Pipeline(steps=[tfidf_step, lr_step])

# Fit on the training reviews, then predict labels for the test reviews.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy is the fraction of predictions that equal the true label.
is_correct = y_pred == y_test
print('Test Accuracy: ', is_correct.mean())

# Sanity check on a single example: show the first test review, then its true
# label next to the model's prediction for it.
sample_review, sample_label = X_test[0], y_test[0]
print(sample_review)
print(sample_label, model.predict([sample_review]))