In [75]:
import re
import string
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [76]:
from keras.models import load_model

# File name of the serialized Keras model; resolved against the current
# working directory.
MODEL_PATH = 'model.h5'

# compile=True asks Keras to compile the model after loading it.
model = load_model(MODEL_PATH, compile=True)

In [77]:
# Print the layer-by-layer architecture of the loaded model as a sanity check
# that the expected network was restored.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 101, 128)          256000    
                                                                 
 bidirectional_1 (Bidirectio  (None, 101, 202)         185840    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 202)              0         
 balMaxPooling1D)                                                
                                                                 
 batch_normalization_1 (Batc  (None, 202)              808       
 hNormalization)                                                 
                                                                 
 dropout_3 (Dropout)         (None, 202)               0         
                                                      

In [78]:
# Restore the tokenizer object that was pickled to 'tokenizer.pickle'.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load a tokenizer file that comes from a trusted source.
with open('tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [79]:
def clean_text(text):
    '''Normalise raw text before stop-word removal and stemming.

    Steps, applied in this order:
      1. coerce the input to ``str`` and lowercase it,
      2. drop text enclosed in square brackets,
      3. drop ``http(s)://`` and ``www.`` links,
      4. drop HTML-like ``<...>`` tags,
      5. drop every character listed in ``string.punctuation``,
      6. drop newline characters (no space is inserted in their place),
      7. drop every word that contains a digit.

    Whitespace left behind by removed spans is not collapsed, so the result
    may contain consecutive spaces.

    Args:
        text: Value to clean; non-string values are converted with ``str()``.

    Returns:
        The cleaned, lowercased string.
    '''
    # Raw string literals keep regex escapes such as \[ \S \w \d from being
    # interpreted as (invalid) Python string escapes, which recent Python
    # versions warn about. The compiled patterns are unchanged.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)              # [bracketed text]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # links
    text = re.sub(r'<.*?>+', '', text)               # <tags>
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)                    # newlines are deleted, not spaced
    text = re.sub(r'\w*\d\w*', '', text)             # words containing digits
    return text

In [80]:
stop_words = stopwords.words('english')


def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stop_words``.

    The text is split on single spaces and the surviving tokens are joined
    back with single spaces, so tokens are compared exactly as written
    (callers are expected to lowercase/clean the text first).

    Args:
        text: Space-separated string of tokens.

    Returns:
        The string with all stop-word tokens removed.
    """
    # Build the lookup set once per call: membership tests become hash lookups
    # instead of a linear scan of the list for every token, while still
    # reflecting any later change made to the module-level ``stop_words``.
    stop_word_lookup = frozenset(stop_words)
    kept_tokens = (word for word in text.split(' ') if word not in stop_word_lookup)
    return ' '.join(kept_tokens)

In [81]:
stemmer = nltk.SnowballStemmer('english')


def stem_text(text):
    """Stem each space-separated token of ``text`` with the Snowball stemmer.

    Args:
        text: Space-separated string of tokens.

    Returns:
        The stemmed tokens joined back together with single spaces.
    """
    stemmed_tokens = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stemmed_tokens)

In [82]:
def preprocess_data(text):
    """Run the full text-preparation pipeline on a single input.

    The input is passed through ``clean_text``, ``remove_stopwords`` and
    ``stem_text`` in that order.

    Args:
        text: Raw text to prepare.

    Returns:
        A ``pandas.Series`` built from the processed string.
    """
    processed = stem_text(remove_stopwords(clean_text(text)))
    return pd.Series(processed)

In [83]:
# Run the preprocessing pipeline on one sample comment; the result is the
# pandas Series returned by preprocess_data, used as model input below.
text = preprocess_data("Nice video!")

In [90]:
def embed(corpus):
    """Convert texts into token sequences with the loaded ``tokenizer``.

    Despite the name, no embedding lookup happens here: the function only
    calls ``tokenizer.texts_to_sequences``.

    Args:
        corpus: Iterable of preprocessed text strings.

    Returns:
        Whatever ``tokenizer.texts_to_sequences`` returns for ``corpus``.
    """
    sequences = tokenizer.texts_to_sequences(corpus)
    return sequences

# Turn the preprocessed text into token ids, then pad to length 101 (the
# sequence length shown in the model summary). padding='post' places the
# padding zeros after the tokens.
# NOTE: despite the 'train_' prefix, this variable holds the inference input.
token_sequences = embed(text)
train_padded_sentences = pad_sequences(token_sequences, maxlen=101, padding='post')

train_padded_sentences

array([[80,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0]])

In [91]:
# Check the padded batch shape: one row per input text, 101 token ids per row.
train_padded_sentences.shape

(1, 101)

In [92]:
# Score the padded input, then binarise the model output at a 0.5 cut-off
# (presumably a sigmoid probability -- the output layer is not visible in the
# summary above, TODO confirm).
predicted_scores = model.predict(train_padded_sentences)
y_preds = (predicted_scores > 0.5).astype("int32")

In [93]:
# Show the thresholded prediction (1 where the model output exceeded 0.5,
# otherwise 0). TODO: document which class each label stands for.
y_preds

array([[0]])