In [1]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Embedding, LSTM, Dropout
from keras.models import Model
from keras import regularizers
from keras.callbacks import ModelCheckpoint



# Dependency note kept from the original author: pip install h5py
# (presumably required for the .h5 weight files handled by Keras).

# --- Hyper-parameters --------------------------------------------------------
EMBEDDING_DIM = 300      # also selects which glove.6B.<dim>d.txt file is read
EPOCH = 100
BATCH_SIZE = 10
LONGEST_SENTENCE = 39

# --- Sequence / vocabulary limits --------------------------------------------
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = LONGEST_SENTENCE  # pad every sample to the longest sentence
VALIDATION_SPLIT = 0.1

# --- File locations ----------------------------------------------------------
# Despite its name, GLOVE_DIR is the path of the GloVe vectors *file*.
# Alternative location used previously: ./glove_data/glove.6B.<dim>d.txt
GLOVE_DIR = "".join([
    "../QuestionClassificationScikit/glove_data/glove.6B.",
    str(EMBEDDING_DIM),
    "d.txt",
])
TEXT_DATA = "./train_5500.label.text"
TAG_DATA = "./train_5500.label.tag"
SAVE_DIR = "./train_model"
NAME_MODEL = "model_LSTM"

# --- Containers and label encoding -------------------------------------------
texts = []   # text samples
labels = []  # numeric label ids, one per text sample
# Label name -> numeric id; ids are assigned in the order the names are listed.
mapping2 = {
    name: class_id
    for class_id, name in enumerate(('ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'))
}

print('Indexing word vectors.')

def loadGloveModel(gloveFile):
    """Read a GloVe text file into a ``word -> embedding`` dictionary.

    Every non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as floats.

    Parameters
    ----------
    gloveFile : str
        Path of the GloVe vectors file; it is decoded as UTF-8.

    Returns
    -------
    dict
        Maps each word to a float numpy array reshaped to ``(1, n_values)``.
        When a word appears on several lines the last one is kept.

    Raises
    ------
    ValueError
        If a value after the word cannot be converted to ``float``.
    """
    print ("Loading Glove Model")
    model = {}
    # The context manager closes the file even if parsing a line fails.
    with open(gloveFile, 'r', encoding="UTF-8") as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank / whitespace-only line (e.g. trailing newline): no word to index.
                continue
            word = splitLine[0]
            embedding = [float(val) for val in splitLine[1:]]
            model[word] = np.array(embedding).reshape(1,-1)
    print ("Done.",len(model)," words loaded!")
    return model

# Word -> vector lookup read from the pre-trained GloVe file.
embeddings_index = loadGloveModel(GLOVE_DIR)

# second, prepare text samples and their labels
print('Processing text dataset')

# One text sample per line; line endings are kept as read.
with open(TEXT_DATA, encoding="ISO-8859-1", mode="r") as file:
    texts = list(file)

# One tag per line, translated to its numeric id through mapping2
# (an unknown tag raises KeyError).
with open(TAG_DATA, encoding="ISO-8859-1", mode="r") as file:
    labels = [mapping2[raw_tag.strip()] for raw_tag in file]

print('Found %s texts.' % len(texts))

# Fit the tokenizer on the training texts; later cells reuse it to turn new
# questions into integer sequences.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

Using TensorFlow backend.


Indexing word vectors.
Loading Glove Model
Done. 400000  words loaded!
Processing text dataset
Found 5452 texts.


In [13]:
from keras.models import model_from_json

# The architecture (.json) and the weights (.h5) share one path prefix, so it
# is spelled once to keep the two files in sync.
# NOTE(review): this is a CNN checkpoint although NAME_MODEL in the first cell
# says "model_LSTM" — confirm this is the intended model.
MODEL_PATH_PREFIX = "./train_model/model_CNN_300_10_reg_bigger_window"

# load json and create model
# (`with` guarantees the handle is closed even if reading fails)
with open(MODEL_PATH_PREFIX + ".json", 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(MODEL_PATH_PREFIX + ".h5")
print("Loaded model from disk")

Loaded model from disk


In [87]:
# Canned replies keyed by numeric class id. The prediction cells index this
# dict with the same id they pass to inv_mapping2, so the keys follow mapping2.
# NOTE(review): the last reply for id 5 is an empty string — confirm it is intended.
inv_mapping = {
    0: ["I am not sure.", "What?"],                                       # ABBR
    1: ["Sounds right.", "Please, tell me more."],                        # DESC
    2: ["I will take it.", "This type of things is strange."],            # ENTY
    3: ["I dont know him.", "Ok", "No", "He is right next to me."],       # HUM
    4: ["I will be there", "I know that place", "It is near to me."],     # LOC
    5: ["That is correct.", "Very big numbers.", ""],                     # NUM
}

In [88]:
# Reverse lookup of mapping2: numeric class id -> label name.
inv_mapping2 = dict(zip(mapping2.values(), mapping2.keys()))

In [91]:
test_text = ["who are you"]
# Encode the question with the tokenizer fitted on the training texts.
sequence = tokenizer.texts_to_sequences(test_text)
# pad_sequences takes the list of integer sequences directly; wrapping it in
# np.array first is unnecessary and breaks for sentences of different lengths.
data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
# argmax over the flattened prediction; with a single input text this is the
# index of the highest-scoring class.
choose = np.argmax(loaded_model.predict(data))
print(inv_mapping2[choose])
print("Answers :" , inv_mapping[choose])

HUM
Answers : ['I dont know him.', 'Ok', 'No', 'He is right next to me.']


In [92]:
test_text = ["Where are you from?"]
# Encode the question with the tokenizer fitted on the training texts.
sequence = tokenizer.texts_to_sequences(test_text)
# pad_sequences takes the list of integer sequences directly; wrapping it in
# np.array first is unnecessary and breaks for sentences of different lengths.
data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
# argmax over the flattened prediction; with a single input text this is the
# index of the highest-scoring class.
choose = np.argmax(loaded_model.predict(data))
print(inv_mapping2[choose])
print("Answers :" , inv_mapping[choose])

LOC
Answers : ['I will be there', 'I know that place', 'It is near to me.']


In [93]:
test_text = ["Is there something like god?"]
# Encode the question with the tokenizer fitted on the training texts.
sequence = tokenizer.texts_to_sequences(test_text)
# pad_sequences takes the list of integer sequences directly; wrapping it in
# np.array first is unnecessary and breaks for sentences of different lengths.
data = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
# argmax over the flattened prediction; with a single input text this is the
# index of the highest-scoring class.
choose = np.argmax(loaded_model.predict(data))
print(inv_mapping2[choose])
print("Answers :" , inv_mapping[choose])

DESC
Answers : ['Sounds right.', 'Please, tell me more.']
