In [1]:
import nltk
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 20000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300

WORKDIR = '/home/lukas/git-projects/lstm-irgan'
DOCUMENTS_DIR = WORKDIR + '/data/wikiclir/dev.docs'  #'/data/example/documents/'
QUERIES = WORKDIR + '/data/wikiclir/dev.queries' #'/data/example/queries.txt'
LABELLED_DATA = WORKDIR + '/data/wikiclir/dev.qrel' #'/data/example/labelled_data.txt'

def __get_documents():
    path = DOCUMENTS_DIR
    documents = {}
    doc_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t", 1)
            id = int(values[0])
            text = values[1]
            documents[id] = text
            doc_ids.append(id)
    return documents, doc_ids


def __get_queries():
    path = QUERIES
    queries = {}
    query_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t", 1)
            id = int(values[0])
            text = values[1]
            queries[id] = text
            query_ids.append(id)
    return queries, query_ids


def __get_ratings():
    path = LABELLED_DATA
    ratings = {}

    with open(path) as f:
        content = f.readlines()
        for line in content:
            values = line.split("\t")
            query = int(values[0])
            text = int(values[2])
            rating = float(values[3])

            if query in ratings.keys():
                ratings[query][text] = rating
            else:
                ratings[query] = {text: rating}

    return ratings


def __filter_stop_words(texts, stop_words):
    for i, text in enumerate(texts):
        new_text = [word for word in text.split() if word not in stop_words]
        texts[i] = ' '.join(new_text)
    return texts


def __init_tokenizer(text_data, max_sequence_length):
    texts = list(text_data.values())
    ids = list(text_data.keys())

    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    texts = __filter_stop_words(texts, stop_words)

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=max_sequence_length)

    text_data_sequenced = {}
    for i, text in enumerate(data):
        text_data_sequenced[ids[i]] = text

    return tokenizer, text_data_sequenced


def get_data():
    documents_data, doc_ids = __get_documents()
    queries_data, query_ids = __get_queries()
    ratings_data = __get_ratings()

    print('Tokenize queries')
    tokenizer_q, queries_data = __init_tokenizer(queries_data, MAX_SEQUENCE_LENGTH)
    print('Tokenize documents')
    tokenizer_d, documents_data = __init_tokenizer(documents_data, MAX_SEQUENCE_LENGTH)

    print('Found %s training data.' % len(ratings_data))

    return query_ids, ratings_data, documents_data, queries_data, tokenizer_q, tokenizer_d


Using TensorFlow backend.


In [None]:
query_ids, ratings_data, documents_data, queries_data, tokenizer_q, tokenizer_d = get_data()

Tokenize queries
[nltk_data] Downloading package stopwords to /home/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Found 41594 unique tokens.
Tokenize documents
[nltk_data] Downloading package stopwords to /home/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import os
import warnings

import tensorflow as tf
from keras import backend

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=FutureWarning)

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = False #True
tf_config.gpu_options.allocator_type = 'BFC'

sess = tf.Session(graph=tf.get_default_graph(), config=tf_config)

backend.set_session(sess)

os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #""0"

In [None]:
import numpy as np
import random

#from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

x_train, x_test = train_test_split(query_ids, test_size=0.15, random_state=42)

p_best_val = 0.0
ndcg_best_val = 0.0

skf = KFold(n_splits=5, shuffle=True)   

In [None]:
def __build_train_data(x_train, ratings_data, queries_data, documents_data):
    train_queries_data = {}
    train_documents_data = {}
    train_ratings_data = {}

    for query_id in x_train:
        train_ratings_data[query_id] = ratings_data[query_id]
        train_queries_data[query_id] = queries_data[query_id]
        for key in ratings_data.keys():
            if key in documents_data.keys():
                train_documents_data[key] = documents_data[key]

    return train_ratings_data, train_queries_data, train_documents_data

In [None]:
x_train_k, x_val_k = train_test_split(query_ids, test_size=0.50, random_state=42)

train_ratings_data, train_queries_data, train_documents_data = __build_train_data(x_train, ratings_data, queries_data, documents_data)

In [None]:
def __get_query_specific_data(query_id, ratings_data, documents_data):
    # get all query specific ratings
    x_pos_list = list(ratings_data[query_id].keys())
    y_pos_list = list(ratings_data[query_id].values())

    # get all other ratings
    docs_pos_ids = np.unique(x_pos_list)
    candidate_list = []
    for doc_id in documents_data.keys():
        if doc_id not in docs_pos_ids:
            candidate_list.append(doc_id)

    return x_pos_list, y_pos_list, candidate_list

def __get_rand_batch_from_candidates_for_negatives(query_id, queries_data, documents_data, candidate_list, x_pos_list):
    rand_batch = np.random.choice(np.arange(len(candidate_list)), [5 * len(x_pos_list)])

    # prepare pos and neg data
    data_queries = [queries_data[query_id]] * len(rand_batch)
    doc_ids = np.array(candidate_list)[rand_batch]
    data_documents = [documents_data[x] for x in doc_ids]

    # Importance Sampling
    prob = [0.2,0.2,0.2]

    return prob, data_queries, data_documents

query_id = x_train[0]

x_pos_list, y_pos_list, candidate_list = __get_query_specific_data(query_id, ratings_data, documents_data)

prob, data_queries, data_documents = __get_rand_batch_from_candidates_for_negatives(query_id, queries_data, documents_data, candidate_list, x_pos_list)

neg_list = np.random.choice(candidate_list, size=[len(x_pos_list)])

In [None]:
import fasttext

class FastText(object):
    def __init__(self, fasttext_lib_directory, fasttext_model_path):
        cmds = [fasttext_lib_directory, 'print-word-vectors', fasttext_model_path]  # Add '-' in the end for interactive mode, yet it didn't work for me...
        self.model = subprocess.Popen(cmds, stdout=subprocess.PIPE, stdin=subprocess.PIPE, env=os.environ.copy())

        # Test the model
        print('\nTesting the model...\nPrediction for apple: ')
        item = 'apple\n'
        item = item.encode('utf-8')
        self.model.stdin.write(item)
        result = self.model.stdout.readline()
        result = result[len(item):]
        result = np.fromstring(result, dtype=np.float32, sep=' ')
        self.vector_size = len(result)
        print('Length of word-vector is:', self.vector_size)

    def __getitem__(self, item):
        assert type(item) is str
        initial_item = item
        item = item.lower().replace('/', '').replace('-', '').replace('\\', '').replace('`', '')
        if len(item) == 0 or ' ' in item:
            raise KeyError('Could not process: ' + initial_item)

        if not item.endswith('\n'):
            item += '\n'

        item = item.encode('utf-8')
        self.model.stdin.write(item)
        self.model.stdout.flush()
        result = self.model.stdout.readline()  # Read result
        result = result[len(item):]            # Take everything but the initial item
        result = np.fromstring(result, dtype=np.float32, sep=' ')

        if len(result) != self.vector_size:
            print('Could not process: ' + item)
            raise KeyError('Could not process: ' + initial_item)
        return result

In [None]:
model = FastText(fasttext_lib_directory='./fastText/fasttext', fasttext_model_path='/home/lukas/Downloads/BioWordVec_PubMed_MIMICIII_d200.bin')
print(model['apple'])
print(model['banana'])