In [1]:
import re
import urllib

import requests

# Module-level lookup table keyed by instance string; consulted by
# get_concepts_of_instance_by_probase when its use_cache flag is set.
cache = {}


def get_concepts_of_instance_by_probase(instance, use_cache=True):
    """
    Fetches the concepts and their probabilities for a given instance from Probase.

    :param instance: the instance, for which the concepts should be requested
    :param use_cache: if true, the module-level ``cache`` is consulted before the request is sent and is
        filled with the response afterwards, to avoid unnecessary requests
    :return: the decoded JSON response of the service (the concepts and their probability)
    """
    # Imported locally: a bare ``import urllib`` does not guarantee that the ``urllib.parse``
    # submodule is loaded, it only worked because another package happened to import it.
    import os
    from urllib.parse import quote_plus

    if use_cache and instance in cache:
        return cache[instance]

    # NOTE(review): the key is still present in the source as a fallback so existing setups keep
    # working; prefer supplying it through the PROBASE_API_KEY environment variable.
    api_key = os.environ.get('PROBASE_API_KEY', 'eT5luCbmII34ZvpPVs7HxtbUU1cFcE12')
    probase_url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=20&api_key={}'
    request_url = probase_url.format(quote_plus(instance), quote_plus(api_key))
    response = requests.get(request_url)
    concepts = response.json()

    # Remember the answer; without this the cache lookup above could never succeed.
    if use_cache:
        cache[instance] = concepts
    return concepts


def appendIfNotEmpty(list, item):
    """
    Adds item to the end of list, modifying the list in place.
    Falsy items (None, empty string, ...) are skipped.
    :param list: the list which receives the item
    :param item: the candidate item
    """
    if not item:
        return
    list.append(item)


def split_text_in_words(text):
    """
    Splits a given text into words and punctuation tokens.

    Quotes, brackets, colons and semicolons always become tokens of their own. In addition, one
    leading ellipsis, one leading opening mark and one trailing ellipsis or closing mark are
    separated from each remaining chunk.
    :param text: the text which should be split into words
    :return: a list containing the words and punctuation tokens in order of appearance
    """
    opening_marks = ("\"", "(", "[", "{", "<", "«", "…", "“")
    closing_marks = (".", ",", ":", ";", "]", ")", "}", "!", "?", "\"", ">", "»", "…", "”")
    real_words = []

    def emit(token):
        # Empty remainders (e.g. the part before the "?" in a lone "?") are not tokens.
        if token:
            real_words.append(token)

    # Each delimiter needs its own alternative; a fused '\}:;' would only match the literal
    # three-character sequence and silently drop single '}', ':' and ';' characters.
    words = re.findall(r'\'|’|"|”|“|»|«|\(|\)|\[|\]|\{|\}|:|;|[^\'’"”“»«\(\)\[\]\{\}\s:;]+', text)
    for word in words:
        word = word.strip()
        if word.startswith("..."):
            emit(word[:3])
            # Continue with the remainder only, otherwise the whole chunk would be emitted twice.
            word = word[3:]
        if word.startswith(opening_marks):
            emit(word[:1])
            word = word[1:]
        if word.endswith("..."):
            emit(word[:-3])
            emit(word[-3:])
        elif word.endswith(closing_marks):
            emit(word[:-1])
            emit(word[-1:])
        else:
            emit(word)
    return real_words


In [2]:
import itertools
import os

import gensim
from gensim import corpora
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import smart_open, simple_preprocess
from stop_words import get_stop_words


def iter_wiki(wiki_dump_file, min_length_of_article=50, ignore_namespaces=None):
    """
    Iterator over wiki_dump_file.
    Yields title and tokens for each accepted article in the dump file.
    Ignores meta articles, i.e. pages whose title starts with '<namespace>:' for one of the given namespaces.
    Ignores short articles.
    Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft'
    :param wiki_dump_file: the dump file
    :param min_length_of_article: the min number of tokens an article needs to be yielded. Default = 50
    :param ignore_namespaces: list of namespaces which should be ignored.
    :return: generator of (title, tokens) pairs
    """
    if ignore_namespaces is None:
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    meta_prefixes = tuple(namespace + ':' for namespace in ignore_namespaces)

    for title, text, pageid in _extract_pages(smart_open(wiki_dump_file)):
        # Check the title first: markup filtering and tokenizing are the expensive steps and
        # would be wasted on pages that are dropped anyway.
        if title.startswith(meta_prefixes):
            continue
        tokens = tokenize(filter_wiki(text))
        if len(tokens) < min_length_of_article:
            continue  # ignore short articles
        yield title, tokens


# TODO compare simple_preprocess to my own preprocess
def tokenize(text):
    """
    Runs the text through simple_preprocess and drops every token listed in STOPWORDS.
    :param text: the text which should be tokenized.
    :return: list of the remaining tokens, in their original order
    """
    kept_tokens = []
    for candidate in simple_preprocess(text):
        if candidate in STOPWORDS:
            continue
        kept_tokens.append(candidate)
    return kept_tokens


class LDA():
    """
    Wrapper around a gensim LdaMulticore model: training on a document folder or on a serialized
    wiki corpus, online updates, loading and saving.
    """

    def __init__(self):
        self.stop_words = get_stop_words('en')
        # Set by load() or by one of the train_* methods; None until then.
        self.ldamodel = None

    def load(self, model_file):
        """
        Loads a LDA model from a given file
        :param model_file: the file which contains the model, which should be loaded
        """
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file)

    def train_on_document_folder(self, num_topics, document_folder, model_outputfile, training_iterations=20):
        """
        Trains a new lda model, based on a folder with text files.
        Every non-empty line of every file is treated as one document.
        :param num_topics: the number of topics, which should be generated
        :param document_folder: the folder, which contains the documents
        :param model_outputfile: the file in which the trained model should be saved
        :param training_iterations: the number of passes over the corpus during training
        """
        corpus, dictionary = self.__create_lda_corpus_based_on_document_folder(document_folder)

        self.ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary,
                                                                passes=training_iterations)
        self.ldamodel.save(model_outputfile)

    def generate_bow_of_wiki_dump(self, wiki_dump_file, bow_output_file, dict_output_file):
        """
        Builds a dictionary and a bag-of-words corpus from a wiki dump and stores both on disk.
        The dictionary is pruned with filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        before the corpus is serialized. The dump is read twice: once for the dictionary, once
        for the corpus.
        :param wiki_dump_file: the dump file
        :param bow_output_file: the file in which the bag-of-words corpus is serialized (Matrix Market format)
        :param dict_output_file: the file in which the dictionary is saved
        """
        doc_stream = (tokens for _, tokens in iter_wiki(wiki_dump_file))
        id2word_wiki = gensim.corpora.Dictionary(doc_stream)
        print(id2word_wiki)
        id2word_wiki.filter_extremes(no_below=20, no_above=0.1, keep_n=None)
        print(id2word_wiki)
        wiki_corpus = WikiCorpus(wiki_dump_file, id2word_wiki)
        print("save bow...")
        gensim.corpora.MmCorpus.serialize(bow_output_file, wiki_corpus)
        print("save dict")
        id2word_wiki.save(dict_output_file)

    def train_on_wiki_dump(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20,
                           max_docs=None):
        """
        Trains a new LDA model on a previously serialized bag-of-words corpus and its dictionary
        (e.g. the files written by generate_bow_of_wiki_dump).
        :param num_topics: the number of topics, which should be generated
        :param bow_path: the path incl. filename, from which the bag-of-words corpus is loaded
        :param dict_path: the path incl. filename, from which the dictionary is loaded
        :param model_outputfile: the file in which the trained model should be stored
        :param training_iterations: the number of passes over the corpus during training
        :param max_docs: the number of how many docs should be used for training, if None all docs are used
        """
        print("load bow...")
        mm_corpus = gensim.corpora.MmCorpus(bow_path)
        print("load dict...")
        id2word_wiki = gensim.corpora.Dictionary.load(dict_path)
        clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs)
        print("start trainig")
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics,
                                                                id2word=id2word_wiki, passes=training_iterations)
        print("save model")
        self.ldamodel.save(model_outputfile)

    def __create_lda_corpus_based_on_document_folder(self, document_folder, dictionary=None):
        """
        Creates a corpus and the corresponding dictionary, for a given document folder.
        Every non-empty line of every regular file in the folder is one document.
        :param document_folder: the folder, which contains the documents
        :param dictionary: an existing id <-> term dictionary used to encode the documents;
            if None a new dictionary is built from the documents
        :return: the corpus and the dictionary
        """
        doc_set = []
        for file_name in os.listdir(document_folder):
            path = os.path.join(document_folder, file_name)
            # Sub-directories cannot be opened as text files; skip them.
            if not os.path.isfile(path):
                continue
            with open(path, "r", encoding="utf8") as f:
                for line in f:
                    stripped = line.strip()
                    if stripped:
                        doc_set.append(stripped)

        print("Finished reading {} documents".format(len(doc_set)))

        # list of tokenized documents
        texts = self.preprocess_documents(doc_set)

        # turn our tokenized documents into a id <-> term dictionary
        if dictionary is None:
            dictionary = corpora.Dictionary(texts)

        # convert tokenized documents into a document-term matrix
        return ([dictionary.doc2bow(text) for text in texts], dictionary)

    def preprocess_documents_original(self, docs):
        """
        Tokenize and remove stop words of given documents
        :param docs: collection which contains the documents
        :return: the preprocessed texts
        """
        stop_words = set(self.stop_words)
        texts = []
        for doc in docs:
            tokens = tokenize(doc.lower())
            texts.append([token for token in tokens if token not in stop_words])
        return texts

    def preprocess_documents(self, documents, min_document_length=200):
        """
        Preprocess given documents.

        ignores short documents, to avoid unwanted word 2 word connections,
        tokenizes the remaining documents,
        removes stop words
        :param documents: collection of to be processed documents
        :param min_document_length: minimal length of a document as measured by len(), i.e. in
            characters for plain strings. Default = 200
        :return: one list of cleaned tokens per kept document
        """
        namespaces = ['Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book',
                      'Draft']
        namespaces = [namespace.lower() for namespace in namespaces]
        stop_words = set(self.stop_words)

        texts = []
        for document in documents:
            # remove short documents
            if len(document) < min_document_length:
                continue
            # NOTE(review): intended to drop meta-articles by title, but for plain str documents
            # `document.title` is the bound str.title method, so this test never matches and
            # nothing is filtered here. Left unchanged until it is clear where a title should
            # come from.
            if document.title in namespaces:
                continue
            tokens = tokenize(document.lower())
            # Stoplist cleaning: keep the cleaned list, not the raw tokens.
            texts.append([token for token in tokens if token not in stop_words])
        # Remove most frequent and less frequent words
        # for word in all documents:
        #    if word appears in more than 10 % of the articles:
        #        remove(word) from whole corpora
        #    if word apperas in less than 20 articles:
        #        remove(word) from corpora

        # Additional possible steps
        # filter by length
        # lemmatization
        # stemming
        # parts of speech

        # then keep top n words # recommended 50.000 - 100.000
        return texts

    def update_on_document_folder(self, document_folder, model_output_file):
        """
        Online learning.
        Updates the current LDA model, trained on the given documents
        :param document_folder: the folder, which contains the new documents
        :param model_output_file: the outputfile, where the updated model should be stored
        """
        # The new documents must be encoded with the vocabulary the model was trained with; a
        # dictionary built from the new documents alone would assign unrelated token ids.
        # Assumes the model's id2word offers doc2bow (a gensim Dictionary), as it does for
        # models trained by this class.
        corpus, _ = self.__create_lda_corpus_based_on_document_folder(document_folder,
                                                                     dictionary=self.ldamodel.id2word)
        print('start updating')
        self.ldamodel.update(corpus)
        print('finished updating. Now save model')
        self.ldamodel.save(model_output_file)


class WikiCorpus(object):
    """
    Streams the articles of a wiki dump as bag-of-words vectors.
    The dump is re-read from the start on every iteration.
    """

    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        :param dump_file: the wiki dump file which is read on iteration
        :param dictionary: the dictionary whose doc2bow converts tokens to bag-of-words
        :param clip_docs: only the first `clip_docs` accepted articles are used; None means all
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
        # Titles of the articles yielded by the most recent iteration.
        self.titles = []
        # Lazily computed number of articles, only needed when clip_docs is None.
        self._num_docs = None

    def __iter__(self):
        """
        Iterator over wiki corpus
        :return: bag-of-words format = list of `(token_id, token_count)` 2-tuples
        """
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        """
        :return: `clip_docs` if it was given; otherwise the number of articles in the dump,
            counted with one full pass over the dump on first use and remembered afterwards
        """
        if self.clip_docs is not None:
            return self.clip_docs
        # len() must return an int; returning None here would raise a TypeError.
        if self._num_docs is None:
            self._num_docs = sum(1 for _ in iter_wiki(self.dump_file))
        return self._num_docs




In [75]:
# Create the LDA wrapper and load a previously trained model from disk.
lda = LDA()
# Alternative model file, currently disabled:
#lda.load('../models/ldamodel_topics100_trainiter20_train_en.gensim')
lda.load('../models/ldamodel_50.gensim')

In [76]:
from utilities import get_concepts_of_instance_by_probase
from utilities import split_text_in_words
import collections
import numpy as np

class Conceptualizer():
    """
    Picks the most plausible Probase concept for an instance, using the topics of an LDA model
    to take the surrounding sentence into account.
    """

    def __init__(self, lda):
        """
        :param lda: object whose `ldamodel` attribute holds the trained LDA model
        """
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance):
        """
        Conceptualize the given instance in the given context (sentence)
        :param sentence: a sentence as context
        :param instance: the instance, which should be conceptualized in the given context
        :return: the most likely concept for the instance in the given context, or None if
            Probase returns no concepts or none of them is part of the LDA vocabulary
        """
        concepts = get_concepts_of_instance_by_probase(instance)
        if not concepts:
            return None

        # NOTE: an earlier experiment additionally asked Probase about the instance extended by
        # its left / right neighbour word, but the answers were never used; those two extra
        # requests per call are no longer made.
        probabilities_of_concepts = self.__calculate_probs_of_concepts(concepts, sentence)
        if not probabilities_of_concepts:
            # No concept could be scored; max() on an empty list would raise a ValueError.
            return None

        return max(probabilities_of_concepts, key=lambda item: item[1])[0]

    def __calculate_probs_of_concepts(self, concepts, sentence):
        """
        Calculates for each concept the probability of the concept for the given sentence.
        The score of a concept is its Probase probability multiplied by
        sum over topics of (weight of the concept term in the topic * weight of the topic in the sentence).
        Concepts which are not part of the LDA vocabulary are skipped.
        :param concepts: the concepts and their probability
        :param sentence: the given sentence
        :return: list of (concept, score) tuples, ordered by concept name
        """
        token2id = self.ldamodel.id2word.token2id
        known_concepts = [concept for concept in sorted(concepts) if concept in token2id]
        if not known_concepts:
            return []

        # Everything below depends only on the model and the sentence, so it is computed once
        # instead of once per concept.
        topic_term = np.asarray(self.ldamodel.state.get_lambda(), dtype=float)
        # Row sums turn each topic's term weights into a distribution over terms.
        topic_totals = topic_term.sum(axis=1)
        bag_of_words = self.ldamodel.id2word.doc2bow(simple_preprocess(sentence))
        # topic distribution for the given bag of words
        topics_of_text = self.ldamodel.get_document_topics(bag_of_words, minimum_probability=0)

        probabilities_of_concepts = []
        for concept in known_concepts:
            prob_c_given_w = concepts[concept]
            probs_of_topics_for_given_concept = topic_term[:, token2id[concept]] / topic_totals

            weighted_sum = 0
            for topic_id, prob_of_topic in topics_of_text:
                weighted_sum += probs_of_topics_for_given_concept[topic_id] * prob_of_topic

            probabilities_of_concepts.append((concept, prob_c_given_w * weighted_sum))
        return probabilities_of_concepts


In [77]:
from nltk.stem.snowball import EnglishStemmer
# Build a conceptualizer on top of the loaded LDA model and try it on a sample question.
conceptualizer = Conceptualizer(lda)
conceptualizer.conceptualize("When was Barack Obama born?", "Barack Obama")

'leader'

In [7]:
# Number of distinct tokens in the vocabulary of the loaded model.
len(lda.ldamodel.id2word.token2id)

30202

In [8]:
# Check whether the token 'person' is part of the model's vocabulary.
'person' in lda.ldamodel.id2word.token2id

True