In [1]:
from stop_words import get_stop_words
from gensim import corpora
import gensim
import urllib
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
import os
import logging
import re

# In-memory memo of Probase lookups, keyed by the entity string; filled and
# read by get_entity_probase_concepts so each entity is requested only once
# per kernel session.
probase_cache = {}
# Not referenced in the visible cells -- presumably a memo for a translation
# helper defined elsewhere; TODO confirm or remove.
translate_cache = {}


def split_tweet_in_words(tweet):
    """Split a tweet into word and punctuation tokens.

    The text is first cut on whitespace, with quotes, brackets, braces,
    colons and semicolons emitted as separate tokens. Each remaining chunk
    then has a leading "..." or opening character, and a trailing "..." or
    single punctuation character, split off into its own token.

    Args:
        tweet: The text to tokenize.

    Returns:
        A list of non-empty string tokens in their original order.
    """
    opening_chars = ("\"", "(", "[", "{", "<", "«", "…", "“")
    closing_chars = (".", ",", ":", ";", "]", ")", "}", "!", "?", "\"", ">", "»", "…", "”")
    real_words = []

    # `}`, `:` and `;` are separate alternatives; the previous pattern fused
    # them into the literal sequence "}:;", so lone occurrences were dropped.
    words = re.findall(r'\'|’|"|”|“|»|«|\(|\)|\[|\]|\{|\}|:|;|[^\'’"”“»«\(\)\[\]\{\}\s:;]+', tweet)
    for word in words:
        if word.startswith("..."):
            real_words.append("...")
            # Consume the ellipsis so the remainder is emitted only once.
            word = word[3:]
        if word.startswith(opening_chars):
            real_words.append(word[:1])
            word = word[1:]

        if word.endswith("..."):
            head, tail = word[:-3], word[-3:]
        elif word.endswith(closing_chars):
            head, tail = word[:-1], word[-1:]
        else:
            head, tail = word, ""

        # Never emit empty tokens (e.g. when the chunk was punctuation only).
        if head:
            real_words.append(head)
        if tail:
            real_words.append(tail)
    return real_words


def append_if_not_empty(list, item):
    """Append `item` to `list` in place unless `item` is falsy.

    Falsy values (empty string, None, 0, ...) are skipped silently.
    Nothing is returned.
    """
    if not item:
        return
    list.append(item)


def get_entity_probase_concepts(entity, timeout=30):
    """Look up the Probase concepts of an entity, with in-memory caching.

    Queries the Microsoft Concept Graph `ScoreByProb` endpoint for the top 20
    concepts of `entity`. If the first response is empty, the request is
    repeated once. The final result (even when empty) is stored in
    `probase_cache`, so each entity triggers network traffic at most once per
    session.

    Args:
        entity: The instance string to look up.
        timeout: Seconds to wait for each HTTP request before requests raises
            a timeout error, instead of blocking forever.

    Returns:
        The decoded JSON response. Callers in this notebook treat it as a
        dict mapping concept name to score; it is empty when nothing was found.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is
    # loaded; import it explicitly rather than relying on another package.
    import urllib.parse

    if entity in probase_cache:
        return probase_cache[entity]

    # NOTE(review): the fallback key is embedded in the notebook; prefer
    # setting PROBASE_API_KEY in the environment and rotating/removing it.
    api_key = os.environ.get('PROBASE_API_KEY', 'eT5luCbmII34ZvpPVs7HxtbUU1cFcE12')
    url = 'https://concept.research.microsoft.com/api/Concept/ScoreByProb?instance={}&topK=20&api_key={}'
    request_url = url.format(urllib.parse.quote_plus(entity), urllib.parse.quote_plus(api_key))

    concepts = {}
    # One initial attempt plus a single retry when the answer comes back empty.
    for _attempt in range(2):
        # NOTE(review): verify=False disables TLS certificate validation; it is
        # kept to preserve existing behaviour but should be removed if the
        # endpoint presents a valid certificate.
        response = requests.get(request_url, verify=False, timeout=timeout)
        concepts = response.json()
        if len(concepts) > 0:
            break

    # for now let's keep it simple: cache whatever we got, including "nothing"
    probase_cache[entity] = concepts
    return concepts



In [12]:

class LDA():
    """Small wrapper around a gensim LDA model and its text preprocessing.

    Documents are read line by line from every file in a folder, lower-cased,
    tokenized with the supplied tokenizer and stripped of English stop words
    before being converted to bag-of-words vectors.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer (any object with a `tokenize(str)` method)."""
        self.tokenizer = tokenizer
        # A set gives constant-time membership tests during stop word removal.
        self.stop_words = set(get_stop_words('en'))

    def load(self, file):
        """Load a previously saved gensim LDA model from `file`."""
        self.ldamodel = gensim.models.ldamodel.LdaModel.load(file)

    def update(self, document_folder, save_to):
        """Update the loaded model with new documents and save it to `save_to`.

        The new corpus is encoded with the model's own `id2word` dictionary so
        that token ids agree with the ids the model was trained on. A freshly
        built dictionary would assign unrelated ids. Tokens unknown to the
        model are ignored by `doc2bow`.
        """
        corpus, _ = self.create_lda_corpus(document_folder, dictionary=self.ldamodel.id2word)
        # update LDA model
        self.ldamodel.update(corpus)
        print('Finished updating')
        self.ldamodel.save(save_to)

    def train(self, nr_of_topics, document_folder, save_to, training_iterations = 20):
        """Train a new multicore LDA model and save it to `save_to`.

        Args:
            nr_of_topics: Number of topics to fit.
            document_folder: Folder whose files hold one document per line.
            save_to: Path the trained model is written to.
            training_iterations: Number of passes over the corpus.
        """
        corpus, dictionary = self.create_lda_corpus(document_folder)

        # generate LDA model
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics=nr_of_topics, id2word=dictionary, passes=training_iterations)
        print('Finished training')
        self.ldamodel.save(save_to)

    def create_lda_corpus(self, document_folder, dictionary=None):
        """Read all documents in a folder and convert them to bag-of-words.

        Args:
            document_folder: Folder whose files hold one document per line;
                blank lines are skipped.
            dictionary: Optional existing id <-> term dictionary (anything
                with a `doc2bow` method). When omitted, a new gensim
                `Dictionary` is built from the documents.

        Returns:
            A `(corpus, dictionary)` tuple where `corpus` is a list with one
            bag-of-words vector per document.
        """
        # load documents; sort the listing so ids are reproducible across runs
        doc_set = []
        for file in sorted(os.listdir(document_folder)):
            with open(os.path.join(document_folder, file), "r", encoding="utf8") as f:
                for line in f:
                    stripped = line.strip()
                    if stripped:
                        doc_set.append(stripped)

        print("Read {} documents".format(len(doc_set)))

        # tokenize every document
        texts = [self.process_document_content(doc) for doc in doc_set]

        # turn our tokenized documents into a id <-> term dictionary
        if dictionary is None:
            dictionary = corpora.Dictionary(texts)

        # convert tokenized documents into a document-term matrix
        return ([dictionary.doc2bow(text) for text in texts], dictionary)

    def process_document_content(self, doc):
        """Lower-case and tokenize `doc`, returning its non-stop-word tokens."""
        raw = doc.lower()
        tokens = self.tokenizer.tokenize(raw)

        # remove stop words from tokens
        return [token for token in tokens if token not in self.stop_words]


In [24]:
import collections

class Conceptualizer():
    """Pick the Probase concept of an entity that best fits its sentence.

    Each candidate concept returned by Probase is weighted by how well its
    LDA topic distribution matches the topic distribution of the sentence.
    """

    def __init__(self, lda):
        """Keep the `LDA` wrapper and a shortcut to its loaded gensim model."""
        self.lda = lda
        self.ldamodel = lda.ldamodel

    def conceptualize(self, sentence, instance):
        """Return the most likely concept for `instance` within `sentence`.

        Args:
            sentence: The full text the instance occurs in.
            instance: The entity mention to conceptualize.

        Returns:
            The best-scoring concept name, or None when
            - Probase knows no concept for the instance,
            - the instance extended by its neighbouring word is itself a
              Probase entity (the mention is part of a larger entity), or
            - none of the concepts is in the LDA vocabulary.
        """
        probase_concepts = get_entity_probase_concepts(instance)
        if len(probase_concepts) == 0:
            return None

        try:
            # check context: is the mention part of a longer known entity?
            words = split_tweet_in_words(sentence.lower())
            instance_words = split_tweet_in_words(instance.lower())
            start = words.index(instance_words[0])
            end = start + len(instance_words)
            # Only look left/right when a neighbouring word actually exists.
            # Otherwise the "larger" phrase is just the instance itself and
            # the lookup would always succeed, wrongly rejecting the instance.
            if start > 0:
                larger_probase_concepts = get_entity_probase_concepts(" ".join(words[start - 1:end]))
                if len(larger_probase_concepts) > 0:
                    return None
            if end < len(words):
                larger_probase_concepts = get_entity_probase_concepts(" ".join(words[start:end + 1]))
                if len(larger_probase_concepts) > 0:
                    return None
        except Exception as e:
            # Best effort: a failed context check must not prevent scoring.
            print('Error getting larger concepts for {} in {}: {}'.format(instance.encode('utf-8'), sentence.encode('utf-8'), e))

        vocabulary = self.ldamodel.id2word.token2id
        known_concepts = [name for name in sorted(probase_concepts) if name in vocabulary]

        probabilities = []
        # Constant scaling of the Probase score; it does not affect the ranking.
        alpha = 1000
        if known_concepts:
            # The sentence's topic distribution does not depend on the concept,
            # so infer it once and score every concept against the same values.
            bow = self.ldamodel.id2word.doc2bow(self.lda.process_document_content(sentence))
            doc_topics = self.ldamodel.get_document_topics(bow, minimum_probability=0)
            for name in known_concepts:
                probability = probase_concepts[name] / alpha
                # Topics missing from the result count as probability 0.
                concept_probabilities = dict(self.ldamodel.get_term_topics(name, minimum_probability=0))
                summ = 0
                for topic_nr, topic_prob in doc_topics:
                    summ += concept_probabilities.get(topic_nr, 0) * topic_prob
                probabilities.append((name, summ * probability))

        print(probabilities)
        if len(probabilities) == 0:
            return None
        found_concept = max(probabilities, key=lambda item: item[1])[0]
        return found_concept

In [25]:
from nltk.tokenize import RegexpTokenizer
# Tokenize documents on runs of word characters (punctuation is dropped).
lda = LDA(RegexpTokenizer(r'\w+'))
#lda.load('../models/ldamodel_50.gensim')
# Load a previously trained model from disk; the file name suggests 100 topics
# and 20 training passes -- TODO confirm against the training run.
lda.load('../models/ldamodel_topics100_trainiter20_train_en.gensim')
conceptualizer = Conceptualizer(lda)
# Last expression of the cell, so the chosen concept is shown as the cell
# output; conceptualize() additionally prints the per-concept scores.
conceptualizer.conceptualize('When was Barack Obama born?', 'Barack Obama')

[('candidate', 7.2788384779448036e-08), ('celebrity', 2.9674991200152272e-10), ('democrat', 3.26039879304097e-08), ('leader', 2.5843048461607351e-07), ('person', 1.1772555810108534e-07), ('personality', 1.7598007182319703e-09), ('politician', 2.0864288599554689e-07), ('president', 3.2535116587479946e-07)]


'president'

In [13]:
# Scratch data: a concept name -> score mapping used below to try out sorted
# iteration. Presumably a raw Probase response for one entity -- not
# established by this notebook; TODO confirm the source.
a = {'elitist': 0.052054794520547946, 'person': 0.1095890410958904, 'prominent black leader': 0.0410958904109589, 'notable liberal': 0.03287671232876712, 'personality': 0.06027397260273973, 'great leader': 0.030136986301369864, 'celebrity': 0.0273972602739726, 'famous person': 0.030136986301369864, 'political leader': 0.049315068493150684, 'satanic asslickers': 0.0273972602739726, 'candidate': 0.0547945205479452, 'democrat': 0.0547945205479452, 'presidential candidate': 0.024657534246575342, 'college student': 0.024657534246575342, 'name': 0.038356164383561646, 'u s politician': 0.0410958904109589, 'world leader': 0.052054794520547946, 'leader': 0.1095890410958904, 'politician': 0.11506849315068493, 'president': 0.024657534246575342}

In [19]:
import collections
# Print every concept with its score, ordered alphabetically by concept name.
for concept_name, concept_score in sorted(a.items()):
    print(concept_name, concept_score)

candidate 0.0547945205479452
celebrity 0.0273972602739726
college student 0.024657534246575342
democrat 0.0547945205479452
elitist 0.052054794520547946
famous person 0.030136986301369864
great leader 0.030136986301369864
leader 0.1095890410958904
name 0.038356164383561646
notable liberal 0.03287671232876712
person 0.1095890410958904
personality 0.06027397260273973
political leader 0.049315068493150684
politician 0.11506849315068493
president 0.024657534246575342
presidential candidate 0.024657534246575342
prominent black leader 0.0410958904109589
satanic asslickers 0.0273972602739726
u s politician 0.0410958904109589
world leader 0.052054794520547946
