In [1]:
from nltk import DependencyGraph
import codecs
import numpy as np
import pandas as pd
import itertools
import re
import os
import pymorphy2
import math
from collections import Counter
from stop_words import get_stop_words
import time
import codecs
import os.path
from sklearn import metrics

import artm



In [2]:
def get_processed_sentences(conll_file):
    processed_sentences = []
    sentence = []
    for line in codecs.open(conll_file, 'r', 'utf-8'):
        if len(line) == 1:
            processed_sentences.append(sentence)
            sentence = []
        else:
            word = line.split("\t")
            sentence.append(word)
    return processed_sentences

def get_raw_sentences(text_file):
    sentences = []
    for line in codecs.open(text_file, 'r', 'utf-8'):
        sentences.append(line)
    return sentences

In [3]:
def get_deps(processed_sentences):
    deps = []
    for sentence in processed_sentences:
        s = u''
        for line in sentence:
            s += u"\t".join(line) + u'\n'
        deps.append(s)
    return deps

def print_deps_tree(sent_dep):
    graph = DependencyGraph(tree_str=sent_dep)
    for triple in graph.triples():
        for e in triple:
            print(e[0]) if isinstance(e, tuple) else e,
        print
    print
    tree = graph.tree()
    print(tree.pretty_print())

## 1. Finding SPO triplets

In [4]:
# Transforms conll lines into lists:
def get_lists(sent_dep):
    dependencies = []
    pos = []
    tp = []
    words = []
    for t in sent_dep.split('\n'):
        if len(t) > 1:
            splt = t.split('\t')
            dependencies.append(int(splt[6]) - 1)
            pos.append(splt[3])
            tp.append(splt[7])
            words.append(splt[1])
            
    for i in range(len(tp)):
        # Find 'and' sequences
        if tp[i] == 'conj' and pos[i] == 'VERB':
            ids = [x for x in range(len(tp)) if dependencies[x] == dependencies[i] and tp[x] == 'nsubj'] 
            for j in ids:
                words.append(words[j])
                pos.append(pos[j])
                tp.append(tp[j])
                dependencies.append(i)
        elif tp[i] == 'conj' and pos[i] != 'VERB':
            dep = dependencies[i]
            pos[i] = pos[dep]
            dependencies[i] = dependencies[dep]
            tp[i] = tp[dep]
            
        # Find complex verbs
        if tp[i] in ['xcomp','dep']:
            dep = dependencies[i]
            words[dep] = words[dep] + ' ' + words[i]
            ids = [x for x in range(len(tp)) if dependencies[x] == i]
            for j in ids:
                dependencies[j] = dep
            pos[dep] = u'VERB'
            pos[i] = 'ADD_VERB'
            tp[i] = 'ADD_VERB'
            
        # Adjective triplets
        if tp[i] == 'ADJ' and pos[dependencies[i]] == 'VERB':
            dep = dependencies[i]
            words[dep] = words[dep]+' '+words[i]
        
        # Determine negative verbs
        if tp[i] == u'neg':
            dep = dependencies[i]
            words[dep] = words[i]+' '+words[dep]
        
        # Substitude words with their names if present
        if tp[i] == u'name':
            dep = dependencies[i]
            words[dep] = words[i]

#         if u'котор' in words[i]:
#             dep = int(dependencies[i]) - 1
#             words[i] = words[dep]
#             print words[i]
    return words, pos, dependencies, tp
            
                
# Find triplets in conll processed form        
def get_triplets(processed_sentence):
    triplets = []
    sent_dep = u''
    for line in processed_sentence:
        sent_dep += u"\t".join(line) + u'\n'
    words, pos, dependencies, tp = get_lists(sent_dep)
    
    ids = range(len(words))
    
    # regular triplets
    verbs = [x for x in ids if pos[x] == u'VERB' and tp[x] != 'amod']
    for i in verbs:
        verb_subjects = [words[x] for x in ids if tp[x] in ['nsubj','nsubjpass'] and dependencies[x] == i]
        if len(verb_subjects) == 0:
            verb_subjects.append(u'imp')
        verb_objects = [words[x] for x in ids if tp[x] == 'dobj' and dependencies[x] == i]
        if len(verb_objects) == 0:
            verb_objects.append(u'imp')
        for subj, obj in itertools.product(verb_subjects, verb_objects):
            triplets.append([subj, words[i], obj])
       
    # participle triplets
    participles = [x for x in ids if pos[x] == u'VERB' and tp[x] == 'amod']
    for i in participles:
        participle_subjects = [words[x] for x in ids if dependencies[i] == x]
        if len(participle_subjects) == 0:
            participle_subjects.append(u'imp')
        participle_objects = [words[x] for x in ids if tp[x] == 'dobj' and dependencies[x] == i]
        if len(participle_objects) == 0:
            participle_objects.append(u'imp')
        for subj, obj in itertools.product(participle_subjects, participle_objects):
            triplets.append([subj, words[i], obj])
            
    # implicit noun-noun triplets
    appos = [x for x in ids if tp[x] == u'appos']
    for i in appos:
        obj = words[dependencies[i]]
        triplets.append([words[i], u'есть', obj])

                
    #adjectives triplets
    adjectives = [x for x in ids if pos[x] == 'ADJ' and tp[x] == 'amod']
    for adj in adjectives:
        triplets.append([words[dependencies[adj]], u'есть', words[adj]])
    return triplets


def print_triplets(triplets_list):
    for i, triplet in enumerate(triplets_list):
        print(str(i + 1), '(' + triplet[0],', ', triplet[1],', ', triplet[2] + ')')

In [5]:
# Preprocess raw text for syntaxnet input
def syntaxnet_preprocess(filename):
    f = codecs.open(filename + '.txt', 'r')
    t = open(filename + '_prepared.txt','w')
    for line in f.readlines():
        line = re.sub(r'([.,!?()])', r' \1 ', line)
        line = re.sub('  ',' ',line)
        line = re.sub('«', '', line)
        line = re.sub('»', '', line)
        line = re.sub('"', '', line)
        line = re.sub('-', '', line)
        
        line = line.replace(r'. ', '.\n')
        t.write(line)
        

def run_syntaxnet(textfile, conllfile):
    command = "cat " + textfile + " | docker run --rm -i inemo/syntaxnet_rus > " + conllfile
    os.system(command)
    
# Get triplets from text doc or conll doc    
def get_doc_triplets(filename, conll = False):
    if conll == False: 
        syntaxnet_preprocess(filename)
        run_syntaxnet(filename + '_prepared.txt', filename + '.conll')
    processed_sentences = get_processed_sentences(filename + '.conll')
    text_triplets = []
    for sent in processed_sentences:
        text_triplets.extend(get_triplets(sent))
    return text_triplets

# Extract all subjects from triplet list
def subjects_from_triplets(triplet_list):
    stop_words = get_stop_words('russian')
    return [x[0] for x in triplet_list if x[0] != u'imp' and x[0] not in stop_words]

# Extract all objects from triplet list
def objects_from_triplets(triplet_list):
    stop_words = get_stop_words('russian')
    return [x[2] for x in triplet_list if x[2] != u'imp' and x[2] not in stop_words]


def get_subjects_from_triplet_lists(triplet_lists):
    subject_lists = []
    for triplets in triplet_lists:
        subject_lists.append(subjects_from_triplets(triplets))
    return subject_lists

# Lemmatize each triplet in triplet list
def lemmatize_triplet_list(triplet_list):
    lemmatizer = pymorphy2.MorphAnalyzer()
    stop_words = get_stop_words('russian')
    for i, triplet in enumerate(triplet_list):
        triplet_list[i] = [lemmatizer.parse(token)[0].normal_form.strip()
                           for token in triplet]


## 2. Topic modeling with labelled data

In [6]:
def prepare_vw(exists = True):
    marks = []
    
    if exists == True:
        f = open('news_vw','r')
        for line in f.readlines():
            marks.append(int(line.split('|mark ')[-1]))
        f.close()
        return marks
    
    
    output_file = open('news_vw', 'w')

    f = codecs.open('lnr_dnr_labelled.txt','r')
    i = 0
    j = -1
    for line in f.readlines():
        if line == '\n':
            continue
        j+=1
        
        if j%2 == 0:
            text = line.split('|text')[1]
            num = line.split('|text')[0]
            textfile = codecs.open('lnr_dnr_text.txt','w')
            textfile.write(text)
            textfile.close()

            triplets = get_doc_triplets('lnr_dnr_text', conll = False)
            lemmatize_triplet_list(triplets)
            subjects = subjects_from_triplets(triplets)
            objects = objects_from_triplets(triplets)
            
        else:
            mark = line.split('|mark')[1]
            if mark != '  9\n':
                marks.append(int(mark))
                output_file.write(str(i + 1) + " |subjects ")
                for subject in set(subjects):
                    if subject == u'—':
                        continue
                    subject = re.sub(':', '', subject)
                    output_file.write(' ' + subject.lower().encode('utf8'))
                    output_file.write(':' + str(subjects.count(subject)))
                    
                output_file.write(" |objects ")
                for obj in set(objects):
                    if obj == u'—':
                        continue
                    obj = re.sub(':', '', obj)
                    output_file.write(' ' + obj.lower().encode('utf8'))
                    output_file.write(':' + str(objects.count(obj)))
                    

                output_file.write(" |mark " + mark)
            i+=1

        


    output_file.close()
    return marks

In [7]:
f = open('news_vw','r')
t = open('lnr_dnr_vw','w')
i=0
for line in f.readlines():
    t.write(str(i)+' |subjects'+line.split('|subjects')[1])
    i+=1
t.write('\n')
f.close()
t.close()

In [21]:
marks = prepare_vw(exists = True)

In [22]:
data_path = './lnr_dnr_vw'
batches_path = './batches/'

batch_vectorizer = artm.BatchVectorizer(data_path=data_path, collection_name='',
                                            data_format='vowpal_wabbit', batch_size = 100, 
                                            target_folder=batches_path)

In [23]:
batch_vectorizer = artm.BatchVectorizer(data_path=batches_path, 
                                         data_format='batches',
                                       gather_dictionary=True)

In [30]:
def topic_model(num_of_topics, num_back, tau, tf):
    class_ids = {
         'subjects': 0.7,
         'objects':0.3
    }

    names_of_topics = [str(x) for x in range(num_of_topics)]

    dictionary_path=batches_path + '/news_dictionary.dict'

    my_dictionary = artm.Dictionary()

    if os.path.exists(dictionary_path):
        os.remove(dictionary_path)
        
    my_dictionary.gather(data_path=batches_path)
    my_dictionary.save(dictionary_path=batches_path + '/news_dictionary')
    my_dictionary.load(dictionary_path=batches_path + '/news_dictionary.dict')

    my_dictionary.filter(min_tf=tf)

    scores_artm = [artm.PerplexityScore(name='PerplexityScore', 
                                        dictionary=my_dictionary,class_ids=class_ids
                                       ),
                   artm.TopTokensScore(name='TopTokensScore', 
                                       topic_names=names_of_topics, 
                                       num_tokens=1000, 
                                       dictionary=my_dictionary,
                                       class_id='text'
                                      )]

    model = artm.ARTM(num_topics=num_of_topics,
                      #reuse_theta=True,
                      cache_theta=True,
                      num_document_passes=1,
                      topic_names=names_of_topics,
                      class_ids=class_ids, 
                      scores=scores_artm,
                      #regularizers=regularizers_artm,
                      dictionary=my_dictionary)


    model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhiRegularizer',
                                                            class_ids=['@default_class'],
                                                            topic_names=model.topic_names[:-num_back],tau = -tau))
    model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SmoothPhiRegularizer',
                                                            class_ids=['@default_class'],
                                                            topic_names=model.topic_names[-num_back:],tau = tau))


    model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorRegularizer',
                                                          class_ids=['@default_class'],
                                                          topic_names=model.topic_names[:-num_back], tau=tau*10))
    model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseThetaRegularizer',
                                                            topic_names=model.topic_names[-num_back], tau = tau))
    return model


In [12]:
def map_clusters(y_true, y_pred):
    m = {}
    clusters = set(y_true)
    for c1 in clusters:
        cnt1 = 0
        for c2 in set(y_pred): 
            
            cnt = 0
            for (x,y) in zip(y_true,y_pred):
                if (x==c1) & (y==c2):
                    cnt+=1
            if cnt>cnt1:
                cnt1 = cnt
                res = c2
        m[c1] = res
    return m

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
def precision_recall(y_true,y_pred):
    m = map_clusters(y_true,y_pred)
    if len(set(m.values()))<len(set(y_true)):
        return 0,0
    y_true = np.array([m[x] for x in y_true])
    precision = metrics.precision_score(y_true,y_pred,average='weighted')
    recall = metrics.recall_score(y_true,y_pred,average='weighted')
    return precision,recall

In [13]:
import sklearn.cluster
def without_zero(y_true,X):
    a, b = [],[]
    for (x,y) in zip(y_true,X):
        if x ==0:
            continue
        a.append(x)
        b.append(y)

    kmeans = sklearn.cluster.KMeans(n_clusters=2).fit(b)
    return np.array(a),np.array(kmeans.labels_)



In [14]:
param = {}
param['num_topics'] = [x for x in range(3,6)]
param['num_back'] = [x for x in range(1,3)]
param['min_tf'] = [x for x in range(1,4)]
param['tau'] = [x for x in np.arange(1.,10.,2.)]

from sklearn.model_selection import ParameterGrid

pg = ParameterGrid(param)
print(len(list(pg)))

best = [0,0]

for p in list(pg):
    
    model = topic_model(p['num_topics'], p['num_back'],p['tau'],p['min_tf'])
    model.fit_offline(batch_vectorizer, num_collection_passes=30)
    
    theta = model.get_theta()
    X = theta.as_matrix()[:-p['num_back']].T

    kmeans = sklearn.cluster.KMeans(n_clusters=3).fit(X)
    y_pred = kmeans.labels_
    y_true = np.array(marks)
    
    res = precision_recall(y_true,y_pred)
    
    
    if res[0] > best[0]:
        best = res
        best_p = p
    
print(best)
print(best_p)

90
[0, 0]


NameError: name 'best_p' is not defined

In [31]:
model = topic_model(5,1,1,3)
model.fit_offline(batch_vectorizer, num_collection_passes=5)

In [22]:
phi = model.get_phi()
for t in ['0','1']:
    top = phi.sort_values([t],ascending=False)[:10]
    print(top[t])

управление          0.110964
внешний             0.100842
завод               0.094606
украинский          0.075484
власть              0.042637
работа              0.041684
металлургический    0.041324
предприятие         0.040541
глава               0.039139
республика          0.038746
Name: 0, dtype: float32
блокада              0.097436
республика           0.090013
область              0.079313
самопровозгласить    0.066125
луганский            0.059843
управление           0.051056
внешний              0.044955
песок                0.044756
донецкий             0.044178
украинский           0.031704
Name: 1, dtype: float32


In [26]:
best = [0,0]

for p in list(pg):
    
    model = topic_model(p['num_topics'], p['num_back'],p['tau'],p['min_tf'])
    model.fit_offline(batch_vectorizer, num_collection_passes=30)
    
    theta = model.get_theta()
    X = theta.as_matrix()[:-p['num_back']].T

    kmeans = sklearn.cluster.KMeans(n_clusters=3).fit(X)
    y_pred = kmeans.labels_
    y_true = np.array(marks)
    
    
    a,b = without_zero(y_true,X)
    res = precision_recall(a,b)
    
    if res[0] > best[0]:
        best = res
        best_p = p
    
print(best)
print(best_p)

(0.77386569872958244, 0.77586206896551724)
{'min_tf': 1, 'num_back': 1, 'num_topics': 4, 'tau': 1.0}


In [32]:
model.score_tracker['PerplexityScore'].value

[0.0, 0.0, 0.0, 0.0, 0.0]