In [2]:
import artm

from nltk import DependencyGraph
from nltk.tokenize import RegexpTokenizer
import codecs
import numpy as np
import pandas as pd
import itertools
import re
import os
import pymorphy2
import math
from collections import Counter
from stop_words import get_stop_words
import time
import codecs
import os.path
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer
import sklearn


In [3]:
# Shared analyzer instance; building a MorphAnalyzer is expensive, so it is
# created lazily once and reused by every call to lemmatize_word_list.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared pymorphy2 analyzer, creating it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy2.MorphAnalyzer()
    return _MORPH_ANALYZER


def lemmatize_word_list(word_list):
    """Drop Russian stop words and return the normal form of the other words.

    The stop-word test is applied to each word exactly as given (before
    lemmatization). For every remaining word the first parse returned by
    pymorphy2 is used and its ``normal_form`` is stripped of surrounding
    whitespace.

    Args:
        word_list: iterable of word strings.

    Returns:
        list of str: normal forms, in input order, duplicates preserved.
    """
    lemmatizer = _get_morph_analyzer()
    # A set gives constant-time membership tests instead of scanning a list
    # for every word.
    stop_words = frozenset(get_stop_words('russian'))
    return [
        lemmatizer.parse(word)[0].normal_form.strip()
        for word in word_list
        if word not in stop_words
    ]
  
  
def preprocess(filename, out_filename='lnr_dnr_reg_vw', encoding=None):
    """Convert a labelled text file into one '|text ... |mark ...' record per document.

    Lines equal to a bare newline are skipped. The remaining lines are read
    in pairs: the first line of a pair is the document text, the second one
    must contain ``'|mark '`` followed by the label. For each pair a single
    output line is written::

        <doc index> |text <lemma>:<count> ... |mark <label>

    Args:
        filename: path of the labelled input file.
        out_filename: path of the file to write (default keeps the original
            hard-coded name).
        encoding: text encoding used for both files; ``None`` keeps the
            platform default, as before.

    Raises:
        IndexError: if a label line does not contain ``'|mark '``.
    """
    # NOTE(review): the character class а-яА-Я does not contain 'ё'/'Ё', so
    # words with that letter are split at it — confirm whether that is wanted.
    tokenizer = RegexpTokenizer(r'[а-яА-Я]*')
    # `with` guarantees both files are closed even if a line fails to parse.
    with open(filename, 'r', encoding=encoding) as source, \
            open(out_filename, 'w', encoding=encoding) as target:
        i = 0
        for line in source:
            if line == '\n':
                continue
            if i % 2 == 0:
                # The pattern can match empty strings; drop them.
                words = [x.lower() for x in tokenizer.tokenize(line) if x != '']
                # One counting pass instead of list.count() per distinct word;
                # also makes the token order deterministic (first occurrence).
                counts = Counter(lemmatize_word_list(words))
                target.write(str(i // 2) + ' |text ')
                for word, count in counts.items():
                    target.write(str(word) + ':' + str(count) + ' ')
            else:
                # Always terminate the record, even when the last input line
                # has no trailing newline.
                label = line.split('|mark ')[1].rstrip('\n')
                target.write('|mark ' + label + '\n')
            i += 1

In [4]:

# Convert the labelled corpus into the 'lnr_dnr_reg_vw' file that the next
# cell reads (one '|text ... |mark ...' record per document).
preprocess('lnr_dnr_labelled.txt')

In [5]:
# Copy every record of 'lnr_dnr_reg_vw' into 'news_reg_vw' except those whose
# label equals EXCLUDED_MARK, and collect the kept labels (in file order) in
# `marks` for later evaluation.
# NOTE(review): the meaning of label 9 is not documented in this notebook.
EXCLUDED_MARK = 9

marks = []
# `with` closes both files even if a record cannot be parsed.
with open('lnr_dnr_reg_vw', 'r') as f, open('news_reg_vw', 'w') as t:
    for line in f:
        if not line.strip():
            # Tolerate stray blank lines instead of failing on the split below.
            continue
        m = int(line.split('|mark ')[1])
        if m == EXCLUDED_MARK:
            continue
        t.write(line)
        marks.append(m)

In [20]:
# Input corpus (the file written by the label-filtering cell above) and the
# folder used as target for the converted batches.
data_path = './news_reg_vw'
batches_path = './batches/'

# Build a BatchVectorizer from the Vowpal Wabbit-format corpus with
# batch_size=100 and `batches_path` as the target folder.
# NOTE(review): presumably this is what writes the batch files into
# `batches_path` — confirm against the BigARTM documentation.
batch_vectorizer = artm.BatchVectorizer(data_path=data_path, collection_name='',
                                            data_format='vowpal_wabbit', batch_size = 100, 
                                            target_folder=batches_path)

In [21]:
# Rebind `batch_vectorizer` so that it reads the batch files already present
# in `batches_path` (data_format='batches') and gathers a dictionary as well.
# This is the vectorizer instance used by the fitting cells below.
batch_vectorizer = artm.BatchVectorizer(data_path=batches_path, 
                                         data_format='batches',
                                       gather_dictionary=True)

In [22]:
def topic_model(num_of_topics, num_back, tau, tf):
    """Build an unfitted ARTM model with subject and background topics.

    Topics are named "0" .. str(num_of_topics - 1). The last ``num_back``
    topics are treated as background topics, the leading ones as subject
    topics, and the regularizers are attached to one group or the other.

    Side effects: gathers a dictionary from the module-level ``batches_path``
    and (re)writes ``news_dictionary.dict`` inside that folder.

    Args:
        num_of_topics: total number of topics.
        num_back: number of trailing topics used as background topics.
        tau: coefficient shared by all regularizers (negated for the
            subject-topic Phi regularizer).
        tf: value passed as ``min_tf`` when filtering the dictionary.

    Returns:
        The configured ``artm.ARTM`` instance (not yet fitted).

    Raises:
        ValueError: if ``num_back`` is not in ``[0, num_of_topics]``.
    """
    if not 0 <= num_back <= num_of_topics:
        raise ValueError('num_back must be between 0 and num_of_topics, got %r'
                         % (num_back,))

    class_ids = {
         'text': 1.0
    }

    names_of_topics = [str(x) for x in range(num_of_topics)]

    dictionary_path = batches_path + '/news_dictionary.dict'

    my_dictionary = artm.Dictionary()

    # Remove a dictionary file left over from a previous call before saving.
    if os.path.exists(dictionary_path):
        os.remove(dictionary_path)

    my_dictionary.gather(data_path=batches_path)
    my_dictionary.save(dictionary_path=batches_path + '/news_dictionary')
    my_dictionary.load(dictionary_path=batches_path + '/news_dictionary.dict')

    my_dictionary.filter(min_tf=tf)

    scores_artm = [artm.PerplexityScore(name='PerplexityScore',
                                        dictionary=my_dictionary
                                       ),
                   artm.TopTokensScore(name='TopTokensScore',
                                       topic_names=names_of_topics,
                                       num_tokens=1000,
                                       dictionary=my_dictionary,
                                       class_id='text'
                                      )]

    model = artm.ARTM(num_topics=num_of_topics,
                      cache_theta=True,
                      num_document_passes=1,
                      topic_names=names_of_topics,
                      class_ids=class_ids,
                      scores=scores_artm,
                      dictionary=my_dictionary)

    # Split with an explicit index: negative slices such as [:-0] / [-0:]
    # give the wrong groups when num_back == 0.
    topic_names = list(model.topic_names)
    split = len(topic_names) - num_back
    subject_topics = topic_names[:split]
    background_topics = topic_names[split:]

    # NOTE(review): the model only uses the 'text' modality, while the Phi
    # regularizers below target '@default_class' — confirm that they really
    # affect the 'text' tokens.
    # A group is skipped when it is empty: an empty topic list appears to be
    # interpreted by BigARTM as "all topics" — TODO confirm.
    if subject_topics:
        model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhiRegularizer',
                                                                class_ids=['@default_class'],
                                                                topic_names=subject_topics, tau=-tau))
        model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorRegularizer',
                                                               class_ids=['@default_class'],
                                                               topic_names=subject_topics, tau=tau))
    if background_topics:
        model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SmoothPhiRegularizer',
                                                                class_ids=['@default_class'],
                                                                topic_names=background_topics, tau=tau))
        # Fix: pass the whole background slice. The previous
        # topic_names[-num_back] was a single string, so only one background
        # topic was regularized when num_back > 1.
        model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SparseThetaRegularizer',
                                                                 topic_names=background_topics, tau=tau))
    return model



In [23]:
def map_clusters(y_true, y_pred):
    """Map every true label to the predicted cluster it co-occurs with most.

    Args:
        y_true: sequence of true labels.
        y_pred: sequence of predicted cluster ids, aligned with ``y_true``.

    Returns:
        dict: ``{true_label: predicted_cluster}``. On equal counts the
        cluster met first when iterating ``set(y_pred)`` wins. Several true
        labels may map to the same cluster.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    if len(y_true) != len(y_pred):
        raise ValueError('y_true and y_pred must have the same length')

    # Count every (true, predicted) pair once instead of rescanning all
    # samples for each combination of labels.
    pair_counts = Counter(zip(y_true, y_pred))
    predicted_clusters = set(y_pred)

    m = {}
    for c1 in set(y_true):
        best_count = 0
        for c2 in predicted_clusters:
            cnt = pair_counts[(c1, c2)]
            # Strict comparison keeps the first cluster on ties.
            if cnt > best_count:
                best_count = cnt
                m[c1] = c2
    return m

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def precision_recall(y_true,y_pred):
    """Weighted precision and recall of a clustering against true labels.

    True labels are first translated into cluster ids with ``map_clusters``.
    If two different true labels end up on the same cluster the result is
    ``(0, 0)``; otherwise the translated labels are scored against
    ``y_pred`` with ``average='weighted'``.

    Returns:
        tuple: ``(precision, recall)``.
    """
    label_to_cluster = map_clusters(y_true, y_pred)

    distinct_clusters = set(label_to_cluster.values())
    distinct_labels = set(y_true)
    # Fewer target clusters than labels means the mapping is not one-to-one.
    if len(distinct_clusters) < len(distinct_labels):
        return 0, 0

    translated = np.array([label_to_cluster[label] for label in y_true])
    scorers = (metrics.precision_score, metrics.recall_score)
    return tuple(score(translated, y_pred, average='weighted') for score in scorers)

In [24]:
# Hyper-parameter grid explored by the search cells below.
param = {
    'num_topics': list(range(3, 6)),      # total number of topics
    'num_back': list(range(1, 3)),        # number of background topics
    'min_tf': list(range(1, 4)),          # dictionary min_tf filter
    'tau': list(np.arange(1., 5., 1.)),   # regularization coefficient
}

from sklearn.model_selection import ParameterGrid
from sklearn import cluster

# Grid search: fit one model per parameter combination, cluster the documents
# by their subject-topic weights into 3 clusters and keep the combination
# with the highest weighted precision.
# NOTE(review): KMeans is not seeded here, so results can differ between runs.
# NOTE(review): this assumes the columns of get_theta() are in the same order
# as `marks` — confirm.
pg = ParameterGrid(param)
print(len(pg))

# The labels are the same for every configuration; build the array once.
y_true = np.array(marks)

best = [0,0]
best_p = None  # stays None when no configuration scores above 0

for p in pg:

    model = topic_model(p['num_topics'], p['num_back'],p['tau'],p['min_tf'])
    model.fit_offline(batch_vectorizer, num_collection_passes=30)

    theta = model.get_theta()
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    # Drop the trailing background-topic rows, then put documents in rows.
    X = theta.values[:-p['num_back']].T

    kmeans = sklearn.cluster.KMeans(n_clusters=3).fit(X)
    y_pred = kmeans.labels_
    res = precision_recall(y_true,y_pred)

    if res[0] > best[0]:
        best = res
        best_p = p

print(best)
print(best_p)

72
(0.60058519793459553, 0.57831325301204817)
{'min_tf': 3, 'num_back': 1, 'num_topics': 3, 'tau': 3.0}


In [45]:
def without_zero(y_true, X, n_clusters=2, random_state=None):
    """Drop the samples labelled 0 and cluster the remaining rows with k-means.

    Args:
        y_true: sequence of labels, aligned with the rows of ``X``.
        X: sequence of feature rows.
        n_clusters: number of k-means clusters (default 2, as before).
        random_state: forwarded to ``KMeans``; ``None`` keeps the previous
            unseeded behaviour, an int makes the clustering repeatable.

    Returns:
        tuple: ``(kept_labels, cluster_ids)`` as two numpy arrays of equal
        length.
    """
    kept = [(label, row) for label, row in zip(y_true, X) if not label == 0]
    labels = [label for label, _ in kept]
    rows = [row for _, row in kept]

    kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                    random_state=random_state).fit(rows)
    return np.array(labels), np.array(kmeans.labels_)

In [46]:
# Same grid search as above, but the documents labelled 0 are removed and the
# remaining ones are clustered by without_zero() before scoring.
# NOTE(review): the clustering is not seeded, so results can differ between runs.
# The labels are the same for every configuration; build the array once.
y_true = np.array(marks)

best = [0,0]
best_p = None  # stays None when no configuration scores above 0

for p in pg:

    model = topic_model(p['num_topics'], p['num_back'],p['tau'],p['min_tf'])
    model.fit_offline(batch_vectorizer, num_collection_passes=30)

    theta = model.get_theta()
    # `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    # Drop the trailing background-topic rows, then put documents in rows.
    X = theta.values[:-p['num_back']].T

    # The former 3-cluster KMeans fit was removed: its labels were never used.
    a,b = without_zero(y_true,X)
    res = precision_recall(a,b)

    if res[0] > best[0]:
        best = res
        best_p = p

print(best)
print(best_p)

(0.75862068965517238, 0.75862068965517238)
{'min_tf': 3, 'num_back': 1, 'num_topics': 3, 'tau': 3.0}
