In [5]:
import pandas as pd
import numpy as np


import artm

In [6]:
# Data directory handed as `data_path` to both the batch vectorizer and the
# dictionary created in the following cells.
path = 'urnuuid3edd5574-2a35-11e8-a845-60f81daa9f8a'

In [7]:
# Wrap the data found under `path` for BigARTM; the 'batches' format is
# requested explicitly.
batch_vectorizer = artm.BatchVectorizer(
    data_path=path,
    data_format='batches',
)

In [8]:
# Create an empty BigARTM dictionary and fill it by gathering from the same
# data directory that the batch vectorizer reads.
dictionary = artm.Dictionary()
dictionary.gather(data_path=path)

In [9]:
# 45-topic ARTM model; the TopTokensScore is registered under the name
# 'top_tokens_score' so it can be looked up in the score tracker afterwards.
model = artm.ARTM(
    num_topics=45,
    dictionary=dictionary,
    scores=[artm.TopTokensScore(name='top_tokens_score')],
)
# Offline fitting with 10 passes over the collection.
model.fit_offline(batch_vectorizer, num_collection_passes=10)

In [10]:
# Tracker entry for the score registered as 'top_tokens_score' on the model.
top_tokens = model.score_tracker['top_tokens_score']

In [11]:
# Print every topic followed by its top tokens and their weights,
# one "token - weight" pair per line.
tokens_by_topic = top_tokens.last_tokens
weights_by_topic = top_tokens.last_weights
for name in model.topic_names:
    print('\n', name)
    pairs = zip(tokens_by_topic[name], weights_by_topic[name])
    for tok, w in pairs:
        print(tok, '-', w)


 topic_0
habrahabr post - 0.0347822941839695
https habrahabr post - 0.03297479823231697
https habrahabr - 0.029462352395057678
зада вопрос - 0.026467572897672653
какого нибуд - 0.023120654746890068
анализ данных - 0.021854152902960777
понял спасиб - 0.015486984513700008
тестов задан - 0.014950715005397797
задач реша - 0.013038813136518002
https meduza - 0.011857831850647926

 topic_1
очен хорош - 0.05261579528450966
больш част - 0.04552178084850311
последн врем - 0.030232027173042297
очен сильн - 0.030165808275341988
https docs - 0.026210611686110497
https blog - 0.019763341173529625
нужн тольк - 0.018047692254185677
главное чтоб - 0.017531875520944595
docs google - 0.015988724306225777
https docs google - 0.015728456899523735

 topic_2
того чтоб - 0.05700388550758362
можн взят - 0.03767002373933792
чтоб можн - 0.03447531908750534
каком нибуд - 0.02339722029864788
прощ всег - 0.01811327412724495
какое нибуд - 0.015097794122993946
правильн понима - 0.013379913754761219
когд можн - 0.01

In [12]:
# Load the cleaned message dump, keeping only the three columns that are
# used further down.
df_clean = pd.read_csv(
    '../../data/ods_dump/clean_message.csv',
    usecols=['user', 'text', 'stem_text'],
)
df_clean.head(3)

Unnamed: 0,user,text,stem_text
0,U1UMQM200,<@U1Z2QA4EM> как избавиться от рекурсии?,u1z2qa4em избав рекурсии
1,U1Z2QA4EM,<@U1UMQM200>: избавиться от искушения - это ка...,u1umqm200 избав искушен контрольн выстрел голов
2,U09JEC7V0,<@U1Z2QA4EM> в психотерапию умеешь?,u1z2qa4em психотерап умеешь


In [13]:
# Run the fitted model over the batches and transpose the result so that the
# topics end up as columns.
topics_df = model.transform(batch_vectorizer=batch_vectorizer).transpose()

In [14]:
# Attach the topic weights to the messages by joining the two frames on their
# row index (pd.merge defaults to an inner join, so only index values present
# in both frames survive).
merge = pd.merge(df_clean, topics_df, left_index=True, right_index=True)
# Preview only: rendering the complete frame bloats the notebook and hides
# the narrative.
merge.head(10)

Unnamed: 0,user,text,stem_text,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,...,topic_35,topic_36,topic_37,topic_38,topic_39,topic_40,topic_41,topic_42,topic_43,topic_44
0,U1UMQM200,<@U1Z2QA4EM> как избавиться от рекурсии?,u1z2qa4em избав рекурсии,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
1,U1Z2QA4EM,<@U1UMQM200>: избавиться от искушения - это ка...,u1umqm200 избав искушен контрольн выстрел голов,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
2,U09JEC7V0,<@U1Z2QA4EM> в психотерапию умеешь?,u1z2qa4em психотерап умеешь,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
3,U1Z2QA4EM,<@U09JEC7V0>: ох уж этот реверс в аметисты сос...,u09jec7v0 этот реверс аметист соснул пришел пр...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
4,U065VP6F7,<@U1Z2QA4EM> может ты у мамки психолог?,u1z2qa4em может мамк психолог,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,U1Z2QA4EM,"<@U065VP6F7>: мамки ты насрал, пока жил там ка...",u065vp6f7 мамк насрал кажд может полезет получ,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,U09JEC7V0,<@U1Z2QA4EM> всегда с тобой приятно поговорить),u1z2qa4em всегд приятн поговорить,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
7,U1Z2QA4EM,<@U09JEC7V0>: с тобой всегда находится у власт...,u09jec7v0 всегд наход власти способств развит ...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
8,U2AD078S1,<@U1Z2QA4EM> как лечится такое заболевание?,u1z2qa4em заболевание,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222
9,U1Z2QA4EM,<@U2AD078S1>: такое как раз в 12 лет сиськи бы...,u2ad078s1 сиськ друзья обосра,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,...,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222,0.022222


In [15]:
# Take the 'topic_14' column of the model's phi matrix for closer inspection.
theme = model.get_phi()['topic_14']

In [16]:
# Show the 15 entries with the largest weight in the selected topic.
theme.sort_values(ascending=False).head(15)

train test      0.031684
rand forest     0.026708
сраз посл       0.023891
https news      0.017541
news 2017       0.016590
тольк начина    0.014777
может стат      0.014609
сдела вывод     0.014572
спасиб ответ    0.014561
конц конц       0.012421
blog 2017       0.010905
когд начина     0.010809
кажд слов       0.010359
втор вопрос     0.010141
плат налог      0.009646
Name: topic_14, dtype: float32

In [17]:
# Fold topic_20 into topic_14. The sum is taken from `topics_df` (aligned to
# `merge` on the shared index) instead of from `merge` itself, so re-running
# this cell no longer adds topic_20 a second time.
# NOTE(review): relies on the index of `topics_df` being unique - confirm.
merge['topic_14'] = topics_df['topic_14'] + topics_df['topic_20']
# Hard-coded mapping from model topic names to human-readable labels.
# Kept as an ordered list of pairs so that the label dictionary and the list
# of selected topic columns are derived from a single place.
TOPIC_LABELS = [
    ('topic_4', 'Computer vision'),
    ('topic_6', 'Feature engineering'),
    ('topic_7', 'Логистическая регрессия'),
    ('topic_8', 'Линейная регерессия'),
    ('topic_9', 'Coursera'),
    ('topic_10', 'Software engineering'),
    ('topic_12', 'OpenDataScience'),
    # ('topic_14', 'Random Forest'),  # left disabled, as in the original lists
    ('topic_18', 'Kaggle'),
    ('topic_19', 'Neural Networks'),
    ('topic_22', 'Градиентный бустинг'),
    ('topic_23', 'Задача Классификации'),
    ('topic_25', 'Loss functions'),
    ('topic_26', 'Reinforcement learning'),
    ('topic_31', 'Sklearn'),
    ('topic_36', 'Deep learning'),
]

# topic name -> label, used to rename the columns.
voc_topic = dict(TOPIC_LABELS)

# Topic columns to keep, in the order listed above.
cols = [topic for topic, _label in TOPIC_LABELS]
    

In [18]:
# Mean topic weight per user, restricted to the labelled topic columns.
# Passing `values=cols` keeps pivot_table from trying to average the text
# columns as well: older pandas silently dropped them, recent pandas raises
# a TypeError. The trailing `[cols]` restores the column order of `cols`.
users_interests = (
    pd.pivot_table(merge, index='user', values=cols, aggfunc='mean')[cols]
    .rename(columns=voc_topic)
)
# Keep the user id as a regular column too, so it can serve as id_vars when melting.
users_interests['user_id'] = users_interests.index

In [19]:
# Reshape to long format: one row per (user_id, topic label) with its value.
melt = users_interests.melt(id_vars=['user_id'])
# Rows of a single user, largest value first.
melt.query("user_id == 'U4MKDSYD9'").sort_values(by='value', ascending=False)

Unnamed: 0,user_id,variable,value
26147,U4MKDSYD9,OpenDataScience,0.02649
58307,U4MKDSYD9,Deep learning,0.024175
14087,U4MKDSYD9,Линейная регерессия,0.023808
30167,U4MKDSYD9,Kaggle,0.023487
54287,U4MKDSYD9,Sklearn,0.022768
2027,U4MKDSYD9,Computer vision,0.022678
38207,U4MKDSYD9,Градиентный бустинг,0.022458
46247,U4MKDSYD9,Loss functions,0.021843
18107,U4MKDSYD9,Coursera,0.021238
6047,U4MKDSYD9,Feature engineering,0.021118


In [20]:
# Wide-format row of a single user, selected by index value.
users_interests.loc[users_interests.index == 'U34Q3KU8H']

Unnamed: 0_level_0,Computer vision,Feature engineering,Логистическая регрессия,Линейная регерессия,Coursera,Software engineering,OpenDataScience,Kaggle,Neural Networks,Градиентный бустинг,Задача Классификации,Loss functions,Reinforcement learning,Sklearn,Deep learning,user_id
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
U34Q3KU8H,0.022594,0.021786,0.020362,0.023832,0.019322,0.021227,0.02179,0.023556,0.017874,0.022101,0.025771,0.022191,0.019028,0.02025,0.027655,U34Q3KU8H


In [21]:
# Quick look at the first rows of the long-format frame.
melt.head(3)

Unnamed: 0,user_id,variable,value
0,U040HKJE7,Computer vision,0.022242
1,U040M0W0S,Computer vision,0.020812
2,U041LH06L,Computer vision,0.021636


In [22]:
# For every user keep the 10 rows with the largest value, sorted descending.
# NOTE(review): the output file name says "top_5" but 10 rows per user are
# kept - confirm which of the two is intended.
melt_v2 = (
    melt
    .groupby(by='user_id')
    .apply(lambda user_rows: user_rows.sort_values(by='value', ascending=False).head(10))
)

# Persist only the user id and the topic label; the row index is not written.
melt_v2[['user_id', 'variable']].to_csv('topic_top_5_user.csv', index=False)

In [25]:
# Preview only: the grouped frame holds up to ten rows per user, so rendering
# it in full floods the notebook output.
melt_v2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,variable,value
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U040HKJE7,36180,U040HKJE7,Градиентный бустинг,0.025345
U040HKJE7,12060,U040HKJE7,Линейная регерессия,0.023519
U040HKJE7,56280,U040HKJE7,Deep learning,0.023444
U040HKJE7,24120,U040HKJE7,OpenDataScience,0.022625
U040HKJE7,20100,U040HKJE7,Software engineering,0.022516
U040HKJE7,0,U040HKJE7,Computer vision,0.022242
U040HKJE7,52260,U040HKJE7,Sklearn,0.022214
U040HKJE7,28140,U040HKJE7,Kaggle,0.021657
U040HKJE7,40200,U040HKJE7,Задача Классификации,0.021495
U040HKJE7,4020,U040HKJE7,Feature engineering,0.021215
