In [2]:
import os

import pandas as pd
from gensim.models import AuthorTopicModel

In [3]:
# Load the saved author-topic model from its on-disk location.
# NOTE(review): gensim model files are presumably pickle-based — only load
# files from a trusted source.
MODEL_PATH = './model/model.atm'
model = AuthorTopicModel.load(MODEL_PATH)

In [4]:
# Collect the top words of 30 topics selected by the model.
#
# The previous version asked show_topics() for 20 formatted words per topic,
# discarded them, and re-queried every topic through show_topic() with its
# default size, so only 10 words per topic ever reached `topics`. One
# unformatted call yields the same 10 words directly and keeps the ids.
TOPICS_SHOWN = 30
WORDS_PER_TOPIC = 10

selected_topics = model.show_topics(
    num_topics=TOPICS_SHOWN,
    num_words=WORDS_PER_TOPIC,
    formatted=False,  # (topic_id, [(word, prob), ...]) instead of a string
)

# topic_ids[i] is the model's topic id for the word list in topics[i].
topic_ids = [topic_id for topic_id, _ in selected_topics]
topics = [[word for word, _ in word_probs] for _, word_probs in selected_topics]

In [5]:
# Build a frame with one row per topic and one column per word rank.
# NOTE: the labels are 1-based positions within `topics`, NOT the model's own
# topic ids (e.g. the words of model topic 21 appear under "Topic 6").
column_names = [f'Topic {position}' for position in range(1, len(topics) + 1)]
df = pd.DataFrame(topics, index=column_names)

In [6]:
# Flip the frame so each topic becomes a column and each row a word rank.
df.T

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 21,Topic 22,Topic 23,Topic 24,Topic 25,Topic 26,Topic 27,Topic 28,Topic 29,Topic 30
0,amp,time,people,good,dog,labour,school,government,think,restaurant,...,excited,wine,think,work,think,good,life,france,tax,beer
1,school,good,country,jade,phone,party,university,issue,feel,food,...,series,cheap,people,people,good,like,product,english,pay,scotland
2,area,year,british,company,library,union,student,point,time,uk,...,like,half,thing,child,know,jackson,personally,result,work,scottish
3,world,like,like,smoke,realise,tax,teacher,result,relationship,curry,...,light,italian,good,life,like,know,apple,god,people,australia
4,art,come,world,card,house,public,job,measure,people,serve,...,episode,travel,right,year,article,film,previous,pinch,government,flag
5,theatre,great,live,research,norway,conservative,education,suggest,like,drink,...,ask,madrid,point,need,thing,want,desire,ago,money,shop
6,film,work,friend,book,fiddle,work,state,case,need,different,...,bring,decent,way,woman,read,yes,usually,french,local,snp
7,future,day,believe,anti,poster,vote,system,way,know,indian,...,set,italy,want,time,write,man,creative,heart,need,drink
8,play,simply,find,read,hr,democratic,place,political,work,pudding,...,mean,seven,comment,way,love,book,understand,sorry,public,w
9,education,half,let,die,gym,cut,course,analysis,find,water,...,remember,brighton,reason,want,man,look,lol,perfect,year,charity_shop


In [7]:
def print_author_topics(model, author_id):
    """Print an author's documents and topic weights, strongest topic first.

    Args:
        model: Object exposing an ``author2doc`` mapping and supporting
            ``model[author_id]``, which must yield (topic, value) pairs.
        author_id: Key used for both the ``author2doc`` lookup and the
            ``model[...]`` lookup.

    Returns:
        None. Everything is written to stdout.
    """
    print(f'AuthorID: {author_id}')

    doc_ids = model.author2doc[author_id]
    print(f'Num Docs: {len(doc_ids)}')
    print(f'Docs: {doc_ids}')

    # Tabulate the (topic, value) pairs and order them by descending value.
    topic_weights = pd.DataFrame(
        model[author_id], columns=['Topic', 'Value']
    ).sort_values(by='Value', ascending=False)
    print(topic_weights)

In [8]:
# Documents and topic weights for author '7083'.
print_author_topics(model, author_id='7083')

AuthorID: 7083
Num Docs: 21
Docs: [705, 2541, 2542, 4221, 4895, 6616, 7982, 8491, 9763, 14035, 14500, 14501, 14503, 14504, 15539, 16307, 16308, 16929, 16935, 16936, 17981]
    Topic     Value
0       1  0.460414
6      29  0.113831
13     49  0.089854
1       3  0.061330
2       4  0.057731
11     41  0.046992
10     40  0.033180
5      24  0.030803
9      37  0.023997
3      11  0.020567
4      13  0.016553
8      36  0.013816
12     47  0.012908
7      35  0.011155


In [9]:
# Documents and topic weights for author '10607'.
print_author_topics(model, author_id='10607')

AuthorID: 10607
Num Docs: 29
Docs: [1223, 2520, 2532, 3044, 4387, 4388, 4553, 4853, 4865, 4866, 4870, 6602, 6603, 6604, 6605, 9738, 11392, 11394, 14509, 14510, 14511, 14512, 14513, 14516, 15542, 17443, 18224, 19213, 19214]
    Topic     Value
9      29  0.226553
0       1  0.134738
1       2  0.086829
13     47  0.085344
12     45  0.072668
4      10  0.068642
8      28  0.067985
3       6  0.052782
11     41  0.049497
6      23  0.045592
10     36  0.028994
2       5  0.028114
7      24  0.019381
5      21  0.014081


In [10]:
# Location of the standardized comments CSV. Set the COMMENTS_CSV environment
# variable to point at your own copy; the fallback is the original author's
# machine-specific path and will not exist elsewhere.
COMMENTS_CSV = os.environ.get(
    'COMMENTS_CSV',
    '/Volumes/Samsung_T5/Documents/Uni/text_mining/part_users/converted_selected_sorted_comments-standardized.csv',
)
df_comments = pd.read_csv(COMMENTS_CSV)

In [11]:
# Entries the model's author2doc mapping holds for the author under inspection.
inspected_author = '7083'
author_comments = model.author2doc[inspected_author]

In [12]:
# Show full comment text. None means "no column-width limit"; the former -1
# sentinel is deprecated in pandas 1.x and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Select the text of every comment whose comment_id is listed for the author,
# using a single .loc lookup instead of chained indexing.
df_comments.loc[df_comments['comment_id'].isin(author_comments), 'comment_text']

705      Any substitute for gammon? Not sure they cure Pork that way where I am.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [15]:
# Formatted 'prob*"word" + ...' summary of the 20 top words of topic 21.
inspected_topic = 21
model.print_topic(inspected_topic, topn=20)

'0.034*"labour" + 0.024*"party" + 0.020*"union" + 0.015*"tax" + 0.015*"public" + 0.014*"conservative" + 0.014*"work" + 0.013*"vote" + 0.012*"democratic" + 0.012*"cut" + 0.012*"worker" + 0.012*"left" + 0.011*"election" + 0.011*"care" + 0.010*"wing" + 0.009*"government" + 0.009*"hard" + 0.008*"cameron" + 0.008*"left_wing" + 0.008*"opposition"'

In [30]:
def print_topic_df(model, topic_id, topn=20):
    """Return the top words of a topic as a DataFrame.

    The words and probabilities are read from ``model.show_topic`` rather
    than parsed out of the formatted ``print_topic`` string. Parsing the
    string produced probabilities as rounded text, left stray whitespace
    around every token, and broke on words containing ``+`` or ``*``.

    Args:
        model: Topic model exposing ``show_topic(topic_id, topn=...)`` that
            returns (word, probability) pairs.
        topic_id: Id of the topic to tabulate.
        topn: Number of top words to include.

    Returns:
        pandas.DataFrame with columns ``prob`` (numeric) and ``word``, one
        row per word in the order returned by the model. The frame is empty
        (but keeps both columns) when the model returns no words.
    """
    word_probs = model.show_topic(topic_id, topn=topn)
    # show_topic yields (word, prob); keep the established (prob, word) layout.
    rows = [(prob, word) for word, prob in word_probs]
    return pd.DataFrame(rows, columns=['prob', 'word'])

In [31]:
# Tabulate the top words of topic 21 (default number of words).
print_topic_df(model, topic_id=21)

Unnamed: 0,prob,word
0,0.034,labour
1,0.024,party
2,0.02,union
3,0.015,tax
4,0.015,public
5,0.014,conservative
6,0.014,work
7,0.013,vote
8,0.012,democratic
9,0.012,cut
