# Assigning Topics to Documents

*This notebook computes topic models from documents and assigns the strongest topic to each document.*

In [1]:
import json
import nlp

dataset = 'asm' # lss/asm
input_file = '../data/lss-17k.json'
output_file = '../data/lss-17k-topics.json'

if dataset == 'asm':
    input_file = '../data/lss_asm_dataset.txt'
    output_file = '../data/lss-asm-topics.json'

####  Load data

In [2]:
def load(filename):
    'A text file with each line as a JSON text.'
    p = parse_asm if dataset == 'asm' else parse_lss
    with open(filename) as file:
        sessions = [p(line) for line in file]
    return [s for s in sessions if s]

def parse_asm(text):
    'Return the list of actions from the given JSON text.'
    obj = json.loads(text)
    if 'actions' in obj and obj['actions'] and len(obj['actions']) <= 150: # limit max length to sync with the vis
        action = [a['name'][6:] for a in obj['actions']] #  [6:] to exclude the prefix 'Action'
        return { 'id': obj['session']['dcxID'], 'action': action }

def parse_lss(text):
    'Return the list of actions from the given JSON text.'
    obj = json.loads(text)
    if 'actionsQueue' in obj and obj['actionsQueue'] and obj['userId'] != 'none' and obj['officeId'] != 'none' and obj['orga'] != 'none':
        action = [a[6:] for a in obj['actionsQueue']] #  [6:] to exclude the prefix 'Action'
        return { 'id': obj['PFX'], 'action': action }

In [3]:
data = load(input_file)
docs = [d['action'] for d in data]
len(data), data[0]

(18957,
 {'action': ['SearchUsr',
   'SearchUsr',
   'DisplayOneUser',
   'UpdateUserDetails',
   'TabBar',
   'SearchUsr',
   'DisplayOneUser',
   'UpdateUserDetails',
   'TabBar',
   'TabBar'],
  'id': '2LK1J5ID313Q1BWKN##NAGHU91'})

#### Compute topics with LDA

In [4]:
lda = nlp.build_lda(docs, num_topics=10)
lda.print_topics()

[(0,
  '0.341*"DisplayOneUser" + 0.137*"CloseTab" + 0.128*"CreateLoginArea" + 0.120*"AddNewLoginArea" + 0.092*"UpdateUserDetails" + 0.042*"NotifyDisplayedUser" + 0.031*"EditLoginArea" + 0.025*"DeleteLoginArea" + 0.023*"UpdateLogAreaIndic" + 0.019*"ResetPwdDisplayed"'),
 (1,
  '0.205*"ChooseOrganization" + 0.198*"FilterByApp" + 0.181*"FilterByAppACL" + 0.179*"DisplayOneUser" + 0.041*"AuthenticationHistorySearch" + 0.031*"Cancel" + 0.026*"FilterUser" + 0.022*"DuplicateToExistingUser1" + 0.022*"DuplicateToUsrConfirmation" + 0.018*"DuplicateToExistingUser"'),
 (2,
  '0.182*"CloseTab" + 0.148*"DisplayOneAppli" + 0.124*"UnLockUser" + 0.075*"ManageAppRole" + 0.055*"FilterRolesForApp" + 0.030*"FilterPermissionforApp" + 0.030*"Cancel" + 0.021*"CreateRole" + 0.020*"CreateRole2" + 0.019*"SaveAddRemovePermissions"'),
 (3,
  '0.197*"TpfRefreshTable" + 0.114*"FilterHistory" + 0.061*"TpfMassImport" + 0.061*"CreateAclForRole" + 0.061*"DisplayOneAppli" + 0.055*"CreateAclForRole2" + 0.046*"AddDataToACL"

#### Assign the strongest topic to each document

In [5]:
def assign_topic(data, docs, lda):
    'Return a lookup { id: topic }.'
    corpus = [lda.id2word.doc2bow(doc) for doc in docs]
    topic_ids = nlp.get_strongest_document_topics(lda, corpus)
    return { d['id']: topic_ids[i] for i, d in enumerate(data) }

In [6]:
output_data = assign_topic(data, docs, lda)
output_data

{'2LK1J5ID313Q1BWKN##NAGHU91': 0,
 'Y3M5R#QBI$1G0#DZEARJDT4U91': 9,
 'SRNGMVO$TI3F1X9JIKKMFPKU91': 8,
 'LQBSP0BOWN1506W1HTEMVMEU91': 4,
 '9F0E1SXEC6FU0S13CP8$MRJU91': 0,
 'W6BJQ25L91PK1S76ZFRG$SNU91': 4,
 'HW1AW$563S8F0XC9F63XMQ#T91': 0,
 'U8X22ATF2FT108XJ$X2GZV1U91': 0,
 'LN3SVSBR4V661WY8Y41C3W1U91': 2,
 'ZGTIAU$826HA0V6EZJ59KGHU91': 0,
 '7T06G8D697QI0NX8RG7ML8T7A0': 3,
 'Y3MEU4X4UOKC1TKF#6P2WYLU91': 6,
 'JG1F7MJ7WU6W1DCT2J1DDVFU91': 0,
 'IT5XQ2H0RUFD1Q$3UK0GBQ#T91': 0,
 '3#DTSGCRFD040T#K6JVYXMEU91': 0,
 '$$QIF#VDXOTV1DCT2J1DDVFU91': 8,
 'I8ZL0B$9MY9Q0BWKN##NAGHU91': 0,
 'M0QY#PA81$WT0EQT5KFUBW1U91': 8,
 '8P45VWBSIP8G1ZE3B2$E#KLU91': 0,
 '$SHWEKMBB9QC0NX8RG7ML8T7A0': 8,
 'A6S$8S6FSJ560NX8RG7ML8T7A0': 6,
 'IHQRKV6HOWB314V2QRMJPIEY90': 9,
 'QHAFCS76PHFO1NX8RG7ML8T7A0': 8,
 '4A06$GD6ZS0J0NX8RG7ML8T7A0': 2,
 '7DXF2UBZONPU04V2QRMJPIEY90': 2,
 '3336WYM5118C1BWKN##NAGHU91': 0,
 'JYQN35MQ5HHF1NX8RG7ML8T7A0': 2,
 '9W3IBTH1ZXWD04V2QRMJPIEY90': 2,
 'EYYIW2IG6MT$04V2QRMJPIEY90': 2,
 'I0C7XW40SL0T

#### Get information of topics

In [7]:
def get_topics2(lda):
    'Return topics with terms and probabilities.'
    def get_one(id):
        return [(lda.id2word[t]) for t, p in lda.get_topic_terms(id, topn=5)]

    return [get_one(i) for i in range(lda.num_topics)]

In [37]:
def get_topics(lda):
    'Return topics with terms and probabilities.'
    def get_one(id):
        return [(lda.id2word[t], float(p)) for t, p in lda.get_topic_terms(id, topn=5)]

    return [get_one(i) for i in range(lda.num_topics)]

In [8]:
all_topics = get_topics2(lda)
all_topics

[['DisplayOneUser',
  'CloseTab',
  'CreateLoginArea',
  'AddNewLoginArea',
  'UpdateUserDetails'],
 ['ChooseOrganization',
  'FilterByApp',
  'FilterByAppACL',
  'DisplayOneUser',
  'AuthenticationHistorySearch'],
 ['CloseTab',
  'DisplayOneAppli',
  'UnLockUser',
  'ManageAppRole',
  'FilterRolesForApp'],
 ['TpfRefreshTable',
  'FilterHistory',
  'TpfMassImport',
  'CreateAclForRole',
  'DisplayOneAppli'],
 ['CreateUser',
  'CloseTab',
  'DeleteUser',
  'ResetPwdUnlock'],
 ['Next', 'Prev', 'DisplayOneUser', 'Cancel', 'OneUserRoleDetails'],
 ['DisplayOrgaDetails',
  'AdminOguStep1',
  'RefreshReports',
  'AddOfficeDod',
  'AddOfficeDodFinish'],
 ['AssignAcl2',
  'AssignAcl1',
  'AssignACLStep3',
  'AssignRoleAclConfirmation1',
  'SearchRoleForAssignRoleAcl'],
 ['SearchOffice',
  'CloseTab',
  'DisplayOneOffice',
  'DisplayOneUser',
  'DisplayOneOGU'],
 ['AssignRole1',
  'AssignRole2',
  'AssignRoleAclConfirmation1',
  'CloseTab',
  'CreateUser']]

#### Export data

In [39]:
with open(output_file, 'w') as f:
    json.dump({ 'topics': all_topics, 'sessions': output_data }, f)