In [1]:
import pickle

import numpy as np
import pandas as pd
# from bertopic import BERTopic

### Create a dummy `topic_representations_` dictionary

In [2]:
# Dummy topic ids -1 .. 6 (the -1 entry is presumably the outlier topic, as in
# BERTopic -- confirm).
topic_list = [*range(-1, 7)]
# Representations per topic, and number of actions.
nreps, nactions = 9, 8
# Largest topic id + 1, i.e. the number of topics with ids 0 .. 6.
ntopics = topic_list[-1] + 1

In [3]:
def ctfidf(t, s, ntopics, nreps):
    """Return a placeholder c-TF-IDF score for a dummy representation.

    The score is currently just the representation index ``s``.

    Parameters
    ----------
    t : topic id (currently unused)
    s : index of the representation within its topic; returned unchanged
    ntopics : number of topics (currently unused)
    nreps : number of representations per topic (currently unused)

    Returns
    -------
    The value of ``s``.
    """
    # A weighted variant was sketched but is disabled:
    #   alpha * (ntopics - t) + gamma * (nreps / (s + 1)) + c
    #   with alpha = 0.5, gamma = 0.3, c = 0.1
    return s

In [4]:
# Map every topic id to a list of (representation string, score) pairs,
# one pair per representation index.
representation_dict = {
    topic: [
        (f'topic_{topic}_string_{rep_idx}', ctfidf(topic, rep_idx, ntopics, nreps))
        for rep_idx in range(nreps)
    ]
    for topic in topic_list
}

In [5]:
# Write the dummy representation dictionary to disk, then display it.
with open('representation_dictionary.pickle', 'wb') as handle:
    pickle.dump(representation_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
representation_dict

{-1: [('topic_-1_string_0', 0),
  ('topic_-1_string_1', 1),
  ('topic_-1_string_2', 2),
  ('topic_-1_string_3', 3),
  ('topic_-1_string_4', 4),
  ('topic_-1_string_5', 5),
  ('topic_-1_string_6', 6),
  ('topic_-1_string_7', 7),
  ('topic_-1_string_8', 8)],
 0: [('topic_0_string_0', 0),
  ('topic_0_string_1', 1),
  ('topic_0_string_2', 2),
  ('topic_0_string_3', 3),
  ('topic_0_string_4', 4),
  ('topic_0_string_5', 5),
  ('topic_0_string_6', 6),
  ('topic_0_string_7', 7),
  ('topic_0_string_8', 8)],
 1: [('topic_1_string_0', 0),
  ('topic_1_string_1', 1),
  ('topic_1_string_2', 2),
  ('topic_1_string_3', 3),
  ('topic_1_string_4', 4),
  ('topic_1_string_5', 5),
  ('topic_1_string_6', 6),
  ('topic_1_string_7', 7),
  ('topic_1_string_8', 8)],
 2: [('topic_2_string_0', 0),
  ('topic_2_string_1', 1),
  ('topic_2_string_2', 2),
  ('topic_2_string_3', 3),
  ('topic_2_string_4', 4),
  ('topic_2_string_5', 5),
  ('topic_2_string_6', 6),
  ('topic_2_string_7', 7),
  ('topic_2_string_8', 8)],
 3

In [6]:
# Load the action -> representation mapping. Later cells read the columns
# 'ActionID', 'TopicNumber' and 'RepresentationList' from this frame.
action_representation_map_df = pd.read_csv('action_representation_map.csv')

In [7]:
# Gather the c-TF-IDF score of every representation of every topic into a
# 2d array of shape (number of topics, number of representations per topic).
# Only topic ids 0 .. ntopics-1 are read, so the -1 entry of
# representation_dict is not part of this table.
ctfidf_topic_rep = np.zeros((ntopics, nreps))
for topic_no, score_row in enumerate(ctfidf_topic_rep):
    # score_row is a view into ctfidf_topic_rep, so this fills the row in place.
    score_row[:] = [representation_dict[topic_no][rep_idx][1] for rep_idx in range(nreps)]

In [8]:
# Stack one copy of the (ntopics, nreps) score table per action, giving an
# array of shape (nactions, ntopics, nreps).
action_topic_wise_ctfidf3d = np.tile(ctfidf_topic_rep, (nactions, 1, 1))

In [9]:
# Expected shape: (nactions, ntopics, nreps).
action_topic_wise_ctfidf3d.shape

(8, 7, 9)

In [10]:
# Build a 0/1 mask of shape (nactions, ntopics, nreps): an entry is 1 when the
# mapping table selects that representation of that topic for that action.
selection_matrix = np.zeros((nactions, ntopics, nreps))
mapping_columns = zip(
    action_representation_map_df['ActionID'],
    action_representation_map_df['TopicNumber'],
    action_representation_map_df['RepresentationList'],
)
for action_id, topic_number, representation_list in mapping_columns:
    # NOTE(review): eval executes whatever expression the CSV cell contains.
    # If the column only ever holds literal lists, ast.literal_eval would be
    # the safer parser -- confirm the CSV format before switching.
    unmask_indices = eval(representation_list)
    selection_matrix[action_id, topic_number, unmask_indices] = 1

In [11]:
# Element-wise mask: keep the score where the selection matrix is 1, zero it
# everywhere else.
res = np.multiply(selection_matrix, action_topic_wise_ctfidf3d)

In [12]:
# Masked scores for action index 4 (rows: topics, columns: representations).
res[4].round(2)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 5., 6., 7., 8.],
       [0., 1., 2., 3., 4., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [13]:
# np.round(action_topic_wise_ctfidf3d[1], 2)

In [14]:
def softmax2d(anarray):
    '''
    Column-wise softmax of a 2d array.

    argument: a 2d array (or array-like) of numbers
    returns: numpy array of the same shape in which every column is a
             softmax probability distribution (each column sums to 1).

    The column maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (which would turn the ratio into nan) when the inputs are large.
    '''
    scores = np.asarray(anarray)
    if scores.size == 0:
        # Nothing to normalise; a max-reduction over an empty axis would raise.
        return np.exp(scores)
    # Shift each column so that its largest entry is 0 before exponentiating.
    shifted = scores - np.max(scores, axis=0, keepdims=True)
    exp_array = np.exp(shifted)
    # keepdims makes the column sums broadcast against the rows explicitly.
    return exp_array / np.sum(exp_array, axis=0, keepdims=True)

In [15]:
# Collapse the representation axis: one summed score per (action, topic) pair.
topic_action_ctfidf_sum = res.sum(axis=2)
# Column-wise softmax, i.e. for each topic a distribution over the actions.
# NOTE(review): action/topic pairs with no selected representation sum to 0
# and still receive weight exp(0) in the softmax -- confirm this is intended.
topic_action_probability = softmax2d(topic_action_ctfidf_sum)

In [16]:
# Summed scores (rows: actions, columns: topics), rounded for display.
topic_action_ctfidf_sum.round(2)

array([[10.,  0.,  0.,  0.,  0.,  0.,  0.],
       [26., 10.,  0.,  0.,  0.,  0.,  0.],
       [ 0., 26., 10.,  0.,  0.,  0.,  0.],
       [ 0.,  0., 26., 10.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 26., 10.,  0.,  0.],
       [ 0.,  0.,  0.,  0., 26., 10.,  0.],
       [ 0.,  0.,  0.,  0.,  0., 26., 10.],
       [ 0.,  0.,  0.,  0.,  0.,  0., 26.]])

In [17]:
# Softmax probabilities (rows: actions, columns: topics), rounded for display.
topic_action_probability.round(3)

array([[0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.]])