# Arranged Words-Matrix

In [153]:
s = """西汉桑弘羊主持的盐铁官营，在各郡设盐铁官署，严禁私人生产，“敢私铸铁器，煮盐者，钛左趾，没人其器物”。这反映了汉武帝时期政府增加财政收人的迫切需要。构成这段文字的是（）A历史观点和历史解释B历史观点和历史结论C历史结论和历史解释D历史叙述和历史解释"""

In [140]:
import jieba
import numpy as np

def count_words(s):
    stop_words = ['$', '?', '_', '“', '”', '、', '。', '《', '》','，','（', '）', '的', '了', '是']
    tokenstr = []
    result = {}
    
    word2pos = {}
    pos2word = {}
    
    words = jieba.cut(s)
    
    i = 0 
    for word in words:
        if word in stop_words: continue
        tokenstr.append(word)
        result[word] = result.get(word, 0) + 1
        pos2word[i] = word
        
        indexs = word2pos.get(word, [])
        indexs.append(i)
        word2pos[word] = indexs
        
        i+=1
    
    result = dict(sorted(result.items(), key=lambda x: (x[1],x[0]), reverse=True))
    wordslist = list(result.keys())
    assert len(set(tokenstr)) == len(wordslist)
    return (wordslist, tokenstr, word2pos, pos2word)

In [33]:
def fill_table(TD_list, related_tables,target_width, qqueue):
    TD_list[0] = qqueue[0] # TD_list 长度为target_width 第一个位置对应此单词在wlist中的索引。0,1,2...
    count = 1

    while qqueue != [] and count < target_width:
        use_index = qqueue[0] # 单词索引
        del qqueue[0]
        use_list = related_tables[use_index]  #取出use_index单词对应的相关单词。
        len1 = len(use_list)   # 查看 i对应 的相关单词的个数。
        len2 = target_width - count 
        if len1 >= len2:   # 大体意思应该是查看单词i对应的相关单词个数如果满足 target_width就直接从相关单词按顺序取出来填充到TD_list中。
            TD_list[count:] = use_list[:len2]  
            assert len(TD_list) == target_width
            count = target_width
            break
        else:              # 如果不满足就有多少填多少。剩下的用 -1填充。
            TD_list[count:count + len1] = use_list
            assert len(TD_list) == target_width
            count += len1
            for next_id in use_list:
                qqueue.append(next_id)
    for i in range(count, target_width):
        TD_list[i] = -1

In [135]:
def reorder(table, word2pos, pos2word, wlist, word2id):
    sort_table = []
    topn, neighbor = np.array(table).shape
    for i in range(topn):
        tmp = []
        tmp += word2pos[wlist[table[i][0]]] # record each center word index
        length = len(tmp)                   # occurred times of center words
        t = []                              # t is use to related words index
        for j in range(1, neighbor):
            t += word2pos[wlist[table[i][j]]]
        index = np.random.randint(len(t), size = 20-length)
        t = np.array(t)
        t = list(t[index])
        tmp = tmp + t                       # conccat the index of center word and index of its related words
        tmp.sort()
        for j in range(len(tmp)):
            tmp[j] = word2id[pos2word[tmp[j]]] # convert index to word_id
            # tmp[j] = pos2word[tmp[j]]       # convert index to word
        sort_table.append(tmp)
    
    return np.array(sort_table)

In [151]:
def text2matrix(s, sliding_window=3, target_width=5):
    """
    
    """
    (wlist, tokenwords, word2pos, pos2word) = count_words(s)
    word2id = {k:v for k,v in zip(wlist, range(len(wlist)))}
    wordslist_length = len(wlist)
    
    AM_table = [[0 for i in range(wordslist_length)] for j in range(wordslist_length)]
    
    # generate occurred matrix with sliding_window
    for num in range(len(tokenwords)-sliding_window+1):
        for i in range(sliding_window-1):
            for j in range(i+1, sliding_window):
                AM_table[wlist.index(tokenwords[num + i])][wlist.index(tokenwords[num + j])] += 1
                AM_table[wlist.index(tokenwords[num + j])][wlist.index(tokenwords[num + i])] += 1
                
    related_tables = {}
    for i in range(wordslist_length):
        related_tables[i] = [[index, num] for index, num in enumerate(AM_table[i]) if num > 0 and index != i]
        related_tables[i].sort(key=lambda x: x[1], reverse=True)
        related_tables[i] = [element[0] for element in related_tables[i]]
    
    TD_table = [[-1 for i in range(target_width)] for j in range(wordslist_length)]
    for i in range(wordslist_length):
        fill_table(TD_table[i], related_tables, target_width, [i]) # fill TD table with -1
    
    # TD_table = reorder(TD_table, word2pos, pos2word, wlist, word2id)
    
    # convert id to words: arrange word matrix
    awm = []
    for row in TD_table:
        awm.append([pos2word[i] for i in row])
    return wlist, awm # ,TD_table

In [159]:
a,b = text2matrix(s)
for i,j in zip(a,b):
    print('center word:',i,'\tarrange words matrix:',j)

center word: 历史 	arrange words matrix: ['西汉', '桑弘羊', '主持', '铁官营', '在']
center word: 和 	arrange words matrix: ['桑弘羊', '西汉', '铁官营', '主持', '在']
center word: 解释 	arrange words matrix: ['主持', '西汉', '桑弘羊', '解释', '历史']
center word: 这 	arrange words matrix: ['盐', '其', '这', '这', '历史']
center word: 观点 	arrange words matrix: ['铁官营', '西汉', '桑弘羊', '历史', '观点']
center word: 结论 	arrange words matrix: ['在', '西汉', '桑弘羊', 'B', '桑弘羊']
center word: 铸铁 	arrange words matrix: ['各', '铸铁', '段', '左', '时期']
center word: 铁官营 	arrange words matrix: ['郡', '盐者', '构成', '文字', '和']
center word: 钛 	arrange words matrix: ['设', '煮', '财政', '严禁', '左']
center word: 郡 	arrange words matrix: ['盐铁', '生产', '文字', '器', '构成']
center word: 迫切需要 	arrange words matrix: ['官署', '这', '增加', '盐', '私人']
center word: 趾 	arrange words matrix: ['严禁', '趾', '财政', '设', '观点']
center word: 财政 	arrange words matrix: ['私人', '增加', '迫切需要', '官署', '政府']
center word: 设 	arrange words matrix: ['生产', '盐铁', '器', '收人', '文字']
center word: 西汉 	arrange words mat