In [1]:
from typing import List
from sudachipy import tokenizer
from sudachipy import dictionary
from gensim.models import KeyedVectors

In [2]:
class Tokenizer(object):
    """
    Word segmentation with Sudachi.

    Calling an instance on a text returns the normalized forms of its
    morphemes joined by single spaces, optionally restricted to a set of
    parts of speech.
    """

    def __init__(self, hinshi_list: List[str] = None, split_mode: str = "C"):
        """
        :param hinshi_list: parts of speech to keep; a morpheme is kept when the
            first element of its ``part_of_speech()`` is in this list.
            example) hinshi_list=["動詞", "名詞", "形容詞"].
            ``None`` or an empty list disables the filter.
        :param split_mode: Sudachi split mode, one of "A", "B" or "C".
        :raises AssertionError: if ``split_mode`` is not one of the valid modes.
        """
        split_mode_list = ["A", "B", "C"]
        # Validate against the list of modes (the previous check compared the
        # string with itself and therefore could never fail).
        assert split_mode in split_mode_list, f"{split_mode} is a non-existent split_mode {split_mode_list}"

        # Import locally under a private alias: the notebook later rebinds the
        # global name `tokenizer` to a Tokenizer instance, which would make a
        # global lookup of `tokenizer.Tokenizer.SplitMode` fail on re-use.
        from sudachipy import tokenizer as sudachi_tokenizer

        split_dic = {
            "A": sudachi_tokenizer.Tokenizer.SplitMode.A,
            "B": sudachi_tokenizer.Tokenizer.SplitMode.B,
            "C": sudachi_tokenizer.Tokenizer.SplitMode.C,
        }
        self.tokenizer_obj = dictionary.Dictionary().create()
        self.mode = split_dic[split_mode]
        self.hinshi_list = hinshi_list

    def __call__(self, text: str) -> str:
        """
        Tokenize ``text`` and return the kept normalized forms joined by spaces.

        :param text: text to tokenize.
        :return: space-separated normalized forms; morphemes whose normalized
            form is a single space are dropped.
        """
        morphemes = self.tokenizer_obj.tokenize(text, self.mode)
        if self.hinshi_list:
            # Keep only morphemes whose top-level part of speech was requested.
            morphemes = [m for m in morphemes if m.part_of_speech()[0] in self.hinshi_list]
        normalized_forms = (m.normalized_form() for m in morphemes)
        return " ".join(form for form in normalized_forms if form != " ")

In [3]:
# Word vectors in word2vec text format, loaded through gensim.
word_vectors_path = "/home/chen/cc.ja.300.vec.gz"
model = KeyedVectors.load_word2vec_format(word_vectors_path)

# Keep only verbs, nouns and adjectives, using Sudachi split mode "A".
# NOTE: this rebinds the global name `tokenizer`, which the import cell bound
# to the `sudachipy.tokenizer` module; from here on it is the Tokenizer instance.
content_word_pos = ["動詞", "名詞", "形容詞"]
tokenizer = Tokenizer(hinshi_list=content_word_pos, split_mode="A")

In [4]:
import pandas as pd
import spacy
import numpy as np
from tqdm import tqdm
# Load the spaCy pipeline named 'ja_ginza'; `nlp` is used later to split each
# sentence into tokens and to read each token's `pos_` tag.
nlp = spacy.load('ja_ginza')

In [5]:
# Sentence pairs: despite the .csv extension the file is read as tab-separated,
# without a header row, with one sentence per column.
dataset_path = '/home/chen/relevant_content_words/dataset/QAbot-relevant.csv'
pair_columns = ["sent1", "sent2"]
df = pd.read_csv(
    dataset_path,
    sep='\t',
    header=None,
    names=pair_columns,
)

In [6]:
def cos_sim(v1, v2) -> float:
    """
    Cosine similarity between two vectors.

    :param v1: first vector (array-like accepted by ``np.dot`` / ``np.linalg.norm``).
    :param v2: second vector, same length as ``v1``.
    :return: ``dot(v1, v2) / (|v1| * |v2|)``; ``0.0`` when either vector has
        zero norm, instead of the NaN a 0/0 division would produce.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # The direction of a zero vector is undefined; report "no similarity"
        # so callers that compare scores with min()/max() never see NaN.
        return 0.0
    return np.dot(v1, v2) / denominator

In [7]:
def get_embeddings(inputs):
    """
    Build a sentence vector by averaging the word vectors of ``inputs``.

    ``inputs`` is converted with ``str()``, tokenized with the global
    ``tokenizer`` and each resulting word is looked up in the global ``model``.
    Words missing from the model are replaced by a small random vector.

    :param inputs: text to embed, or ``None``.
    :return: 1-D array of length ``model.vector_size``. When ``inputs`` is
        ``None`` or yields no words, a single random vector is returned.

    NOTE(review): the random vectors come from ``np.random.uniform`` and no
    seed is set in this function, so results for out-of-vocabulary words
    differ between calls unless a seed is set elsewhere.
    """
    def random_vector():
        # Small-magnitude stand-in for a word (or sentence) with no known vector.
        return np.random.uniform(-0.01, 0.01, model.vector_size)

    # Identity check: `!= None` would be evaluated elementwise on array inputs.
    if inputs is None:
        return random_vector()

    sentvec = [
        model.get_vector(word) if word in model.key_to_index else random_vector()
        for word in tokenizer(str(inputs)).split()
    ]
    if not sentvec:
        # Nothing survived tokenization; fall back to one random vector.
        sentvec.append(random_vector())

    # Average over the word axis to get one vector per sentence.
    return np.mean(sentvec, 0)

In [8]:
def _ablation_relevance(token_text, sentence1, sentence2, sent1_embedding, sent2_embedding, base_cossim):
    """
    Score a token by how much deleting it lowers the sentence-pair similarity.

    Every occurrence of ``token_text`` is removed from each sentence in turn
    (plain ``str.replace``); the ablated sentence is compared with the intact
    other sentence, and the lower of the two similarities is subtracted from
    ``base_cossim``.

    :return: ``base_cossim - min(sim(sent1 without token, sent2),
        sim(sent1, sent2 without token))``.
    """
    re_sent1_embedding = get_embeddings(sentence1.replace(token_text, ''))
    re_sent2_embedding = get_embeddings(sentence2.replace(token_text, ''))
    re_cossim_1 = cos_sim(re_sent1_embedding.squeeze(), sent2_embedding.squeeze())
    re_cossim_2 = cos_sim(sent1_embedding.squeeze(), re_sent2_embedding.squeeze())
    return base_cossim - min(re_cossim_1, re_cossim_2)


# For every sentence pair, collect the POS tag(s) of the token(s) whose removal
# lowers the pair similarity the most.
pos_total_list = []
for sentence1, sentence2 in tqdm(zip(df['sent1'], df['sent2']), total=len(df)):
    sent1_embedding = get_embeddings(sentence1)
    sent2_embedding = get_embeddings(sentence2)
    cossim_sent1andsent2 = cos_sim(np.squeeze(sent1_embedding), np.squeeze(sent2_embedding))
    doc_sent1 = nlp(sentence1)
    doc_sent2 = nlp(sentence2)

    # Surface forms present in both sentences are scored once, via sentence 1.
    shared_texts = {token.text for token in doc_sent1} & {token.text for token in doc_sent2}
    candidate_tokens = list(doc_sent1) + [token for token in doc_sent2 if token.text not in shared_texts]

    pos_list = []
    cos_relevant_score = []
    for token in candidate_tokens:
        pos_list.append(token.pos_)
        cos_relevant_score.append(
            _ablation_relevance(
                token.text, sentence1, sentence2, sent1_embedding, sent2_embedding, cossim_sent1andsent2
            )
        )

    if cos_relevant_score:
        # Compute the maximum once; ties all contribute their POS tag.
        best_score = max(cos_relevant_score)
        pos_total_list.extend(
            pos for pos, score in zip(pos_list, cos_relevant_score) if score == best_score
        )

100%|███████████████████████████████████████████| 53/53 [00:03<00:00, 13.44it/s]


In [9]:
# Total number of collected POS tags; a sentence pair contributes more than
# one entry when several tokens tie for the highest relevance score.
len(pos_total_list)

64

In [10]:
def unique(lists):
    """
    Print each distinct element of ``lists`` once, one per line.

    Elements are printed in order of first appearance, so the output is the
    same on every run (iterating a ``set`` of strings is not stable across
    interpreter restarts).

    :param lists: iterable of hashable items.
    :return: None; the values are only printed.
    """
    # dict keys are unique and keep insertion order.
    for x in dict.fromkeys(lists):
        print(x)

In [11]:
# Print the distinct POS tags that occur in pos_total_list.
unique(pos_total_list)

ADJ
SCONJ
ADP
PART
AUX
NUM
NOUN
VERB


In [12]:
# Count how often each POS tag occurs in pos_total_list.
# Any tag that is not listed here is not counted.
SYM_num = pos_total_list.count("SYM")
X_num = pos_total_list.count("X")
AUX_num = pos_total_list.count("AUX")
ADP_num = pos_total_list.count("ADP")
ADV_num = pos_total_list.count("ADV")
ADJ_num = pos_total_list.count("ADJ")
NUM_num = pos_total_list.count("NUM")
SCONJ_num = pos_total_list.count("SCONJ")
PUNCT_num = pos_total_list.count("PUNCT")
PROPN_num = pos_total_list.count("PROPN")
NOUN_num = pos_total_list.count("NOUN")
VERB_num = pos_total_list.count("VERB")
PART_num = pos_total_list.count("PART")
CCONJ_num = pos_total_list.count("CCONJ")
DET_num = pos_total_list.count("DET")

In [13]:
# Holds the formatted "<TAG>_per:<percentage>" strings built in the following cells.
results = []

In [14]:
# Share of NOUN among all collected POS tags, as a percentage with two decimals.
noun_percentage = NOUN_num / len(pos_total_list) * 100
print(f"NOUN_per:{noun_percentage:.2f}")

NOUN_per:25.00


In [15]:
# Build the whole report in one step instead of one copy-pasted cell per tag.
# `results` is rebound rather than appended to, so re-running this cell does
# not duplicate entries. Order and formatting match the per-tag cells.
# NOTE(review): DET is counted earlier but is not part of this report —
# confirm whether that omission is intentional.
reported_counts = [
    ("SYM", SYM_num),
    ("X", X_num),
    ("AUX", AUX_num),
    ("ADP", ADP_num),
    ("ADV", ADV_num),
    ("ADJ", ADJ_num),
    ("NUM", NUM_num),
    ("SCONJ", SCONJ_num),
    ("PUNCT", PUNCT_num),
    ("PROPN", PROPN_num),
    ("NOUN", NOUN_num),
    ("VERB", VERB_num),
    ("PART", PART_num),
    ("CCONJ", CCONJ_num),
]
results = [
    tag + '_per:' + "%.2f" % (count / len(pos_total_list) * 100)
    for tag, count in reported_counts
]

In [29]:
# Display the formatted per-tag percentages.
results

['SYM_per:0.00',
 'X_per:0.00',
 'AUX_per:21.88',
 'ADP_per:1.56',
 'ADV_per:0.00',
 'ADJ_per:3.12',
 'NUM_per:15.62',
 'SCONJ_per:10.94',
 'PUNCT_per:0.00',
 'PROPN_per:0.00',
 'NOUN_per:25.00',
 'VERB_per:20.31',
 'PART_per:1.56',
 'CCONJ_per:0.00']

In [30]:
# Write one "<TAG>_per:<percentage>" entry per line.
file_out = 'Fasttext_QAbot_relevantwords.txt'
with open(file_out, 'w', encoding = 'utf-8') as f:
    for line in results:
        f.write(line.strip() + '\n')
# No explicit f.close(): the `with` block has already closed the file.