In [None]:
import os
import numpy as np
import pandas as pd
import pickle
from gensim.models import Word2Vec, word2vec, doc2vec

def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    Note: unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)
    
def save_pickle(data, path):
    """Pickle ``data`` into the file at ``path``, creating or overwriting it."""
    with open(path, "wb") as handle:
        pickle.dump(data, handle)

# Report the library versions in use so results can be tied to an environment.
print('pandas: {}'.format(pd.__version__), 'numpy: {}'.format(np.__version__), sep='\n')

# Load dataset and model

In [None]:
# Cleaned dataset; later cells read its label column and narrative text
# columns by name (presumably a pandas DataFrame — TODO confirm).
df = load_pickle('./Data_Cleaned/171275_Cleaned.pickle')
# Pre-trained Word2Vec / Doc2Vec models loaded from disk (the file names
# suggest 300-dimensional vectors — TODO confirm).
model_w2v = Word2Vec.load("./Extra_Feature/w2v_300d.model")
model_d2v = doc2vec.Doc2Vec.load("./Extra_Feature/d2v_300d.model")

## Log Likelihood Ratio (LLR)

In [None]:
import tmunlp as nlp

def save_txt(data_path, label, text):
    """Write one ``label<TAB>text`` line per sample to ``data_path`` (UTF-8).

    Parameters
    ----------
    data_path : str
        Destination file; it is created or overwritten.
    label, text : sequence
        Parallel sequences (lists, arrays, pandas Series, ...). Every value is
        converted with ``str`` before it is written. Only the first
        ``len(label)`` texts are used.

    Raises
    ------
    ValueError
        If ``text`` holds fewer items than ``label``.
    """
    if len(text) < len(label):
        raise ValueError(
            'text has {} items but label has {}'.format(len(text), len(label))
        )
    with open(data_path, 'w', encoding='utf-8') as f:
        # Pair the values by position. ``label[idx]`` is a label-based lookup
        # on a pandas Series, so it fails (or returns the wrong row) whenever
        # the index is not 0..n-1, e.g. after rows were dropped in cleaning.
        for one_label, one_text in zip(label, text):
            f.write(str(one_label) + '\t' + str(one_text) + '\n')

    
def get_LLR_keyword(data_path, label_list, number_of_kw):
    """Collect the keywords of every label in ``label_list``.

    The label/term weighting is computed once from ``data_path`` through
    ``nlp.get_label_term_weighting`` and then queried per label. The result
    maps each label to whatever ``nlp.get_keyword`` returns for it.
    """
    weighting = nlp.get_label_term_weighting(data_path, label_list)
    return {
        label: nlp.get_keyword(label, weighting, number_of_kw, True)
        for label in label_list
    }

## Clinical Narratives Text Representation (CNTR) 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def distinguish(text, kw_dict):
    """Count how many keys of ``kw_dict`` appear among the whitespace-separated tokens of ``text``."""
    tokens = set(text.split())
    return sum(1 for kw in kw_dict.keys() if kw in tokens)


def match(text, kw_dict):
    """Turn ``text`` into a keyword-weight vector.

    Returns ``(feature, match_words)``: ``feature`` has one entry per key of
    ``kw_dict`` (in dict order) holding the keyword's weight when the keyword
    is one of the whitespace-separated tokens of ``text`` and ``0`` otherwise;
    ``match_words`` lists the keywords that were found, in the same order.
    """
    tokens = set(text.split())
    feature = [weight if kw in tokens else 0 for kw, weight in kw_dict.items()]
    match_words = [kw for kw in kw_dict.keys() if kw in tokens]
    return feature, match_words


def get_similarity(similarity_array):
    """Return the last value of a 2-D similarity array as a plain Python number.

    For the 1x1 array produced by comparing two single vectors this is simply
    the similarity score.

    Parameters
    ----------
    similarity_array : array-like with ``tolist()``
        Two-dimensional array of similarities (e.g. a numpy array).

    Raises
    ------
    ValueError
        If the array holds no values. The previous loop-based version failed
        with an opaque ``UnboundLocalError`` in that case.
    """
    values = [num for one_list in similarity_array.tolist() for num in one_list]
    if not values:
        raise ValueError('similarity_array contains no values')
    return values[-1]


def similarity_sorted(similarity_dict, num_of_similar):
    """Keep the ``num_of_similar`` entries with the highest similarity.

    Parameters
    ----------
    similarity_dict : dict
        Maps a word to its similarity score.
    num_of_similar : int
        Maximum number of entries to keep. When the dict has fewer entries,
        all of them are returned.

    Returns
    -------
    dict
        Word -> similarity, ordered from most to least similar. Ties keep
        their original dict order because the sort is stable.
    """
    rank = sorted(similarity_dict.items(), key=lambda item: item[1], reverse=True)
    # Slicing tolerates a request for more entries than exist, where indexing
    # rank[i] raised IndexError. Clamp at 0 so a negative count still yields
    # an empty result, as the old range() loop did.
    return dict(rank[:max(num_of_similar, 0)])


def no_match(text, kw_dict, model_d2v, model_w2v, num_of_similar):
    """Build keyword features for a text that contains none of the keywords.

    The text is embedded with ``model_d2v`` and compared, by cosine
    similarity, with the ``model_w2v`` vector of every keyword. The
    ``num_of_similar`` most similar keywords receive ``weight * similarity``;
    all other keywords receive ``0``.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    kw_dict : dict
        Maps keyword -> weight.
    model_d2v : object with ``infer_vector(list_of_tokens)``
    model_w2v : object whose ``wv[keyword]`` returns the keyword vector
    num_of_similar : int
        Number of most similar keywords to keep.

    Returns
    -------
    (list, list)
        The feature list (one entry per key of ``kw_dict``, in dict order)
        and the list of keywords that were selected.
    """
    # infer_vector expects word tokens. ``list(text)`` would break the string
    # into single characters, so split on whitespace exactly as
    # ``distinguish`` and ``match`` do.
    text_vec = model_d2v.infer_vector(text.split())

    similarity_dict = {}
    for kw in kw_dict.keys():
        try:
            kw_vec = model_w2v.wv[kw]
        except KeyError:
            # Keyword is not in the Word2Vec vocabulary: it cannot be ranked,
            # so it simply keeps a weight of 0 below.
            continue
        similarity_array = cosine_similarity([text_vec], [kw_vec])
        similarity_dict[kw] = get_similarity(similarity_array)

    # Never ask for more neighbours than there are ranked keywords.
    top_n = min(num_of_similar, len(similarity_dict))
    top_similarity_dict = similarity_sorted(similarity_dict, top_n)

    similar_weight_list = []
    similar_kw_list = []
    for kw, weight in kw_dict.items():
        if kw in top_similarity_dict:
            similar_weight_list.append(weight * top_similarity_dict[kw])
            similar_kw_list.append(kw)
        else:
            similar_weight_list.append(0)
    return similar_weight_list, similar_kw_list


def Transformation(texts, label, kw_dict, model_d2v, model_w2v, num_of_similar):
    """Convert every text into a keyword feature vector for one label.

    A text containing at least one keyword is encoded with ``match``; a text
    containing none falls back to the embedding-similarity encoding of
    ``no_match``.

    Parameters
    ----------
    texts : iterable of str
        Texts to encode (list, pandas Series, ...).
    label : hashable
        Label the keyword set belongs to; used as the key of both results.
    kw_dict : dict
        Maps keyword -> weight.
    model_d2v, model_w2v, num_of_similar
        Passed through to ``no_match``.

    Returns
    -------
    (dict, dict)
        ``{label: [feature list per text]}`` and
        ``{label: [selected keywords per text]}``.
    """
    feature_output = []
    words_output = []
    # Iterate over the values themselves. ``texts[index]`` is a label-based
    # lookup on a pandas Series and breaks when the index is not 0..n-1.
    for text in texts:
        if distinguish(text, kw_dict) > 0:
            feature, words = match(text, kw_dict)
        else:
            feature, words = no_match(text, kw_dict, model_d2v, model_w2v, num_of_similar)
        feature_output.append(feature)
        words_output.append(words)
    return {label: feature_output}, {label: words_output}


def CNTR(df, column_label, column_LLR, path_LLR, label_list, number_of_kw, model_d2v, model_w2v, num_of_similar):
    """Build the CNTR keyword features for every text column in ``column_LLR``.

    For each column the label/text pairs are written to a file, keywords are
    extracted per label with ``get_LLR_keyword``, and every text is encoded
    against each label's keywords with ``Transformation``.

    Parameters
    ----------
    df : mapping of column name -> sequence (e.g. a pandas DataFrame)
    column_label : str
        Name of the label column.
    column_LLR : sequence of str
        Names of the text columns to process.
    path_LLR : dict
        Column name -> path of the label/text file written for that column.
    label_list : sequence
        Labels for which keywords are extracted.
    number_of_kw : dict
        Column name -> number of keywords to request.
    model_d2v, model_w2v, num_of_similar
        Passed through to ``Transformation``.

    Returns
    -------
    (dict, dict)
        ``{column: {label: features}}`` and ``{column: {label: keywords}}``.
        A column whose text and label lengths differ is reported on stdout
        and left out of both results.
    """
    CNTR_feature_dictionary = {}
    CNTR_words_dictionary = {}
    for column_name in column_LLR:
        labels = df[column_label]
        texts = df[column_name]
        number = number_of_kw.get(column_name)
        data_path = path_LLR.get(column_name)

        print("*** {} ***".format(column_name))
        if len(labels) != len(texts):
            # Report the sizes of the two columns. The earlier code referenced
            # ``label`` / ``text`` here, which are undefined (or stale) names.
            print('Error!')
            print('label: {}'.format(len(labels)))
            print('text: {}'.format(len(texts)))
            continue

        print('** Log Likelihood Ratio **')
        save_txt(data_path, labels, texts)
        kw_dicts = get_LLR_keyword(data_path, label_list, number)

        print('** Transformation **')
        CNTR_feature = {}
        CNTR_words = {}
        for label in label_list:
            kw_dict = kw_dicts.get(label)
            feature_output_dict, words_output_dict = Transformation(
                texts, label, kw_dict, model_d2v, model_w2v, num_of_similar
            )
            CNTR_feature.update(feature_output_dict)
            CNTR_words.update(words_output_dict)
        CNTR_feature_dictionary[column_name] = CNTR_feature
        CNTR_words_dictionary[column_name] = CNTR_words

    return CNTR_feature_dictionary, CNTR_words_dictionary

In [None]:
# Labels for which keyword sets are extracted (one keyword dict per label).
label_list = ["1", "0"]  
# Column of df that holds the label of each record.
column_label = 'code_final'
# Text columns to process.
# NOTE(review): 'Present lllness' is spelled with three lowercase L's; it
# presumably mirrors the real column name in df — verify before changing it.
column_LLR = ['Chief Complaint', 'Present lllness', 'Medical history']
# How many of the most similar keywords are kept for a text that contains no keyword.
num_of_similar = 5

# Per column: file that receives the label<TAB>text lines used for keyword extraction.
path_LLR = {
    column_LLR[0]: './Extra_Feature/for_LLR_chief.txt', 
    column_LLR[1]: './Extra_Feature/for_LLR_present.txt',
    column_LLR[2]: './Extra_Feature/for_LLR_past.txt'
}

# Per column: number of keywords requested for each label.
number_of_kw = {
    column_LLR[0]: 100, 
    column_LLR[1]: 100,
    column_LLR[2]: 15
}


# Run the full pipeline; both results are nested dicts keyed by column, then by label.
CNTR_feature_dictionary, CNTR_words_dictionary = CNTR(df, column_label, column_LLR, path_LLR, label_list, number_of_kw, model_d2v, model_w2v, num_of_similar)