# Changes made by kts4 / noami2
- `glove` was not available, so `mittens` had to be installed instead
  - `from mittens import GloVe as glove`
- We do not have the `FastText` model and so all references to it and combined model have to be commented out
- The calls to access the `Word2vec` and `FastText` models needed to be updated
  - `w2vec[...]` changed to `w2vec.wv[...]` and `fasttext[...]` changed to `fasttext.wv[...]`
- In Python 3, `map` returns a lazy iterator, so its result can no longer be passed directly to `np.asarray`
  - So, `t = np.asarray(map(mean, zip(*avg)))` changed to `t = np.asarray(list(map(mean, zip(*avg))))`
- Accessing `FastText` elements raised an `IndexError`
  - These calls are now wrapped in a `try` block
- `new_word2vec_dict` was never initialised
  - Should be a copy of `new_word2vec` with commonalities between it and `FastText` model removed
  - Need to ensure the `diff` set is actually useful
- Changes to newlines / spacings etc.

In [None]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe as glove
from tqdm import tqdm
# import glove
# from glove import Corpus
import torch

import collections
import gc 

import warnings
warnings.filterwarnings('ignore')

## Specify what to Run

In [None]:
# Switches selecting which embedding pipelines the cells below execute.
# Each flag gates both the model loading and the matching "Running ..." cell.
run_word2vec = True
run_fastText = True
# The combined pipeline looks up vectors in both the Word2Vec and the
# FastText model.
run_combined = True
# The BERT pipelines load pretrained models via `from_pretrained`.
run_blueBERT = False
run_clinicalBERT = False

In [None]:
# DataFrame of notes with their med7 NER output (read from a local pickle;
# only load pickles from a trusted source).
new_notes = pd.read_pickle("data/ner_df.p") # med7

# The combined pipeline reads from BOTH models, so each model is loaded when
# either its own flag or `run_combined` is set. Loading only on the model's
# own flag left the name undefined for a combined-only run.
if run_word2vec or run_combined:
    w2vec = Word2Vec.load("embeddings/word2vec.model")
if run_fastText or run_combined:
    fasttext = FastText.load("embeddings/fasttext.model")

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the BlueBERT tokenizer and model only when that pipeline is enabled.
if run_blueBERT:
    bluebert_tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12")
    bluebert_model = AutoModel.from_pretrained("bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12")

# Load the ClinicalBERT tokenizer and model only when that pipeline is enabled.
if run_clinicalBERT:
    clinicalbert_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    clinicalbert_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# Collect the index labels of notes whose NER result is empty, then remove
# those rows from the frame in place.
null_index_list = [
    row.Index
    for row in new_notes.itertuples()
    if len(row.ner) == 0
]
new_notes.drop(null_index_list, inplace=True)

In [None]:
# Maps SUBJECT_ID -> flat list of every NER entity found across all of that
# patient's notes, in row order.
med7_ner_data = {}

for ii in new_notes.itertuples():
    # `ii.ner` is a list of groups, each group a list of entities; flatten it.
    # The row tuple already carries the value, so no extra `.loc` lookup (and
    # no exception swallowing around it) is needed.
    note_entities = [entity for group in ii.ner for entity in group]

    # A patient may own several notes: extend the existing list, or start one.
    med7_ner_data.setdefault(ii.SUBJECT_ID, []).extend(note_entities)

In [None]:
# Persist the SUBJECT_ID -> entity-list dictionary built above.
pd.to_pickle(med7_ner_data, "data/new_ner_word_dict.pkl")

In [None]:
def mean(a):
    """Return the arithmetic mean of the sized iterable *a*.

    Raises ZeroDivisionError when *a* is empty.
    """
    total = sum(a)
    count = len(a)
    return total / count

In [None]:
# Parallel lists: the embedding cells zip them together, pairing each entity
# dictionary with the label that is later used in the output file names.
data_types = [med7_ner_data]
data_names = ["new_ner"]

## Running Word2Vec

In [None]:
if run_word2vec:
    print("w2vec starting..")
    for data, names in zip(data_types, data_names):

        # patient id -> list of Word2Vec vectors, one per resolvable entity
        new_word2vec = {}
        for k, v in tqdm(data.items()):

            patient_temp = []
            for i in v:
                # First choice: the whole entity text is a vocabulary entry.
                # Only lookup failures are tolerated, so genuine errors (for
                # example an unloaded model) are no longer silently hidden.
                try:
                    patient_temp.append(w2vec.wv[i[0]])
                    continue
                except (KeyError, IndexError):
                    pass

                # Fallback for multi-word entities: average the vectors of
                # the individual words that are in the vocabulary.
                words = i[0].split(" ")
                if len(words) <= 1:
                    continue

                avg = []
                for each_word in words:
                    try:
                        avg.append(w2vec.wv[each_word])
                    except (KeyError, IndexError):
                        pass
                if not avg:
                    # None of the words are known: skip this entity.
                    continue

                # Component-wise mean over the collected word vectors.
                avg = np.asarray(avg)
                t = np.asarray(list(map(mean, zip(*avg))))
                patient_temp.append(t)

            # Patients with no resolvable entity are left out entirely.
            if len(patient_temp) == 0:
                continue
            new_word2vec[k] = patient_temp

    print("w2vec finished")

## Running FastText

In [None]:
if run_fastText:
    print("fasttext starting..")
    for data, names in zip(data_types, data_names):

        # patient id -> list of FastText vectors, one per resolvable entity
        new_fasttextvec = {}

        for k, v in tqdm(data.items()):

            patient_temp = []

            for i in v:
                # Entities the model cannot resolve are skipped. Only lookup
                # failures are tolerated (the change notes at the top of this
                # file report IndexError from FastText access); other errors
                # now surface instead of being swallowed.
                try:
                    patient_temp.append(fasttext.wv[i[0]])
                except (KeyError, IndexError):
                    pass

            # Patients with no resolvable entity are left out entirely.
            if len(patient_temp) == 0:
                continue
            new_fasttextvec[k] = patient_temp

    print("fasttext finished")

## Running Combined - Word2Vec & FastText

In [None]:
if run_combined:
    print("combined starting..")
    for data, names in zip(data_types, data_names):

        # patient id -> list of concatenated FastText + Word2Vec vectors
        new_concatvec = {}

        for k, v in tqdm(data.items()):
            patient_temp = []
            for i in v:
                # --- Word2Vec part -------------------------------------
                # Only lookup failures are tolerated; anything else (for
                # example an unloaded model) is raised rather than turned
                # into a zero vector.
                try:
                    w2vec_temp = w2vec.wv[i[0]]
                except (KeyError, IndexError):
                    # Fallback for multi-word entities: average the vectors
                    # of the individual words that are in the vocabulary.
                    avg = []
                    words = i[0].split(" ")
                    if len(words) > 1:
                        for each_word in words:
                            try:
                                avg.append(w2vec.wv[each_word])
                            except (KeyError, IndexError):
                                pass

                    if avg:
                        avg = np.asarray(avg)
                        w2vec_temp = np.asarray(list(map(mean, zip(*avg))))
                    else:
                        # Nothing resolvable: zero placeholder sized from the
                        # model rather than a hard-coded 100.
                        w2vec_temp = [0] * w2vec.wv.vector_size

                # --- FastText part -------------------------------------
                try:
                    fasttemp = fasttext.wv[i[0]]
                except (KeyError, IndexError):
                    # NOTE(review): this appends a single scalar 0, so the
                    # result is one element longer than the Word2Vec vector
                    # and shorter than the normal concatenation. Kept as-is
                    # because downstream expectations are not visible here;
                    # confirm whether zero-padding to the FastText size was
                    # intended.
                    appended = np.append(w2vec_temp, 0)
                else:
                    appended = np.append(fasttemp, w2vec_temp, 0)

                patient_temp.append(appended)

            if len(patient_temp) == 0:
                continue
            new_concatvec[k] = patient_temp

    print("combined finished")



## Running BlueBERT

In [None]:
if run_blueBERT:
    print("BlueBERT starting..")
    for data, names in zip(data_types, data_names):

        # patient id -> list of per-entity token-embedding tensors
        new_bluebert = {}

        # Inference only: without no_grad every stored embedding would keep
        # its autograd graph alive, inflating memory use and the pickle.
        with torch.no_grad():
            for k, v in tqdm(data.items()):
                patient_temp = []
                for i in v:
                    input_ids = bluebert_tokenizer.encode(i[0], add_special_tokens=True)
                    # First model output, first (only) batch item; [1:-1]
                    # drops the first and last positions (presumably the
                    # special tokens added by the tokenizer).
                    embeddings = bluebert_model(torch.tensor([input_ids]))[0][0][1:-1]
                    patient_temp.append(embeddings)
                if len(patient_temp) == 0:
                    continue
                new_bluebert[k] = patient_temp

    print("BlueBERT finished")
    

## Running ClinicalBERT

In [None]:
if run_clinicalBERT:
    print("ClinicalBERT starting..")
    for data, names in zip(data_types, data_names):

        # patient id -> list of per-entity token-embedding tensors
        new_clinicalbert = {}

        # Inference only: without no_grad every stored embedding would keep
        # its autograd graph alive, inflating memory use and the pickle.
        with torch.no_grad():
            for k, v in tqdm(data.items()):
                patient_temp = []
                for i in v:
                    input_ids = clinicalbert_tokenizer.encode(i[0], add_special_tokens=True)
                    # First model output, first (only) batch item; [1:-1]
                    # drops the first and last positions (presumably the
                    # special tokens added by the tokenizer).
                    embeddings = clinicalbert_model(torch.tensor([input_ids]))[0][0][1:-1]
                    patient_temp.append(embeddings)
                if len(patient_temp) == 0:
                    continue
                new_clinicalbert[k] = patient_temp

    print("ClinicalBERT finished")

Save the results

In [None]:
# print(len(new_word2vec), len(new_fasttextvec), len(new_concatvec), len(new_bluebert), len(new_clinicalbert))
# Write out every embedding dictionary whose pipeline ran. `names` is the
# data-set label left bound by the most recent embedding loop.
if run_word2vec:
    pd.to_pickle(new_word2vec, f"data/{names}_word2vec_dict.pkl")
if run_fastText:
    pd.to_pickle(new_fasttextvec, f"data/{names}_fasttext_dict.pkl")
if run_combined:
    pd.to_pickle(new_concatvec, f"data/{names}_combined_dict.pkl")
if run_blueBERT:
    pd.to_pickle(new_bluebert, f"data/{names}_bluebert_dict.pkl")
if run_clinicalBERT:
    pd.to_pickle(new_clinicalbert, f"data/{names}_clinicalbert_dict.pkl")

In [None]:
if run_word2vec and run_fastText and run_combined:

    # Restrict all three dictionaries to the patients present in every one
    # of them, then save the reduced ("limited") versions.
    new_fasttextvec_keys = set(new_fasttextvec)
    new_word2vec_keys = set(new_word2vec)
    new_concatvec_keys = set(new_concatvec)
    intersection_keys = new_fasttextvec_keys & new_word2vec_keys & new_concatvec_keys

    print("Lengths before: {}, {}, {}".format(len(new_word2vec), len(new_fasttextvec), len(new_concatvec)))

    # Remove, in place, every patient that is not shared by all three.
    for vec_dict, vec_keys in (
        (new_fasttextvec, new_fasttextvec_keys),
        (new_word2vec, new_word2vec_keys),
        (new_concatvec, new_concatvec_keys),
    ):
        for i in vec_keys - intersection_keys:
            del vec_dict[i]

    print("Lengths after:  {}, {}, {}".format(len(new_word2vec), len(new_fasttextvec), len(new_concatvec)))

    pd.to_pickle(new_word2vec, "data/new_ner_word2vec_limited_dict.pkl")
    pd.to_pickle(new_fasttextvec, "data/new_ner_fasttext_limited_dict.pkl")
    pd.to_pickle(new_concatvec, "data/new_ner_combined_limited_dict.pkl")
    print("done with writing the files")