In [0]:
# Pick the Google Drive project folder; the two collaborators keep it in different places.
monisha = False
if monisha:
  loc = '/content/drive/My Drive/6.871 ML with HC/Project/'
else:
  loc = '/content/drive/My Drive/Project/'

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pickle
import torch
import random
import ast
!pip install transformers
from transformers import *

# https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
# Recitation Example: https://colab.research.google.com/drive/1dluu2EDp9NuE8FzMnw4M_6lQ66lcX-nh#scrollTo=h83JKPFeTZ4k
# Use Bio_Clinical BERT (trained on MIMIC III) through the transformers library
# The tokenizer and the encoder are both loaded from the same Bio_ClinicalBERT checkpoint
# (tokenizer first, then model, as before).
tokenizer, model = (
    hub_class.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    for hub_class in (AutoTokenizer, AutoModel)
)

In [0]:
def obtain_BERT_embeddings_new(txt, abbrev, max_len=512):
  """Return the mean BERT embedding of `abbrev` as it occurs in `txt`.

  The text and the abbreviation are tokenized with the module-level `tokenizer`;
  the first position where the abbreviation's tokens occur in the text is located,
  the module-level `model` is run on the text, and the output vectors at those
  positions are averaged.

  Args:
    txt: text containing the abbreviation.
    abbrev: the abbreviation to embed.
    max_len: maximum number of tokens fed to the model (512 by default). Longer
      inputs are cut to the first `max_len - k` tokens with the abbreviation's
      `k` tokens appended at the end, and the embedding of that appended copy
      is returned.

  Returns:
    A 1-D numpy array (mean over the abbreviation's token vectors), or None when
    the abbreviation is empty, cannot be found in the tokenized text, or the
    model call raises IndexError. The inputs are printed in the failure cases.
  """
  tokenized_text = tokenizer.encode(txt, add_special_tokens=True)
  tokenized_abbrev = tokenizer.encode(abbrev, add_special_tokens=True)
  # Drop the first and last ids (the special tokens added by encode) to keep only
  # the abbreviation's own tokens.
  abbrev_pieces = tokenized_abbrev[1:-1]
  n_pieces = len(abbrev_pieces)

  # An empty abbreviation would trivially "match" at position 0 and produce a NaN mean,
  # so it is treated as not found.
  index = None
  if n_pieces:
    index = next(
        (pos for pos in range(len(tokenized_text))
         if tokenized_text[pos:pos + n_pieces] == abbrev_pieces),
        None,
    )
  if index is None:
    print(txt)
    print(tokenized_text)
    print(tokenized_abbrev)
    return None

  # Reuse the ids computed above instead of tokenizing the text a second time.
  input_id = torch.tensor([tokenized_text])
  start = index
  if input_id.shape[1] > max_len:
    # Too long for the model: keep the head of the text and append the abbreviation's
    # tokens at the end, then read the embedding of that appended copy.
    head = input_id[0][0:max_len - n_pieces].unsqueeze(0)
    tail = input_id[0][index:index + n_pieces].unsqueeze(0)
    input_id = torch.cat([head, tail], dim=1)
    start = max_len - n_pieces

  try:
    # Inference only: no autograd graph is needed for the forward pass.
    with torch.no_grad():
      hidden_states = model(input_id)[0][0]
    embedding = hidden_states[start:start + n_pieces].detach().numpy()
  except IndexError:
    print(txt)
    print(tokenized_text)
    print(tokenized_abbrev)
    print(index)
    return None
  return np.mean(embedding, axis=0)

In [0]:
# Load the four CSVs from the project data folder.
# temp_df is basically test preprocessing.
train, neg_train, temp_df, test = (
    pd.read_csv(loc + relative_path)
    for relative_path in (
        "data/train_mimic.csv",
        "data/neg_train_mimic.csv",
        "data/temp_test.csv",
        "data/test.csv",
    )
)

In [0]:
# Peek at the first rows of the negative training examples (head() shows 5 by default).
neg_train.head()

In [0]:
## Due to how we saved the .csv's, many entries are strings when they are supposed to be
## lists. This converts them back to lists.
def _parse_list_cell(cell):
  """Parse a stringified Python literal; leave cells that are not strings untouched.

  Skipping non-strings makes the conversion safe to re-run on already-converted columns,
  and a failure in one column can no longer leave the later columns unparsed.
  """
  return ast.literal_eval(cell) if isinstance(cell, str) else cell

for _frame, _list_columns in (
    (train, ["Abbreviation-embedding", "cui-embedding"]),
    (neg_train, ["Abbreviation-embedding", "incorrect-cui-embedding"]),
    (temp_df, ["Abbreviation-embedding", "CUI_embedding"]),
    (test, ["Abbreviation-embedding", "CUI_embedding", "feature_vector"]),
):
  for _column in _list_columns:
    _frame[_column] = _frame[_column].apply(_parse_list_cell)


# A feature vector is the abbreviation embedding followed by the CUI embedding
# (`+` on two list-valued columns concatenates the lists row by row).
train_final = train.drop(["Abbreviation", "Long-form", "Text", "used_jaccard", "cui"], axis=1)
train_final["feature_vector"] = train_final["Abbreviation-embedding"] + train_final["cui-embedding"]
neg_train["feature_vector"] = neg_train["Abbreviation-embedding"]+neg_train["incorrect-cui-embedding"]
combined_train_final = pd.concat([train_final[["label", "feature_vector"]], neg_train[["label","feature_vector"]]])

# Random split: each row goes to the training part with probability 0.85.
# NOTE(review): no random seed is set, so the split differs between runs.
mask = np.random.rand(len(combined_train_final)) < 0.85
ttrain = combined_train_final[mask]
tvalid = combined_train_final[~mask]
print('Train dataset size: ', len(ttrain))
print('Test dataset size: ', len(tvalid))

labels = ttrain.label.tolist()
features = ttrain["feature_vector"].tolist()

In [0]:
# Write temp_df back to the file it was loaded from. index=False keeps the row index out
# of the CSV; otherwise every load/save round trip adds another unnamed index column.
temp_df.to_csv(loc+'data/temp_test.csv', index=False)

In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV

In [0]:
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

def evaluate_clf(clf, training, test):
  """Fit `clf` on the training data and score its predictions on the test data.

  Args:
    clf: estimator exposing `fit(X, y)` and `predict(X)`.
    training: indexable whose item 0 is the features and item 1 the labels.
    test: same layout as `training`.

  Returns:
    Tuple `(accuracy, recall, precision, f1, clf)`, where `clf` is the fitted estimator.
  """
  train_features, train_labels = training[0], training[1]
  clf.fit(train_features, train_labels)
  predicted = clf.predict(test[0])
  expected = test[1]

  scores = {}
  for metric_name, metric_fn in (
      ("f1", f1_score),
      ("recall", recall_score),
      ("accuracy", accuracy_score),
      ("precision", precision_score),
  ):
    scores[metric_name] = metric_fn(expected, predicted)

  return scores["accuracy"], scores["recall"], scores["precision"], scores["f1"], clf

def generate_fake_data(temp_df, fakesize):
  """Build `fakesize` negative examples by pairing sampled abbreviation embeddings
  with randomly chosen CUI embeddings.

  Args:
    temp_df: frame with "Abbreviation-embedding", "Long_Form" and "CUI_embedding"
      columns (the embedding cells are expected to be lists so `+` concatenates them).
    fakesize: number of rows to sample from `temp_df` (sampled without replacement).

  Returns:
    A new frame with the sampled "Abbreviation-embedding" and "Long_Form", a random
    "CUI_embedding", "label" fixed to 0 and "feature_vector" set to the abbreviation
    embedding followed by the CUI embedding. `temp_df` itself is not modified.
  """
  # .copy() so the columns added below are written to an independent frame.
  fake_data = temp_df.sample(fakesize)[["Abbreviation-embedding", "Long_Form"]].copy()
  # Build the candidate pool once instead of once per row.
  cui_pool = temp_df["CUI_embedding"].tolist()
  fake_data["CUI_embedding"] = fake_data["Abbreviation-embedding"].apply(lambda _: random.choice(cui_pool))
  fake_data["label"] = 0
  fake_data["feature_vector"] = fake_data["Abbreviation-embedding"] + fake_data["CUI_embedding"]
  return fake_data

def compare_without_with_data(clf, CASIsize, weight, features, labels, ttrain, fake=False, fake_test_size=7500):
  """Retrain `clf` on `ttrain` augmented with extra examples and report its scores.

  A random ~75% of the module-level `test` frame is held out for evaluation. From the
  remaining rows, `CASIsize` examples are sampled and added to the training data
  `weight` times, together with `CASIsize * weight` generated negative examples
  (see `generate_fake_data`, which draws from the module-level `temp_df`).

  Args:
    clf: estimator passed to `evaluate_clf`.
    CASIsize: number of left-over `test` rows to add to the training data.
    weight: how many times those rows are repeated; also scales the number of
      generated negatives.
    features: unused; kept so existing calls keep working.
    labels: unused; kept so existing calls keep working.
    ttrain: base training frame with "feature_vector" and "label" columns.
    fake: when true, `fake_test_size` generated negatives are also appended to the
      evaluation set.
    fake_test_size: number of generated negatives added to the evaluation set when
      `fake` is true.

  Returns:
    Tuple `(accuracy, fitted_clf)`. Accuracy, F1, recall and precision are printed.
  """
  fakesize = CASIsize*weight
  msk = np.random.rand(len(test)) < 0.75
  new_test_7500 = test[msk]
  left_over = test[~msk]
  add_data = left_over.sample(CASIsize)[["feature_vector", "label"]]
  if fake:
    # Previously the concat ran unconditionally, so any call with fake=False failed
    # because fake_test had never been assigned.
    fake_test = generate_fake_data(temp_df, fake_test_size)
    new_test_7500 = pd.concat([new_test_7500, fake_test])
  fake_data = generate_fake_data(temp_df, fakesize)
  new_ttrain = pd.concat([ttrain]+[add_data]*weight + [fake_data])
  new_features = new_ttrain["feature_vector"].tolist()
  new_labels = new_ttrain["label"].tolist()

  new_accuracy, recall, precision, f1, clf = evaluate_clf(clf, [new_features, new_labels], [new_test_7500["feature_vector"].tolist(), new_test_7500["label"].tolist()])
  print("accuracy is ", new_accuracy)
  print("f1 score is ", f1)
  print("recall score is ", recall)
  print("precision score is", precision)
  return new_accuracy, clf

In [0]:
# Train and evaluate every classifier in `arr` for each (CASIsize, weight) setting.
# NOTE(review): this used to say "uncomment this if you want to train various models",
# but the code below is live and runs whenever the cell is executed.

from warnings import filterwarnings
# Adds an "ignore" filter for all warnings from here on.
filterwarnings('ignore')

arr = [RandomForestClassifier(), LogisticRegressionCV(), MLPClassifier((100,100,100,100))]
# Each tuple is (CASIsize, weight), passed positionally to compare_without_with_data.
randomweights = [(100,4), (200,4), (1000,4), (2000,4)]
# randomweights=[(0,1)]
# accs[i][j] is the accuracy for randomweights[i] with classifier arr[j].
accs = []
for w in randomweights:
  w_acc = []
  for c in arr:
    print(w)
    print(c.__class__.__name__)
    # The loop variable c is rebound to the classifier returned by the call.
    acc, c = compare_without_with_data(c, w[0], w[1], features, labels, ttrain, fake=True)
    w_acc.append(acc)
  accs.append(w_acc)

In [0]:
# Load the first performance log as a list of lines (line endings are kept).
with open(loc+"/data/perfresults1.txt", mode='r', encoding="ISO-8859-1") as f:
  lines1 = list(f)

In [0]:
# The log is a sequence of 3-line groups: dataset label, classifier name, and a line
# whose last whitespace-separated token is the accuracy.
# Records are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0, and row-by-row appends are quadratic anyway.
perf_records1 = []
# Stopping at len - 2 skips a trailing incomplete group (e.g. a final blank line).
for i in range(0, len(lines1) - 2, 3):
  # rstrip only removes real line endings; slicing off the last character would chop a
  # digit from the value when the file does not end with a newline.
  dataset, classifier, accuracy_line = (line.rstrip("\r\n") for line in lines1[i:i + 3])
  perf_records1.append({'Dataset': dataset, 'Classifier': classifier, 'Accuracy': float(accuracy_line.split()[-1])})
df1 = pd.DataFrame(perf_records1, columns=['Dataset', 'Classifier', 'Accuracy'])
df1.head()

In [0]:
import seaborn as sns
sns.set()
# Grouped bar chart: one group per dataset setting, one bar per classifier.
# The figure is resized through the Axes that seaborn returns; the previous
# plt.gcf() call relied on a `plt` name that is never imported in this notebook.
ax = sns.barplot(x="Dataset", y="Accuracy",
                 hue="Classifier",
                 data=df1)
ax.figure.set_size_inches(10, 8)

In [0]:
with open(loc+"/data/perfresults2.txt", 'r', encoding = "ISO-8859-1") as f:
  lines2 = f.readlines()
with open(loc+"/data/perfresults3.txt", 'r', encoding = "ISO-8859-1") as f:
  lines3 = f.readlines()
lines2 = lines2+lines3

# The logs are sequences of 6-line groups: dataset label, classifier name, then one line
# each for accuracy, F1, recall and precision (the value is the last token on the line).
# Records are collected in a list and the frame is built once: DataFrame.append was
# removed in pandas 2.0, and row-by-row appends are quadratic anyway.
metric_names = ['Accuracy', 'F1', 'Recall', 'Precision']
perf_records = []
# Stopping at len - 5 skips a trailing incomplete group (e.g. a final blank line).
for i in range(0, len(lines2) - 5, 6):
  # rstrip only removes real line endings; slicing off the last character would chop a
  # digit from the value when a file does not end with a newline.
  group = [line.rstrip("\r\n") for line in lines2[i:i + 6]]
  record = {'Dataset': group[0], 'Classifier': group[1]}
  for metric_name, metric_line in zip(metric_names, group[2:]):
    record[metric_name] = float(metric_line.split()[-1])
  perf_records.append(record)
df = pd.DataFrame(perf_records, columns=['Dataset', 'Classifier'] + metric_names)
df

In [0]:
import seaborn as sns
sns.set()
# Grouped bar chart of F1 per dataset setting and classifier.
# The figure is resized through the Axes that seaborn returns; the previous
# plt.gcf() call relied on a `plt` name that is never imported in this notebook.
ax = sns.barplot(x="Dataset", y="F1",
                 hue="Classifier",
                 data=df)
ax.set_title('F1-Score')
ax.figure.set_size_inches(10, 8)


In [0]:
# CASIsize = 1000
# weight = 0
# fakesize = CASIsize*weight
# msk = np.random.rand(len(test)) < 0.75
# new_test_7500 = test[msk]
# left_over = test[~msk]
# add_data = left_over.sample(CASIsize)[["feature_vector", "label"]]
# fake_data = generate_fake_data(temp_df, fakesize)
# new_ttrain = pd.concat([ttrain])
# new_features = new_ttrain["feature_vector"].tolist()
# new_labels = new_ttrain["label"].tolist()

# rf_clf2 = RandomForestClassifier(random_state=0)
# rf_clf2.fit(new_features, new_labels)
# rf_pred2 = rf_clf2.predict(new_test_7500["feature_vector"].tolist())
# print("Accuracy = ", sum([rf_pred2[i]==new_test_7500["label"].tolist()[i] for i in range(len(rf_pred2))])/len(rf_pred2))

## Checking that our model uses character-level information. 
Hypothesis: if the model uses character-level information, accuracy should be bad on randomized abbreviations.
We also generate acronym-based abbreviations and use them to discover 'new' abbreviations.

In [0]:
## Fix one prediction model (a 4x100 MLP) for the checks below; only the fitted
## classifier is kept, the accuracy is discarded.
c = MLPClassifier(hidden_layer_sizes=(100,100,100,100))
_, clf = compare_without_with_data(c, CASIsize=2000, weight=1, features=features, labels=labels, ttrain=ttrain)

In [0]:
def generate_abbreviation(words):
  """Derive an upper-case abbreviation from a long form.

  For two or more words the result is the first letter of each word; for a single
  word it is that word's first two characters; for an empty or all-whitespace
  string it is ''.

  Words are split on any run of whitespace, so repeated, leading or trailing
  spaces no longer produce empty tokens (which used to raise IndexError).
  """
  parts = words.split()
  if len(parts) >= 2:
    return ''.join(part[0] for part in parts).upper()
  if parts:
    return parts[0][:2].upper()
  return ''

import string

# Sample notes to probe. .copy() so the columns added below are written to an
# independent frame rather than to a slice of temp_df.
df = temp_df.sample(1000)[["Long_Form", "Abbr", "Abbr_In_Note", "Note_Text", "CUI_embedding", "Abbreviation-embedding" ]].copy()

# Two substitutions for the abbreviation as it appears in the note:
#  - a random upper-case string of the same length as the abbreviation,
#  - an acronym derived from the long form.
df["random-abbrev"] = df["Abbr"].apply(lambda x: ''.join(random.choices(string.ascii_uppercase, k=len(x))))
df["random-annotated-text"] = df.apply(lambda x: x['Note_Text'].replace(x["Abbr_In_Note"], x["random-abbrev"]), axis=1 )
df["acronym-abbrev"] =df["Long_Form"].apply(lambda x: generate_abbreviation(x))
df["acronym-annotated-text"] = df.apply(lambda x: x['Note_Text'].replace(x["Abbr_In_Note"], x["acronym-abbrev"]), axis=1 )

df["random-abbrev-embed"] = df.apply(lambda x: obtain_BERT_embeddings_new(x["random-annotated-text"], x["random-abbrev"]), axis=1)
df["acronym-abbrev-embed"] = df.apply(lambda x: obtain_BERT_embeddings_new(x["acronym-annotated-text"], x["acronym-abbrev"]), axis=1)

# obtain_BERT_embeddings_new returns None when it cannot locate the substituted
# abbreviation; np.concatenate cannot handle None, so those rows are dropped first.
has_embeddings = (
    df["random-abbrev-embed"].apply(lambda e: e is not None)
    & df["acronym-abbrev-embed"].apply(lambda e: e is not None)
)
if not has_embeddings.all():
  print("Dropping", int((~has_embeddings).sum()), "rows whose substituted abbreviation could not be embedded")
df = df[has_embeddings].copy()

# Feature vector = abbreviation embedding followed by the CUI embedding.
df["random_feature_vectors"] = df.apply(lambda x: np.concatenate([x["random-abbrev-embed"], x["CUI_embedding"]]), axis = 1)
df["feature_vectors"] = df.apply(lambda x: np.concatenate([x["Abbreviation-embedding"], x["CUI_embedding"]]), axis = 1)
df["acronym_feature_vectors"] = df.apply(lambda x: np.concatenate([x["acronym-abbrev-embed"], x["CUI_embedding"]]), axis = 1)

# One batched predict per variant; each cell holds a scalar label instead of a
# one-element array.
df["model_pred"] = clf.predict(list(df["feature_vectors"]))
df["random_pred"] = clf.predict(list(df["random_feature_vectors"]))
df["acronym_pred"] = clf.predict(list(df["acronym_feature_vectors"]))

results = df[["Long_Form", "Abbr", "Abbr_In_Note", "Note_Text", "random-abbrev", "acronym-abbrev", "model_pred", "random_pred", "acronym_pred"]]

In [0]:
# Fraction of rows predicted 1 when the abbreviation is replaced by a random string.
results.loc[results["random_pred"] == 1].shape[0] / results.shape[0]

In [0]:
# Fraction of rows predicted 1 when the abbreviation is replaced by the generated acronym.
results.loc[results["acronym_pred"] == 1].shape[0] / results.shape[0]

In [0]:
# Rows predicted 1 for a generated acronym that differs from the original abbreviation.
generated_abbreviations = results.loc[
    (results["acronym_pred"] == 1) & (results["acronym-abbrev"] != results["Abbr"])
]

In [0]:
# Display the rows selected above (generated acronym differs from the original
# abbreviation and was predicted 1).
generated_abbreviations

In [0]:
# All sampled rows whose generated acronym is one of the acronyms selected above.
results.loc[
    results["acronym-abbrev"].isin(set(generated_abbreviations["acronym-abbrev"]))
]