In [None]:
from get_data import get_note_data 
import pandas as pd
import make_models # for_passed_labels, for_one_label
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
from tensorflow.keras.optimizers import Adam
import multishot
# Label names shared by the whole notebook: they are passed to get_note_data,
# used as column keys of patient_notes_data, and form the prefix of each saved
# "<label>_model" directory.
LABELS = [
    "ABDOMINAL",
    "ADVANCED-CAD",
    "ALCOHOL-ABUSE",
    "ASP-FOR-MI",
    "CREATININE",
    "DIETSUPP-2MOS",
    "DRUG-ABUSE",
    "ENGLISH",
    "HBA1C",
    "KETO-1YR",
    "MAJOR-DIABETES",
    "MAKES-DECISIONS",
    "MI-6MOS",
]


In [None]:
# Load the note data for every label from the 'test' folder. Later cells index the
# result with "notes" and with each label name (presumably a pandas DataFrame — confirm
# against get_data.get_note_data).
patient_notes_data = get_note_data(LABELS, folder_name='test')
# Preview the first rows in index order; the sorted result is only displayed, not assigned.
patient_notes_data.sort_index().head()

In [None]:
# Example usage of make_models.for_passed_labels: builds a model for the single label
# 'ALCOHOL-ABUSE' via make_models.for_one_label, with epochs=2 and save=True.
# get_note_data(LABELS) is called without folder_name here, so it uses that function's
# default data folder (presumably the training split — confirm).
# NOTE(review): the following cell loads a "<label>_model" directory for every entry in
# LABELS, so models for the other labels must already exist on disk — confirm.
models = make_models.for_passed_labels(['ALCOHOL-ABUSE'], make_models.for_one_label, get_note_data(LABELS), epochs=2, save=True)

In [None]:
# Reload, for every label, the classifier and tokenizer stored in "<label>_model".
# Each entry of loaded_models is a (model, tokenizer) tuple keyed by label name.
loaded_models = {}
for label in LABELS:
    model_dir = f'{label}_model'
    loaded_models[label] = (
        TFAutoModelForSequenceClassification.from_pretrained(model_dir),
        AutoTokenizer.from_pretrained(model_dir),
    )
print(loaded_models)
    

In [None]:
def get_predictions(model, tokenizer, threshold=.5, verbose=True):
    """Predict a 0/1 value for every note in the module-level ``patient_notes_data``.

    The notes are tokenized (padded/truncated to 512 tokens), passed through
    ``model.predict``, and the logits are turned into probabilities with softmax.
    A note is assigned 1 when the probability at column index 0 is below
    ``threshold``, otherwise 0.

    Args:
        model: Object exposing ``predict``; its result must have a ``logits`` attribute.
        tokenizer: Callable that tokenizes a list of strings and returns TF tensors.
        threshold: Cut-off applied to the column-0 probability.
        verbose: Accepted for interface compatibility; not used by this function.

    Returns:
        A list of ints (0 or 1), one per note.
    """
    notes = list(patient_notes_data["notes"])
    encoded = tokenizer(
        notes,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_tensors="tf",
    )
    outputs = model.predict(dict(encoded))
    class_probs = tf.nn.softmax(outputs.logits).numpy()
    # Low probability in column 0 maps to a prediction of 1.
    return (class_probs[:, 0] < threshold).astype(int).tolist()

# Example getting predictions:
# label_to_predictions = {}
# for label, model_tokenizer in loaded_models.items():
#     print(f"Predicting for model: {label}")
#     label_to_predictions[label] = get_predictions(model_tokenizer[0], model_tokenizer[1])


In [None]:
def save_preds_and_acc(label_to_accuracy, label_to_predictions):
    """Write per-label accuracies and predictions to CSV files in the working directory.

    Args:
        label_to_accuracy: Mapping of label name to a single accuracy value; saved
            to ``accuracies.csv`` with one row per label and an ``accuracy`` column.
        label_to_predictions: Mapping of label name to a sequence of predictions;
            saved to ``predictions.csv`` with one row per label.
    """
    accuracy_frame = pd.DataFrame.from_dict(label_to_accuracy, orient='index', columns=['accuracy'])
    prediction_frame = pd.DataFrame.from_dict(label_to_predictions, orient='index')
    # Both frames are built before anything is written, so a bad input leaves no partial output.
    outputs = ((accuracy_frame, "accuracies.csv"), (prediction_frame, 'predictions.csv'))
    for frame, path in outputs:
        frame.to_csv(path)

In [None]:
# Read in prediction data:
# predictions.csv is the file written by save_preds_and_acc. It is read without
# index_col, so in every row the first cell holds the label name and the remaining
# cells hold that label's predictions.
dfp = pd.read_csv('predictions.csv')
label_to_predictions = {}
for index, row in dfp.iterrows():
    # Explicit positional access: plain row[0] on a string-labelled Series relies on
    # the deprecated integer-key positional fallback of Series.__getitem__.
    label_to_predictions[row.iloc[0]] = list(row.iloc[1:])

In [None]:
def get_accuracy_from_preds(predictions, true_labels, verbose=True, label=None):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predictions: Sequence (list or array) of predicted values.
        true_labels: Sequence (list or array) of reference values with the same
            shape as ``predictions``.
        verbose: When True, print the accuracy.
        label: Optional name printed in front of the accuracy. When omitted, a
            module-level variable named ``label`` is used if one exists (earlier
            call sites relied on such a loop variable); otherwise only the
            accuracy is printed.

    Returns:
        The mean of the element-wise equality, as computed by ``np.mean``.

    Raises:
        ValueError: If ``predictions`` and ``true_labels`` differ in shape.
    """
    # Convert both sides to arrays: comparing two plain lists with == yields a
    # single bool instead of an element-wise result.
    predicted = np.asarray(predictions)
    expected = np.asarray(true_labels)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"predictions and true_labels must have the same shape, "
            f"got {predicted.shape} and {expected.shape}"
        )
    accuracy = np.mean(predicted == expected)
    if verbose:
        if label is None:
            # Backward compatibility with callers that set a global `label`
            # before calling; avoids a NameError when no such global exists.
            label = globals().get("label")
        prefix = f"{label} " if label is not None else ""
        print(f"{prefix}Accuracy: {accuracy}")
    return accuracy

# Example getting accuracy from preds:
# label_to_accuracy = {}
# for label, predictions in label_to_predictions.items():
#     true_labels = np.array(patient_notes_data[label])
#     accuracy = get_accuracy_from_preds(predictions, true_labels)
#     label_to_accuracy[label] = accuracy

# Or: load previously saved accuracies from accuracies.csv (the file written by
# save_preds_and_acc). No index_col is given, so the label names are read as an
# ordinary first column rather than as the index.
dfa = pd.read_csv('accuracies.csv')

In [None]:
# Display summary statistics for the accuracies table loaded from accuracies.csv.
dfa.describe()

In [None]:
from sklearn.metrics import f1_score

# Pool the reference values and the predictions of every label into two flat
# lists (label by label, in LABELS order) and score them together as micro-F1.
y_true, y_pred = [], []
for label in LABELS:
    y_true += list(patient_notes_data[label])
    y_pred += list(label_to_predictions[label])
f1 = f1_score(y_true, y_pred, average='micro')
print('micro-f1:', f1)