### importing libraries

In [None]:
import pandas as pd
import pandasql as ps
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np


In [None]:
# Absolute location of the preprocessed CSV used as input for fine-tuning.
# NOTE(review): this is a machine-specific absolute path; it will only resolve
# on the original author's machine — consider a configurable data directory.
path = '/Users/cc/Desktop/Work/Uni/Berner FH/DataSets/mimicCSV/mimic-iv-3.1/hosp/shortform/preprocessed_for_bert.csv'
# Read the CSV into a DataFrame; later cells use its 'text' and 'encoded_text' columns.
balanced_df = pd.read_csv(path)

In [None]:
# Shuffle all rows, then keep a random subset of 10,000 of them. Both draws use
# fixed seeds so the same subset is selected on every run. The subset replaces
# balanced_df for the rest of the notebook.
shuffled_df = balanced_df.sample(frac=1, random_state=4322)
balanced_df = selected_df = shuffled_df.sample(n=10000, random_state=49)

In [None]:
# Display the (rows, columns) shape of the current balanced_df as a sanity check.
balanced_df.shape

In [None]:
# Pull the input texts and their labels out of the frame as plain Python lists.
data_texts = balanced_df['text'].tolist()
data_labels = balanced_df['encoded_text'].tolist()

# First split: hold out 20% of all rows as the validation set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=124,
)

# Second split: carve 1% of the remaining training rows off as the test set.
# NOTE(review): 1% yields a very small test set — confirm this is intended.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=124,
)

### Tokenization

In [None]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint
# (the same checkpoint name the model cell loads).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [None]:
# Tokenize each split with identical settings: truncate to at most 512 tokens
# and pad every sequence within a split to that split's longest sequence.
_tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, **_tokenizer_kwargs)
    for split_texts in (train_texts, val_texts, test_texts)
)



### tensorflow dataset

In [None]:
batch_size = 16


def _to_dataset(encodings, labels):
    """Pair tokenizer output with labels as an unbatched tf.data.Dataset.

    Each element is ``({'input_ids': ..., 'attention_mask': ...}, label)``.
    """
    features = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Only the training data is shuffled (buffer covers the full training set);
# validation and test batches keep their original order.
train_dataset = _to_dataset(train_encodings, train_labels).shuffle(len(train_texts)).batch(batch_size)
val_dataset = _to_dataset(val_encodings, val_labels).batch(batch_size)
test_dataset = _to_dataset(test_encodings, test_labels).batch(batch_size)



### model definition

In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# sized for two output classes (binary classification).
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
# Compile the model for fine-tuning.
# Use an explicit Adam instance with a small learning rate: the string 'adam'
# resolves to Keras' default learning rate (1e-3), which is far too large for
# fine-tuning a pretrained transformer and typically makes training diverge or
# collapse to predicting a single class. 5e-5 is within the range commonly
# used for BERT-style fine-tuning.
learning_rate = 5e-5
optimizer = Adam(learning_rate=learning_rate)
# from_logits=True because the loss is fed the model's raw (pre-softmax) scores;
# the sparse variant takes class-id labels rather than one-hot vectors.
loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])


In [None]:
# Fine-tune on the training batches for a fixed number of epochs, evaluating on
# the validation batches after each one; the returned History object holds the
# per-epoch metrics that the plotting cell reads.
epochs = 7
history = model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)


# Evaluate and save

In [None]:
# Score the fine-tuned model on the held-out test batches. The model was
# compiled with a single 'accuracy' metric, so index 1 of the result is read as
# the accuracy (index 0 presumably being the loss).
results = model.evaluate(test_dataset)
test_accuracy = results[1]
print(f"Test Accuracy: {test_accuracy:.2f}")

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
# NOTE(review): machine-specific absolute path — consider making it configurable.
model_dir = '/Users/cc/Desktop/Work/Uni/Berner FH/DataSets/mimicCSV/mimic-iv-3.1/hosp/shortform/BERTFTModel'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)


# visualize training performance

In [None]:
# Training vs. validation accuracy per epoch, taken from the fit history.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='Train Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


# Testing BERT

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

In [None]:

# Directory that receives the fine-tuned model and tokenizer files.
# NOTE(review): this is the same directory the earlier "Save model" cell writes
# to, so this cell saves the same artifacts a second time — confirm whether the
# intent here was to reload the saved model from disk instead.
save_path = '/Users/cc/Desktop/Work/Uni/Berner FH/DataSets/mimicCSV/mimic-iv-3.1/hosp/shortform/BERTFTModel'

# Save the model and the tokenizer into save_path.
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# Two hand-written example notes used to sanity-check the fine-tuned classifier.
# NOTE(review): the first example contains misspellings ("resusitate",
# "paliative"); they are left untouched because the strings are model input and
# changing them changes the tokenization — confirm whether they are intentional.
example_texts = [
    "The Patient had a traumatic head injury and suffered a cardiac arrest. Do not resusitate, paliative care",
    "Patient is stable and has no history of serious illnesses."
]


In [None]:
# Tokenize the example texts with the same truncation/padding/max_length
# settings used for the training data; return_tensors='tf' requests TensorFlow
# tensors so the result can be passed straight to the model.
encodings = tokenizer(example_texts, truncation=True, padding=True, max_length=512, return_tensors='tf')


In [None]:
# Run a forward pass of the model on the tokenized examples, passing the
# attention mask alongside the token ids.
outputs = model(encodings['input_ids'], attention_mask=encodings['attention_mask'])


In [None]:
# Get probabilities.
# `logits` was never defined in the notebook (NameError on a fresh kernel);
# take the raw class scores from the model output before normalizing them.
logits = outputs.logits
# Softmax over the last (class) axis turns the raw scores into per-class probabilities.
predictions = tf.nn.softmax(logits, axis=-1)


In [None]:
# Report, for each example text, its class probabilities and the most likely class.
for text, class_probs in zip(example_texts, predictions):
    print(f"Text: {text}")
    print(f"Predicted Probabilities: {class_probs.numpy()}")
    print(f"Predicted Label: {tf.argmax(class_probs).numpy()}")  # Class 0 or 1