Required packages:\
pandas==1.4.0\
numpy==1.21.5\
scikit-learn==1.0.2\
tensorflow==2.7.0\
torch==1.10.2\
transformers==4.17.0.dev0\
datasets==1.18.3\
textstat==0.7.2 (if running the ML part)\
xgboost==1.5.2 (if running the ML part)

In [1]:
import pandas as pd
import numpy as np

In [26]:
data = pd.read_csv("data/sample_full.csv", nrows=10)
print(data.head())



                                    Learning_outcome  Remember  Understand  \
0   Analyze the health economic implications of e...       NaN         NaN   
1   Apply research skills to operate effectively ...       NaN         NaN   
2   Assess and synthesise diverse information abo...       NaN         NaN   
3   Describe the general characteristics of the m...       NaN         1.0   
4   Evaluate the different models of perioperativ...       NaN         NaN   

   Apply  Analyze  Evaluate  Create  
0    NaN      1.0       NaN     NaN  
1    1.0      NaN       NaN     NaN  
2    NaN      NaN       1.0     1.0  
3    NaN      NaN       NaN     NaN  
4    NaN      NaN       1.0     NaN  


In [27]:
data.fillna({'Remember': 0, 'Understand': 0, 'Apply': 0, 'Analyze': 0, 'Evaluate': 0, 'Create':0}, inplace=True)

In [28]:
LIWC_data = pd.read_csv("data/LIWC2015 Results (Learning_outcome.csv).csv")
data = data.join(LIWC_data).drop(['A'], axis=1)

In [29]:
data.head()

Unnamed: 0,Learning_outcome,Remember,Understand,Apply,Analyze,Evaluate,Create,WC,Analytic,Clout,...,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,Analyze the health economic implications of e...,0.0,0.0,0.0,1.0,0.0,0.0,9,99.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Apply research skills to operate effectively ...,0.0,0.0,1.0,0.0,0.0,0.0,14,99.0,92.33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Assess and synthesise diverse information abo...,0.0,0.0,0.0,0.0,1.0,1.0,26,43.96,77.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Describe the general characteristics of the m...,0.0,1.0,0.0,0.0,0.0,0.0,23,99.0,50.0,...,8.7,0.0,0.0,0.0,0.0,4.35,0.0,0.0,0.0,0.0
4,Evaluate the different models of perioperativ...,0.0,0.0,0.0,0.0,1.0,0.0,10,98.58,15.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
labels = data[data.columns[1:7]].values.tolist()

In [31]:
data.columns[1:7]

Index(['Remember', 'Understand', 'Apply', 'Analyze', 'Evaluate', 'Create'], dtype='object')

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, cohen_kappa_score, f1_score

## ML Test

In [34]:
import textstat

In [35]:
def generateX(data_x, test_x, textual_column_index, start_index_LIWC, end_index_LIWC):
    column_names = []
    print("Getting Unigram...")
    uni_cv = CountVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    unigram = uni_cv.fit_transform(data_x[:, textual_column_index])
    unigram = unigram.toarray()
    unigram_test = uni_cv.transform(test_x[:,textual_column_index]).toarray()
    temp = uni_cv.get_feature_names_out().tolist()
    column_names += ["uni_"+name for name in temp]
    print("Getting Bigram...")
    bi_cv = CountVectorizer(stop_words='english', ngram_range=(2, 2), max_features=1000)
    bigram = bi_cv.fit_transform(data_x[:, textual_column_index])
    bigram = bigram.toarray()
    bigram_test = bi_cv.transform(test_x[:, textual_column_index]).toarray()
    temp = bi_cv.get_feature_names_out().tolist()
    column_names += ["bi_"+name for name in temp]
    print("Getting Tfidf...")
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_features=1000)
    t = tfidf.fit_transform(data_x[:, textual_column_index])
    t = t.toarray()
    t_test = tfidf.transform(test_x[:, textual_column_index]).toarray()
    temp = tfidf.get_feature_names_out().tolist()
    column_names += ["tfidf_"+name for name in temp]
    print("Getting ARI...")
    ari = [textstat.automated_readability_index(text) for text in data_x[:, textual_column_index]]
    ari_test = [textstat.automated_readability_index(text) for text in test_x[:, textual_column_index]]
    column_names.append("ari")
    combined_data_x = []
    combined_test_x = []
    print("Combining...")
    for i in range(len(data_x)):
        combined_data_x.append(unigram[i].tolist()
                              + bigram[i].tolist()
                              + t[i].tolist()
                              + [ari[i]]
                              + data_x[i, start_index_LIWC:end_index_LIWC].tolist())
    for i in range(len(test_x)):
        combined_test_x.append(unigram_test[i].tolist()
                              + bigram_test[i].tolist()
                              + t_test[i].tolist()
                              + [ari_test[i]]
                              + test_x[i, start_index_LIWC:end_index_LIWC].tolist())
    print("Generated feature shape is", np.array(combined_data_x).shape)
    print("Generated test feature is", np.array(combined_test_x).shape)
    return combined_data_x, column_names, combined_test_x

In [36]:
data.drop(columns=list(data.columns[1:7])).iloc[:, 0]

0     Analyze the health economic implications of e...
1     Apply research skills to operate effectively ...
2     Assess and synthesise diverse information abo...
3     Describe the general characteristics of the m...
4     Evaluate the different models of perioperativ...
5     explain key terms and concepts used to engage...
6     Identify an issue of relevance to the practic...
7     Identify the key features of the use of radia...
8     Recognise the key role that human factors pla...
9     Rigorously quantify the mixing of various cla...
Name: Learning_outcome, dtype: object

In [37]:
train_x, test_x, train_y, test_y = train_test_split(data.drop(columns=list(data.columns[1:8])), data[data.columns[1:7]], test_size=0.2, random_state=666)

In [38]:
np.unique(train_y['Remember'].tolist(), return_counts=True), np.unique(test_y['Remember'].tolist(), return_counts=True)

((array([0., 1.]), array([7, 1])), (array([0.]), array([2])))

In [39]:
np.unique(train_y['Understand'].tolist(), return_counts=True), np.unique(test_y['Understand'].tolist(), return_counts=True)

((array([0., 1.]), array([7, 1])), (array([0., 1.]), array([1, 1])))

In [40]:
np.unique(train_y['Apply'].tolist(), return_counts=True), np.unique(test_y['Apply'].tolist(), return_counts=True)

((array([0., 1.]), array([6, 2])), (array([0.]), array([2])))

In [41]:
np.unique(train_y['Analyze'].tolist(), return_counts=True), np.unique(test_y['Analyze'].tolist(), return_counts=True)

((array([0., 1.]), array([6, 2])), (array([0., 1.]), array([1, 1])))

In [42]:
np.unique(train_y['Evaluate'].tolist(), return_counts=True), np.unique(test_y['Evaluate'].tolist(), return_counts=True)

((array([0., 1.]), array([6, 2])), (array([0.]), array([2])))

In [43]:
np.unique(train_y['Create'].tolist(), return_counts=True), np.unique(test_y['Create'].tolist(), return_counts=True)

((array([0., 1.]), array([7, 1])), (array([0.]), array([2])))

In [44]:
one_hot = []
for d in data[data.columns[1:7]].values:
    one_hot.append(np.array2string(d).count("1"))
np.unique(one_hot, return_counts=True)

(array([1, 2]), array([9, 1]))

In [45]:
ml_train_x, column_names, ml_test_x = generateX(train_x.to_numpy(), test_x.to_numpy(), 0, 1, 94)

Getting Unigram...
Getting Bigram...
Getting Tfidf...
Getting ARI...
Combining...
Generated feature shape is (8, 326)
Generated test feature is (2, 326)


In [46]:
column_names += data.columns[7:].tolist()

In [47]:
rf = RandomForestClassifier()
rf.fit(ml_train_x, train_y)

RandomForestClassifier()

In [48]:
pred_y = rf.predict(ml_test_x)

In [49]:
print(classification_report(test_y, pred_y, output_dict=False, target_names=list(data.columns[1:7]), digits=3))

              precision    recall  f1-score   support

    Remember      0.000     0.000     0.000         0
  Understand      0.000     0.000     0.000         1
       Apply      0.000     0.000     0.000         0
     Analyze      0.000     0.000     0.000         1
    Evaluate      0.000     0.000     0.000         0
      Create      0.000     0.000     0.000         0

   micro avg      0.000     0.000     0.000         2
   macro avg      0.000     0.000     0.000         2
weighted avg      0.000     0.000     0.000         2
 samples avg      0.000     0.000     0.000         2



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
pred_score_y = rf.predict_proba(ml_test_x)

In [51]:
np.array(test_x).shape

(2, 93)

In [52]:
np.array(pred_score_y).shape

(6, 2, 2)

In [53]:
pred_score_y = np.transpose([score[:, 1] for score in rf.predict_proba(ml_test_x)])

In [54]:
roc_auc_score(test_y, pred_score_y, average=None)

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
f1_score(test_y, pred_y, average="micro")

In [None]:
accuracy_score(test_y, pred_y)

In [None]:
ml_result_df = pd.DataFrame(data=pred_y, columns=data.columns[1:7])

In [55]:
ml_result_df

NameError: name 'ml_result_df' is not defined

In [56]:
ml_golden_df = pd.DataFrame(data=test_y, columns=data.columns[1:7])

In [57]:
print(accuracy_score(ml_golden_df['Remember'].tolist(), ml_result_df['Remember'].tolist()))
print(accuracy_score(ml_golden_df['Understand'].tolist(), ml_result_df['Understand'].tolist()))
print(accuracy_score(ml_golden_df['Apply'].tolist(), ml_result_df['Apply'].tolist()))
print(accuracy_score(ml_golden_df['Analyze'].tolist(), ml_result_df['Analyze'].tolist()))
print(accuracy_score(ml_golden_df['Evaluate'].tolist(), ml_result_df['Evaluate'].tolist()))
print(accuracy_score(ml_golden_df['Create'].tolist(), ml_result_df['Create'].tolist()))

NameError: name 'ml_result_df' is not defined

In [58]:
print(cohen_kappa_score(ml_golden_df['Remember'].tolist(), ml_result_df['Remember'].tolist()))
print(cohen_kappa_score(ml_golden_df['Understand'].tolist(), ml_result_df['Understand'].tolist()))
print(cohen_kappa_score(ml_golden_df['Apply'].tolist(), ml_result_df['Apply'].tolist()))
print(cohen_kappa_score(ml_golden_df['Analyze'].tolist(), ml_result_df['Analyze'].tolist()))
print(cohen_kappa_score(ml_golden_df['Evaluate'].tolist(), ml_result_df['Evaluate'].tolist()))
print(cohen_kappa_score(ml_golden_df['Create'].tolist(), ml_result_df['Create'].tolist()))

NameError: name 'ml_result_df' is not defined

## BERT

In [59]:
import torch
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers import TFBertPreTrainedModel, TFBertMainLayer, InputFeatures
from datasets import load_metric, list_metrics

In [60]:
class EncodeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)




In [61]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', problem_type="multi_label_classification")
model = AutoModelForSequenceClassification.from_pretrained('multilabel/checkpoint-250', local_files_only=True)



OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like multilabel/checkpoint-250 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [38]:
train_x, test_x, train_y, test_y = train_test_split(data['Learning_outcome'].tolist(), labels, test_size=0.2, random_state=666)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=666)

In [39]:
train_encoded = tokenizer(train_x, truncation=True, padding=True, max_length=100)
val_encoded = tokenizer(val_x, truncation=True, padding=True, max_length=100)
test_encoded = tokenizer(test_x, truncation=True, padding=True, max_length=100)

In [40]:
train_set, val_set, test_set = EncodeDataset(train_encoded, train_y), EncodeDataset(val_encoded, val_y), EncodeDataset(test_encoded, test_y)

In [41]:
training_args = TrainingArguments(
        output_dir='multilabel',          # output directory
        overwrite_output_dir=True,
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=64,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=5,                # number of warmup steps for learning rate scheduler
        weight_decay=0.05,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
        evaluation_strategy="steps",
        save_strategy="steps",
        save_steps=10,
        load_best_model_at_end=True
    )

In [42]:
def getClassResult(predicted):
    results = []
    for probs in predicted.numpy():
        result = []
        for prob in probs:
            if prob < 0.5:
                result.append(0)
            else:
                result.append(1)
        results.append(result)
    return results

metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = tf.keras.activations.sigmoid(logits)
    predicted = getClassResult(predictions)
    return metric.compute(predictions=predicted, references=labels, average="micro")

In [43]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_set, eval_dataset=val_set, callbacks=[EarlyStoppingCallback(early_stopping_patience=5)])

In [44]:
trainer.train()

***** Running training *****
  Num examples = 13683
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 642


Step,Training Loss,Validation Loss
10,0.3287,0.320655
20,0.3016,0.272382
30,0.2772,0.240769
40,0.2299,0.2164
50,0.221,0.19418
60,0.1868,0.182231
70,0.1865,0.169153
80,0.1703,0.163196
90,0.1611,0.160195
100,0.1603,0.149396


***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-10
Configuration saved in multilabel/checkpoint-10/config.json
Model weights saved in multilabel/checkpoint-10/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-20
Configuration saved in multilabel/checkpoint-20/config.json
Model weights saved in multilabel/checkpoint-20/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-30
Configuration saved in multilabel/checkpoint-30/config.json
Model weights saved in multilabel/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3421
  Batch size = 64
Saving model checkpoint to multilabel/checkpoint-40
Configuration saved in multilabel/checkpoint-40/config.json
Model weights saved in multilabel/checkpoint-40/pytorch_model.bin
****

TrainOutput(global_step=300, training_loss=0.15403384764989217, metrics={'train_runtime': 4311.8885, 'train_samples_per_second': 9.52, 'train_steps_per_second': 0.149, 'total_flos': 986033813713200.0, 'train_loss': 0.15403384764989217, 'epoch': 1.4})

In [44]:
logits = trainer.predict(test_set)

***** Running Prediction *****
  Num examples = 4276
  Batch size = 64


In [45]:
logits.predictions.shape

(4276, 6)

In [46]:
predicted = tf.keras.activations.sigmoid(logits.predictions)

Metal device set to: Apple M1 Pro


2022-03-10 00:49:38.848119: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-03-10 00:49:38.848561: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [47]:
predicted.numpy()

array([[0.01409502, 0.9816279 , 0.01466793, 0.0106061 , 0.01333568,
        0.01498337],
       [0.01131314, 0.97574437, 0.01378885, 0.00961971, 0.01381588,
        0.01985262],
       [0.01220088, 0.25224018, 0.02930349, 0.01424369, 0.41727734,
        0.00650762],
       ...,
       [0.00437115, 0.01897231, 0.95080817, 0.00888529, 0.01537037,
        0.03060839],
       [0.60755473, 0.33469725, 0.02623363, 0.0142573 , 0.8560901 ,
        0.01503711],
       [0.00936071, 0.02604666, 0.5798291 , 0.02834429, 0.9835946 ,
        0.05021281]], dtype=float32)

In [48]:
predicted_label = getClassResult(predicted)

In [49]:
count = 0
for pred in predicted_label:
    if pred.count(1) > 1:
        count += 1
count

453

In [50]:
print(classification_report(test_y, predicted_label, output_dict=False, target_names=list(data.columns[1:7]), digits=3))

              precision    recall  f1-score   support

    Remember      0.860     0.852     0.856       237
  Understand      0.921     0.918     0.920      1200
       Apply      0.929     0.895     0.912      1216
     Analyze      0.939     0.877     0.907       701
    Evaluate      0.947     0.914     0.930       799
      Create      0.915     0.832     0.872       739

   micro avg      0.926     0.890     0.907      4892
   macro avg      0.919     0.881     0.899      4892
weighted avg      0.926     0.890     0.907      4892
 samples avg      0.919     0.907     0.907      4892



  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
roc_auc_score(test_y, predicted.numpy(), average=None)

array([0.98839375, 0.98205773, 0.97611275, 0.98406073, 0.98888998,
       0.97096306])

In [58]:
accuracy_score(np.array(test_y), predicted_label)

0.8528999064546305

In [52]:
dl_result_df = pd.DataFrame(data=predicted_label, columns=data.columns[1:7])

In [54]:
print(accuracy_score(ml_golden_df['Remember'].tolist(), dl_result_df['Remember'].tolist()))
print(accuracy_score(ml_golden_df['Understand'].tolist(), dl_result_df['Understand'].tolist()))
print(accuracy_score(ml_golden_df['Apply'].tolist(), dl_result_df['Apply'].tolist()))
print(accuracy_score(ml_golden_df['Analyze'].tolist(), dl_result_df['Analyze'].tolist()))
print(accuracy_score(ml_golden_df['Evaluate'].tolist(), dl_result_df['Evaluate'].tolist()))
print(accuracy_score(ml_golden_df['Create'].tolist(), dl_result_df['Create'].tolist()))

0.9840972871842844
0.9550982226379794
0.9506548175865295
0.970533208606174
0.9742750233863424
0.9576707202993452


In [55]:
print(cohen_kappa_score(ml_golden_df['Remember'].tolist(), dl_result_df['Remember'].tolist()))
print(cohen_kappa_score(ml_golden_df['Understand'].tolist(), dl_result_df['Understand'].tolist()))
print(cohen_kappa_score(ml_golden_df['Apply'].tolist(), dl_result_df['Apply'].tolist()))
print(cohen_kappa_score(ml_golden_df['Analyze'].tolist(), dl_result_df['Analyze'].tolist()))
print(cohen_kappa_score(ml_golden_df['Evaluate'].tolist(), dl_result_df['Evaluate'].tolist()))
print(cohen_kappa_score(ml_golden_df['Create'].tolist(), dl_result_df['Create'].tolist()))

0.8475165217354823
0.8886774810112577
0.8773959293050357
0.8895938602599291
0.9141876451080061
0.8464440044283781
