# Installing Dependencies
This step makes sure that the dependencies used in the notebook are installed. If the conda environment is already active, the installation should report that the requirements are already satisfied.

In [None]:
!pip install pytorch-lightning==1.6.0 pandas transformers==4.18.0 lightning-flash==0.7.3 nltk==3.7 gensim==4.1.2 scikit-learn==1.0.2 seaborn==0.11.2 'lightning-flash[text]'

# Cloning the dataset
Since the dataset is publicly available on GitHub, it is cloned as its own repository inside the project directory and renamed to 'data'.


In [None]:
import os
!git clone https://github.com/neffjulian/MLfHC-Project-2
os.chdir('MLfHC-Project-2')
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct
os.rename('pubmed-rct', 'data')
os.chdir('notebooks')

Cloning into 'MLfHC-Project-2'...
remote: Enumerating objects: 224, done.[K
remote: Counting objects: 100% (224/224), done.[K
remote: Compressing objects: 100% (145/145), done.[K
remote: Total 224 (delta 106), reused 175 (delta 70), pack-reused 0[K
Receiving objects: 100% (224/224), 364.00 KiB | 9.58 MiB/s, done.
Resolving deltas: 100% (106/106), done.
Cloning into 'pubmed-rct'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 33 (delta 5), reused 5 (delta 5), pack-reused 25[K
Unpacking objects: 100% (33/33), done.


# Creating the Dataset Object
So that PyTorch and its higher-level library Flash can interpret the dataset, we use a small class to load the raw files and create the CSV files, which can then be passed to the data loader.

In [None]:
import os
import re
import torch
import numpy as np
import pandas as pd

from argparse import ArgumentParser

from pytorch_lightning import LightningDataModule
from torch.utils.data import Dataset, DataLoader

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from gensim.models import Word2Vec, phrases
from sklearn.base import BaseEstimator, TransformerMixin
# Download every NLTK resource. In the code visible in this notebook, NLTK is
# used for the English stop-word list, the Porter stemmer and the WordNet
# lemmatizer.
# NOTE(review): 'all' is a very large download; fetching only the resources
# that are actually used would presumably be enough -- confirm that no other
# cell depends on additional corpora before narrowing it.
nltk.download('all')

In [None]:
class PreprocessData():
    """Read the raw dataset text files and write them out as CSV files.

    The raw files are expected at ``<data_dir>/<dataset>/<name>.txt`` and the
    CSV files are written to ``<data_dir>/processed_<dataset>/<name>.csv``.
    """

    def __init__(self, data_dir="data", dataset="PubMed_20k_RCT", lower=True, rem_stop_words=True, stemming=True, lemmatisation=False, **kwargs):
        """Store the configuration and create the output directory.

        Args:
            data_dir: Directory that contains the dataset folder.
            dataset: Name of the dataset folder inside ``data_dir``.
            lower: Lowercase words in ``preprocess_sentence``.
            rem_stop_words: Drop English stop words in ``preprocess_sentence``.
            stemming: Apply the Porter stemmer in ``preprocess_sentence``.
            lemmatisation: Apply the WordNet lemmatizer in
                ``preprocess_sentence``.
            **kwargs: Accepted and ignored.
        """
        self.data_dir = data_dir
        self.dataset = dataset
        self.lower = lower
        self.rem_stop_words = rem_stop_words
        self.stemming = stemming
        self.lemmatisation = lemmatisation

        os.makedirs(os.path.join(self.data_dir, "processed_" +
                    self.dataset), exist_ok=True)

    def read(self, file_name):
        """Parse ``<file_name>.txt`` into a DataFrame of labels and sentences.

        Lines starting with ``#`` and blank lines are skipped. Every other
        line is split at its first tab into a label and a sentence; the
        sentence keeps whatever follows, including the trailing newline.

        Args:
            file_name: File name without the ``.txt`` extension.

        Returns:
            DataFrame with the columns ``Labels`` and ``Sentences``.

        Raises:
            ValueError: If a data line contains no tab character.
        """
        path = os.path.join(self.data_dir, self.dataset, file_name + ".txt")
        labels, sentences = [], []
        with open(path, "r") as f:
            for line in f:
                if line.startswith("#") or line.strip() == "":
                    continue
                # Split on the first tab only so that a tab inside the
                # sentence does not break the two-value unpacking.
                label, sentence = line.split("\t", 1)
                labels.append(label)
                sentences.append(sentence)

        return pd.DataFrame({"Labels": labels, "Sentences": sentences})

    def preprocess_sentence(self, sentence: str) -> str:
        """Clean one sentence according to the configured flags.

        Punctuation is always removed from every whitespace-separated word.
        Lowercasing, stop-word removal, stemming and lemmatisation are applied
        in that order when the corresponding flag is set.

        Args:
            sentence: The raw sentence.

        Returns:
            The cleaned words joined by single spaces.
        """
        strip_punctuation = str.maketrans("", "", string.punctuation)
        words = [word.translate(strip_punctuation) for word in sentence.split()]

        # Every stage builds a list. A lazy ``map`` would be exhausted by the
        # first join and leave nothing to return.
        if self.lower:
            words = [w.lower() for w in words]

        if self.rem_stop_words:
            stop_words = set(stopwords.words('english'))
            words = [w for w in words if w.lower() not in stop_words]

        if self.stemming:
            ps = PorterStemmer()
            words = [ps.stem(w) for w in words]

        if self.lemmatisation:
            lz = WordNetLemmatizer()
            words = [lz.lemmatize(w) for w in words]

        # Words that consisted only of punctuation are now empty strings, so
        # collapse the resulting runs of spaces and trim the ends.
        sentence = ' '.join(words)
        return re.sub(r"\s+", " ", sentence).strip()

    def load(self, file):
        """Read one split, tokenise its sentences and save it as CSV.

        Each sentence is replaced by the list obtained from splitting it on
        single spaces, and the frame is written to
        ``<data_dir>/processed_<dataset>/<file>.csv``.

        NOTE(review): this method does not call ``preprocess_sentence``, so
        the cleaning flags have no effect on the written files. Confirm
        whether that is intended before changing it.

        Args:
            file: Split name without extension (for example ``"train"``).

        Returns:
            The DataFrame that was written.
        """
        df = self.read(file)
        path = os.path.join(self.data_dir, "processed_" + self.dataset)
        df["Sentences"] = [s.split(' ') for s in df["Sentences"]]
        df.reset_index(drop=True, inplace=True)
        df.to_csv(os.path.join(path, file + ".csv"), index=False)

        return df

    def createFiles(self):
        """Process the dev, train and test splits.

        Returns:
            Tuple ``(dev, train, test)`` of the DataFrames returned by
            ``load``.
        """
        print("preprocessing files")
        dev = self.load("dev")
        train = self.load("train")
        test = self.load("test")
        print("finished preprocessing")

        return dev, train, test

    @staticmethod
    def add_preprocessor_args(parent_parser: ArgumentParser):
        """Register the preprocessing options on ``parent_parser``.

        Args:
            parent_parser: Parser that receives a "Data preprocessing"
                argument group.

        Returns:
            The same ``parent_parser`` instance.
        """
        parser = parent_parser.add_argument_group("Data preprocessing")
        parser.add_argument("--lower", action="store_true",
                            help="Transform sentences to lowercase")
        parser.add_argument("--rem_stop_words",
                            action="store_true", help="Remove stopwords")
        parser.add_argument(
            "--stemming", action="store_true", help="Use stemmer")
        parser.add_argument("--lemmatisation",
                            action="store_true", help="Use lemmatisation")
        parser.add_argument(
            "--data_dir", help="Path to the data directory", default="data")
        parser.add_argument(
            "--dataset", help="The dataset (i.e. PubMed_20k_RCT)", default="PubMed_20k_RCT")
        return parent_parser
# Step back from the 'notebooks' folder to the project root, which holds the
# 'data' directory, without hard-coding Colab's '/content' prefix. The check
# makes running this cell again a no-op.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

In [None]:
# Location and name of the raw dataset, relative to the working directory.
PATH_TO_DATA = 'data'
DATASET_NAME = 'PubMed_20k_RCT'

# Build the preprocessor with every cleaning option switched on, then write
# the CSV files for the dev, train and test splits.
preprocessor = PreprocessData(
    data_dir=PATH_TO_DATA,
    dataset=DATASET_NAME,
    lower=True,
    rem_stop_words=True,
    stemming=True,
    lemmatisation=True,
)
dev, train, test = preprocessor.createFiles()

preprocessing files
finished preprocessing


# Creating the Model Instance
In this section of the notebook, we initialise the model by downloading it from Hugging Face, and we create the data module from the correctly formatted CSV files.

In [None]:
import flash
import torch
from flash.text import TextClassificationData, TextClassifier
import os

In [None]:
# CSV files for the three splits, relative to the working directory.
train_file = "data/processed_PubMed_20k_RCT/train.csv"
val_file = "data/processed_PubMed_20k_RCT/dev.csv"
test_file = "data/processed_PubMed_20k_RCT/test.csv"

# "Sentences" and "Labels" are the column names used in those CSV files.
datamodule = TextClassificationData.from_csv(
    "Sentences",
    "Labels",
    train_file=train_file,
    val_file=val_file,
    test_file=test_file,
    batch_size=64,
)

# The classifier takes its label set from the data module.
model = TextClassifier(
    backbone="emilyalsentzer/Bio_ClinicalBERT",
    labels=datamodule.labels,
)

Using custom data configuration default-ce422592412942ca


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ce422592412942ca/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ce422592412942ca/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/180040 [00:00<?, ?ex/s]

Using custom data configuration default-6ee0f5c284e34d29


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6ee0f5c284e34d29/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6ee0f5c284e34d29/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/30212 [00:00<?, ?ex/s]

Using custom data configuration default-8ba5ebbaa2493f4e


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-8ba5ebbaa2493f4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-8ba5ebbaa2493f4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/30135 [00:00<?, ?ex/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Using 'emilyalsentzer/Bio_ClinicalBERT' provided by Hugging Face/transformers (https://github.com/huggingface/transformers).


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

# Training the model
In this stage we fine-tune the model. The `strategy` argument selects what is trained: 'no_freeze' trains all 108M parameters, whereas 'freeze' trains *just* the roughly 3.8k parameters of the final layers.

In [None]:
# Hand the trainer the number of CUDA devices that torch reports.
n_gpus = torch.cuda.device_count()
trainer = flash.Trainer(max_epochs=25, gpus=n_gpus)

# Fine-tune with the 'freeze' strategy, then evaluate on the test split.
# Both return values are printed.
finetune_result = trainer.finetune(model, datamodule=datamodule, strategy='freeze')
print(finetune_result)
test_result = trainer.test(model, datamodule=datamodule)
print(test_result)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                          | Params
----------------------------------------------------------------
0 | train_metrics | ModuleDict                    | 0     
1 | val_metrics   | ModuleDict                    | 0     
2 | test_metrics  | ModuleDict                    | 0     
3 | model         | BertForSequenceClassification | 108 M 
----------------------------------------------------------------
3.8 K     Trainable params
108 M     Non-trainable params
108 M     Total params
433.256   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]