In [None]:
# Mount Google Drive under /content/gdrive so the dataset and model folders
# referenced below are reachable; force_remount=True requests a fresh mount.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Absolute locations inside the Drive mount point ('/content/gdrive').
# Absolute paths keep the notebook working even if the working directory is
# changed; the previous './gdrive/...' form only resolved from /content.
# Both values end with '/' because later cells append file names directly.
DATASET_PUBMED_RCT_DIR = '/content/gdrive/Shareddrives/DATASETS/PUBMED_RCT/'
OUTPUT_MODEL_DIR = '/content/gdrive/Shareddrives/MODELS/'

In [None]:
!pip install transformers -q
!pip install sentence_transformers -q
!pip install datasets -q

[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 50.8 MB/s 
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
[K     |████████████████████████████████| 596 kB 72.4 MB/s 
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 32.1 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 365 kB 5.2 MB/s 
[K     |████████████████████████████████| 115 kB 76.1 MB/s 
[K     |████████████████████████████████| 141 kB 74.3 MB/s 
[K     |████████████████████████████████| 212 kB 72.3 MB/s 
[K     |████████████████████████████████| 127 kB 72.2 MB/s 
[?25h

In [None]:
import pandas as pd

In [None]:
# Load both PubMed RCT splits from the shared dataset directory.
# DATASET_PUBMED_RCT_DIR already ends with a path separator.
df_train = pd.read_parquet(f'{DATASET_PUBMED_RCT_DIR}train.parquet')
df_test = pd.read_parquet(f'{DATASET_PUBMED_RCT_DIR}test.parquet')

In [None]:
# Inspect which columns the test split provides.
df_test.columns

Index(['pmid', 'label', 'sentence', 'label_id'], dtype='object')

In [None]:
# Remove ambiguous rows: sentences whose text appears in the test split with
# more than one distinct label_id. The row count is printed before and after
# so the size of the removal is visible.
print(f'{len(df_test)}')
labels_per_sentence = df_test.groupby('sentence')['label_id'].transform('nunique')
has_conflicting_labels = labels_per_sentence.gt(1)
df_test = df_test.loc[~has_conflicting_labels].copy()
print(f'{len(df_test)}')

30135
30122


In [None]:
from transformers import AutoTokenizer

# SciBERT (uncased) tokenizer with the maximum sequence length set to
# 512 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    model_max_length=512,
)

Downloading config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/223k [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

# Checkpoint directory of the sequence classifier under OUTPUT_MODEL_DIR; its
# config carries the id2label / label2id mappings used later in the notebook.
model_path = OUTPUT_MODEL_DIR + 'pubmed_rct_classification/model'
config = AutoConfig.from_pretrained(model_path)

# This rebinds `tokenizer`, replacing the instance created earlier in the
# notebook. Keep model_max_length=512 here as well so the limit configured
# there is not silently dropped.
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)

In [None]:
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Move the classifier onto the selected device. For a module, .to() moves the
# parameters in place and returns the same object, so no rebinding is needed.
model.to(device)
# Display where the model now lives as a sanity check.
model.device

device(type='cuda', index=0)

In [None]:
from transformers import TextClassificationPipeline

# The pipeline takes an integer device index: 0 is the first CUDA GPU and -1
# is the CPU. Derive it from the `device` chosen earlier instead of
# hard-coding 0, so the notebook also runs on a CPU-only runtime.
pipe_device = 0 if device.type == 'cuda' else -1
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=pipe_device)

In [None]:
# Classify every test sentence. truncation/max_length are forwarded to the
# tokenizer so that an over-long sentence is cut to 512 tokens (the limit used
# when the tokenizer is first created) instead of being fed to the model at
# full length and aborting the whole run.
predictions = pipe(list(df_test['sentence']), truncation=True, max_length=512)

In [None]:
# Translate each predicted label name back into its integer id using the
# label2id mapping from the model config.
y_pred = [config.label2id[prediction['label']] for prediction in predictions]

In [None]:
# Ground-truth label ids, taken from the same filtered test frame whose
# sentences were passed to the pipeline, so the two lists line up row by row.
y_test_true = list(df_test['label_id'])

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import f1_score

# Headline scores on the test split: macro F1 averages the per-class F1
# values, micro F1 is computed over all predictions pooled together.
output_dict = {
    'f1_macro': f1_score(y_test_true, y_pred, average='macro'),
    'f1_micro': f1_score(y_test_true, y_pred, average='micro'),
}
output_dict

{'f1_macro': 0.7605377088737277, 'f1_micro': 0.8211938118318837}

In [None]:
# Show the id -> label-name mapping stored in the model config.
config.id2label

{0: 'objective', 1: 'methods', 2: 'results', 3: 'conclusions', 4: 'background'}

In [None]:
from sklearn.metrics import classification_report

# Pass the label ids explicitly and build the display names from them, so
# every row of the report is tied to its own id. Relying on the iteration
# order of config.id2label alone would mislabel rows if that order differed
# from the sorted ids, and would fail if a class were missing from the data.
label_ids = sorted(config.id2label)
label_names = [config.id2label[label_id] for label_id in label_ids]
print(classification_report(y_test_true, y_pred, labels=label_ids, target_names=label_names))

              precision    recall  f1-score   support

   objective       0.71      0.55      0.62      2331
     methods       0.86      0.94      0.90      9892
     results       0.92      0.84      0.88      9710
 conclusions       0.71      0.83      0.76      4570
  background       0.68      0.61      0.64      3619

    accuracy                           0.82     30122
   macro avg       0.77      0.75      0.76     30122
weighted avg       0.82      0.82      0.82     30122

