In [1]:
!pip install simpletransformers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from typing import Dict, List, Any
import logging
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
import jsonlines
from simpletransformers.classification import ClassificationModel, ClassificationArgs

import random 
import numpy as np
import torch

def random_seed(seed_value): 
    np.random.seed(seed_value) 
    torch.manual_seed(seed_value)
    random.seed(seed_value) 


random_seed(42)

Collecting simpletransformers
  Downloading simpletransformers-0.63.7-py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.8/249.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit
  Downloading streamlit-1.10.0-py2.py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.1-py2.py3-none-any.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting validators
  Downloading validators-0.20.0.tar.gz (30 kB)
  Preparing me

In [2]:
def read_give_instances(path):
    
    texts = []
    labels = []
    with jsonlines.open(path) as f_in:
        for json_object in f_in:
            text=json_object.get('text')
            label=json_object.get('label')
            metadata=json_object.get('metadata')
            texts.append(text)
            labels.append(label)
            
    return texts, labels

In [3]:
train_texts, train_labels = read_give_instances("../input/scibert-dataset/text_classification/chemprot/train.txt")
dev_texts, dev_labels = read_give_instances("../input/scibert-dataset/text_classification/chemprot/dev.txt")
test_texts, test_labels = read_give_instances("../input/scibert-dataset/text_classification/chemprot/test.txt")

In [4]:
np.unique(train_labels)

array(['ACTIVATOR', 'AGONIST', 'AGONIST-ACTIVATOR', 'AGONIST-INHIBITOR',
       'ANTAGONIST', 'DOWNREGULATOR', 'INDIRECT-DOWNREGULATOR',
       'INDIRECT-UPREGULATOR', 'INHIBITOR', 'PRODUCT-OF', 'SUBSTRATE',
       'SUBSTRATE_PRODUCT-OF', 'UPREGULATOR'], dtype='<U22')

In [5]:
train_texts[4], train_labels[:5]

('<< Epidermal growth factor receptor >> inhibitors currently under investigation include the small molecules gefitinib (Iressa, ZD1839) and erlotinib ([[ Tarceva ]], OSI-774), as well as monoclonal antibodies such as cetuximab (IMC-225, Erbitux).',
 ['INHIBITOR', 'INHIBITOR', 'INHIBITOR', 'INHIBITOR', 'INHIBITOR'])

In [6]:
df_train = pd.DataFrame({"text":train_texts,"labels":train_labels})
df_val = pd.DataFrame({"text":dev_texts,"labels":dev_labels})
df_test = pd.DataFrame({"text":test_texts,"labels":test_labels})
df_train.head()

Unnamed: 0,text,labels
0,<< Epidermal growth factor receptor >> inhibit...,INHIBITOR
1,<< Epidermal growth factor receptor >> inhibit...,INHIBITOR
2,<< Epidermal growth factor receptor >> inhibit...,INHIBITOR
3,<< Epidermal growth factor receptor >> inhibit...,INHIBITOR
4,<< Epidermal growth factor receptor >> inhibit...,INHIBITOR


In [7]:
len(df_train['labels'].unique())

13

In [8]:
lb = LabelEncoder()

df_train['labels'] = lb.fit_transform(df_train['labels'])
df_val['labels'] = lb.transform(df_val['labels'])
df_test['labels'] = lb.transform(df_test['labels'])
df_train.head()

Unnamed: 0,text,labels
0,<< Epidermal growth factor receptor >> inhibit...,8
1,<< Epidermal growth factor receptor >> inhibit...,8
2,<< Epidermal growth factor receptor >> inhibit...,8
3,<< Epidermal growth factor receptor >> inhibit...,8
4,<< Epidermal growth factor receptor >> inhibit...,8


In [9]:
len(df_train['labels'].unique())

13

In [10]:
model_args = ClassificationArgs(eval_batch_size=64,
                               learning_rate = 2e-5,
                               max_seq_length = 256,
                               no_save = True,
                               num_train_epochs = 4,
                               train_batch_size = 32)

In [11]:
model = ClassificationModel("bert","AnonymousSub/fpdm_models_scibert_hybrid_epochs_4",args = model_args,num_labels=13)

Downloading:   0%|          | 0.00/706 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at AnonymousSub/fpdm_models_scibert_hybrid_epochs_4 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/427 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
model.train_model(df_train,eval_df = df_val)

  0%|          | 0/4169 [00:00<?, ?it/s]



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/131 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/131 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/131 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/131 [00:00<?, ?it/s]

(524, 0.7035870176405852)

In [13]:
predictions, raw_outputs = model.predict(df_test['text'].tolist())

  0%|          | 0/3469 [00:00<?, ?it/s]

  0%|          | 0/55 [00:00<?, ?it/s]

In [14]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

In [15]:
print(classification_report(df_test['labels'], predictions))

              precision    recall  f1-score   support

           0       0.74      0.86      0.80       292
           1       0.76      0.88      0.82       182
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00        12
           4       0.87      0.89      0.88       293
           5       0.11      0.03      0.04        72
           6       0.72      0.84      0.77       340
           7       0.82      0.73      0.78       334
           8       0.91      0.93      0.92      1255
           9       0.69      0.68      0.69       191
          10       0.85      0.82      0.83       453
          12       0.00      0.00      0.00        41

    accuracy                           0.83      3469
   macro avg       0.54      0.55      0.54      3469
weighted avg       0.81      0.83      0.81      3469



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(accuracy_score(df_test['labels'], predictions))
print(precision_score(df_test['labels'], predictions,average='micro'))
print(recall_score(df_test['labels'], predictions,average='micro'))
print(f1_score(df_test['labels'], predictions,average='micro'))

0.8273277601614298
0.8273277601614298
0.8273277601614298
0.8273277601614298
