In [None]:
# Use the %pip magic so the packages are installed into the running kernel's environment.
# TODO: pin versions (pkg==x.y.z) once a known-good combination is recorded.
%pip install datasets transformers evaluate

In [None]:
# Pin the project package to the commit that the recorded run resolved to, so a
# re-run does not silently pick up a newer revision of the default branch.
%pip install git+https://github.com/nina-adhikari/disease_prediction@8022b6bdd769bfd32d673a71ca0ed5f8287db9fe

Collecting git+https://github.com/nina-adhikari/disease_prediction
  Cloning https://github.com/nina-adhikari/disease_prediction to /tmp/pip-req-build-cp7dcobb
  Running command git clone --filter=blob:none --quiet https://github.com/nina-adhikari/disease_prediction /tmp/pip-req-build-cp7dcobb
  Resolved https://github.com/nina-adhikari/disease_prediction to commit 8022b6bdd769bfd32d673a71ca0ed5f8287db9fe
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive (Colab only); the DRIVE path used below
# points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from disease_prediction.data import datasets as ds
from disease_prediction.models import text_classification as tc
from disease_prediction.models import classification_helper as ch

In [None]:
# Dataset folder inside the mounted Drive. The path is relative, so it resolves
# against the current working directory — presumably /content on Colab; verify
# if the working directory is ever changed.
DRIVE = 'drive/MyDrive/Disease-Prediction/ddx-dataset/'

In [None]:
ch.DIRECTORY = DRIVE

In [None]:
ch.DATA_ARGS

DataTrainingArguments(train_file='text-train.json', validation_file='text-validate.json', test_file='text-test.json', max_seq_length=128, overwrite_cache=True, pad_to_max_length=False, max_train_samples=10000, max_val_samples=1000, max_test_samples=1000)

In [None]:
# Run-mode flags: training and evaluation enabled, prediction disabled.
ch.TRAINING_ARGS.do_train = True
ch.TRAINING_ARGS.do_eval = True
ch.TRAINING_ARGS.do_predict = False

In [None]:
import pandas as pd

### Auxiliary: reformatting the validation split

In [None]:
val_data = pd.read_json(DRIVE + 'validate_symp.jsonl', lines=True)

In [None]:
# Remove the leftover 'index' column. Reassigning (rather than inplace=True) with
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
val_data = val_data.drop(columns=['index'], errors='ignore')

AttributeError: 'NoneType' object has no attribute 'rename'

In [None]:
val_data.rename(columns={'Description': 'sentence1', 'PATHOLOGY': 'label'}, inplace=True)

In [None]:
val_data

Unnamed: 0,sentence1,label
0,"As a 68-year-old female, I am experiencing sha...",Influenza
1,"As a 10-year-old male, I have a runny nose, sh...",Allergic sinusitis
2,"At 56 years old, I am dealing with exhausting ...",SLE
3,"Being a 14-year-old female, I am suffering fro...",Influenza
4,"At infancy, I have burning pain in my left mid...",Influenza
...,...,...
4042,"My main symptom is pain, accompanied by extrem...",HIV (initial infection)
4043,"I am experiencing pain, cough, and shortness o...",SLE
4044,"I have chills along with pain, skin lesions or...",Influenza
4045,"My main symptom is cough with fever, shortness...",Allergic sinusitis


In [None]:
val_data.to_json(DRIVE + 'validate_try_1.json', orient='records')

### Combining train and validate

In [None]:
# Load the three pre-split '*_try_1.json' files into a dict of DataFrames keyed by split.
SUBSETS = ['train', 'validate', 'test']

df = {subset: pd.read_json(DRIVE + subset + '_try_1.json') for subset in SUBSETS}

# Later cells index the validation split as df['validation'], so re-key it here.
# Doing the pop and the assignment in one statement also avoids ending the cell on
# df.pop(...), which would render the entire popped DataFrame as cell output.
df['validation'] = df.pop('validate')

In [None]:
# The previous cell re-keys the validation split from 'validate' to 'validation',
# so df['validate'] no longer exists at this point and would raise KeyError.
# ignore_index=True gives the combined frame unique row labels.
df_new = pd.concat([df['train'], df['validation']], ignore_index=True)

In [None]:
df_new

Unnamed: 0,sentence1,label
0,"As a 21-year-old male, I experience excessive ...",HIV (initial infection)
1,"An 8-year-old male, I have an itchy nose along...",Allergic sinusitis
2,"As a 49-year-old female, I have episodes of lo...",Anaphylaxis
3,"At 69, I suffer from a persistent cough, pain ...",Tuberculosis
4,"Being a 30-year-old female, I have a concernin...",Tuberculosis
...,...,...
4042,"My main symptom is pain, accompanied by extrem...",HIV (initial infection)
4043,"I am experiencing pain, cough, and shortness o...",SLE
4044,"I have chills along with pain, skin lesions or...",Influenza
4045,"My main symptom is cough with fever, shortness...",Allergic sinusitis




In [None]:
df_new.to_json(DRIVE + 'train_and_validate.json', orient='records')

## Figuring out a different way to load the data

In [None]:
from datasets import load_dataset
from datasets import Dataset

def load_datasets(dataframe=None):
    """Build the dataset splits either from DataFrames or from local files.

    Args:
        dataframe: Optional mapping of split name -> pandas DataFrame. When given,
            every frame is converted with ``Dataset.from_pandas`` and the file
            settings on ``ch.DATA_ARGS`` are not consulted.

    Returns:
        A plain ``dict`` of split name -> ``Dataset`` when ``dataframe`` is given;
        otherwise the result of ``load_dataset`` for the train/validation/test
        files configured on ``ch.DATA_ARGS`` (splits whose file is ``None`` are
        skipped).

    Raises:
        ValueError: If ``dataframe`` is ``None`` and no data file is configured.
    """
    if dataframe is not None:
        # preserve_index=False: a non-default pandas index (e.g. after a shuffle or
        # split) would otherwise be stored as an extra '__index_level_0__' column.
        return {
            split: Dataset.from_pandas(dataframe[split], preserve_index=False)
            for split in dataframe.keys()
        }

    configured_files = {
        "train": ch.DATA_ARGS.train_file,
        "validation": ch.DATA_ARGS.validation_file,
        "test": ch.DATA_ARGS.test_file,
    }
    data_files = {split: path for split, path in configured_files.items() if path is not None}
    if not data_files:
        raise ValueError("No DataFrames were passed and no data files are set on ch.DATA_ARGS.")

    for split, path in data_files.items():
        tc.LOGGER.info(f"Loading a local file for {split}: {path}")

    # The DATA_ARGS repr shown earlier in this notebook lists no
    # `input_file_extension` field, so read it defensively and otherwise infer the
    # format from the suffix of the first configured file.
    extension = getattr(ch.DATA_ARGS, "input_file_extension", None)
    if extension is None:
        extension = str(next(iter(data_files.values()))).rsplit(".", 1)[-1].lower()

    if extension == "csv":
        # Loading a dataset from local csv files
        return load_dataset(
            "csv",
            data_files=data_files,
            cache_dir=ch.MODEL_ARGS.cache_dir,
            token=ch.MODEL_ARGS.token,
        )

    # Loading a dataset from local json files
    return load_dataset("json", data_files=data_files, cache_dir=ch.MODEL_ARGS.cache_dir)

In [None]:
# NOTE(review): this rebinds `ds`, which until now was the alias for
# disease_prediction.data.datasets imported near the top of the notebook.
ds = load_datasets(df)

In [None]:
ds

{'train': Dataset({
     features: ['sentence1', 'label'],
     num_rows: 34521
 }),
 'test': Dataset({
     features: ['sentence1', 'label'],
     num_rows: 4049
 }),
 'validation': Dataset({
     features: ['sentence1', 'label'],
     num_rows: 4047
 })}

## Continue: configure data files and sample limits

In [None]:
# Train on the combined train+validate file written above.
# NOTE(review): validation_file and test_file point at the same file, so the
# 'validation' and 'test' splits loaded from these settings are identical data.
ch.DATA_ARGS.train_file = DRIVE + 'train_and_validate.json'
ch.DATA_ARGS.validation_file = DRIVE + 'test_try_1.json'
ch.DATA_ARGS.test_file = DRIVE + 'test_try_1.json'

In [None]:
ch.DATA_ARGS.max_train_samples = 34521
ch.DATA_ARGS.max_val_samples = 4049
ch.DATA_ARGS.max_test_samples = 4049

In [None]:
ch.TRAINING_ARGS

In [None]:
# Reload the three pre-split '*_try_1.json' files into a dict of DataFrames keyed by split.
SUBSETS = ['train', 'validate', 'test']

df = {subset: pd.read_json(DRIVE + subset + '_try_1.json') for subset in SUBSETS}

# Later cells index the validation split as df['validation'], so re-key it here.
# Doing the pop and the assignment in one statement also avoids ending the cell on
# df.pop(...), which would render the entire popped DataFrame as cell output.
df['validation'] = df.pop('validate')

Unnamed: 0,sentence1,label
0,"As a 68-year-old female, I am experiencing sha...",Influenza
1,"As a 10-year-old male, I have a runny nose, sh...",Allergic sinusitis
2,"At 56 years old, I am dealing with exhausting ...",SLE
3,"Being a 14-year-old female, I am suffering fro...",Influenza
4,"At infancy, I have burning pain in my left mid...",Influenza
...,...,...
4042,"My main symptom is pain, accompanied by extrem...",HIV (initial infection)
4043,"I am experiencing pain, cough, and shortness o...",SLE
4044,"I have chills along with pain, skin lesions or...",Influenza
4045,"My main symptom is cough with fever, shortness...",Allergic sinusitis


### First we evaluate without fine-tuning

In [None]:
tc.setup_from_scratch(df)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Map:   0%|          | 0/34521 [00:00<?, ? examples/s]

Map:   0%|          | 0/4049 [00:00<?, ? examples/s]

Map:   0%|          | 0/4047 [00:00<?, ? examples/s]

In [None]:
tc.evaluate()

INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...
Cause: for/else statement not yet supported


Cause: for/else statement not yet supported


INFO:disease_prediction.models.classification_helper:Eval loss: 2.32634, Eval accuracy: 10.1310%


### Fine-tuning

In [None]:
# Pool all three splits before re-splitting. Each source frame carries its own
# 0-based index, so ignore_index=True is needed to give the pooled rows unique labels.
# NOTE(review): pooling and re-splitting discards the original train/validation/test
# boundaries — confirm that is intended.
df_combined = pd.concat([df['train'], df['validation'], df['test']], ignore_index=True)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_combined['sentence1'], df_combined['label'], test_size=0.1, random_state=42)

In [None]:
# `imblearn` is the import name; the distribution is published as `imbalanced-learn`.
# %pip installs into the running kernel's environment.
%pip install imbalanced-learn

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# The sampler is given a 2-D feature array, so the single text column is reshaped
# to one column per row; the labels stay 1-D.
features_2d = X_train.to_numpy().reshape(-1, 1)
targets = y_train.to_numpy()

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(features_2d, targets)

In [None]:
len(X_resampled)

13310

In [None]:
len(X_test)

4262

In [None]:
df_resampled = pd.DataFrame({'sentence1': X_resampled.reshape(-1), 'label': y_resampled})

In [None]:
df_resampled

Unnamed: 0,sentence1,label
0,"I am a 12-year-old male with an itchy nose, sh...",Allergic sinusitis
1,"I am suffering from eye itching with pain, ski...",Allergic sinusitis
2,I am a 20-year-old female suffering from an it...,Allergic sinusitis
3,I am a 68-year-old male with eye itching as th...,Allergic sinusitis
4,I am a 24-year-old female with runny nose and ...,Allergic sinusitis
...,...,...
13305,"My main symptom is vomiting after coughing, al...",Whooping cough
13306,I am a 3-year-old male with wheezing on inhale...,Whooping cough
13307,"Having vomiting cough at 40 years old, with fe...",Whooping cough
13308,As a 50-year-old male experiencing wheezing on...,Whooping cough


In [None]:
# Derive the sample caps from the data rather than hard-coding the row counts, so
# they stay correct if the split ratio or the resampling changes. The validation
# and test caps both use the held-out split size.
ch.DATA_ARGS.max_train_samples = len(df_resampled)
ch.DATA_ARGS.max_val_samples = len(X_test)
ch.DATA_ARGS.max_test_samples = len(X_test)

In [None]:
# Build the held-out frame once and reset its index: X_test/y_test keep the shuffled
# row labels from the split, which would otherwise end up in the converted datasets
# as an extra '__index_level_0__' column.
holdout = pd.DataFrame({'sentence1': X_test, 'label': y_test}).reset_index(drop=True)

df['train'] = df_resampled
# NOTE(review): validation and test are the same held-out rows (independent copies).
df['validation'] = holdout
df['test'] = holdout.copy()

In [None]:
df['test']

Unnamed: 0,sentence1,label
16432,"As a newborn female, I have lesions, swollen l...",SLE
19927,"I am a 56-year-old female with swelling, pain,...",Localized edema
3049,"At 42 years old, I have an itchy nose, forehea...",Allergic sinusitis
10695,I have muscle pain along with sharp pain in my...,Influenza
259,"At 77, I, a female, am experiencing swelling a...",Anaphylaxis
...,...,...
454,"I have vaginal discharge along with pain, skin...",Sarcoidosis
20734,I am an 81-year-old female who has gained weig...,Localized edema
3563,I am experiencing a runny nose as my main symp...,Influenza
29940,"I am a 32-year-old female with chills, experie...",Influenza


In [None]:
tc.setup_from_scratch(df)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Map:   0%|          | 0/13310 [00:00<?, ? examples/s]

Map:   0%|          | 0/4262 [00:00<?, ? examples/s]

Map:   0%|          | 0/4262 [00:00<?, ? examples/s]

In [None]:
#ch.TRAINING_ARGS.num_train_epochs = 1
#ch.TRAINING_ARGS.learning_rate = 2e-05
#ch.TRAINING_ARGS.warmup_steps=1

In [None]:
tc.train()



In [None]:
tc.evaluate()

INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...




INFO:disease_prediction.models.classification_helper:Eval loss: 0.96710, Eval accuracy: 58.6814%


In [None]:
tc.WRAPPER.save_pretrained(DRIVE + 'model')

In [None]:
tc.setup_from_finetuned(DRIVE + 'model')

Some layers from the model checkpoint at drive/MyDrive/Disease-Prediction/ddx-dataset/model were not used when initializing TFDistilBertForSequenceClassification: ['dropout_119']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at drive/MyDrive/Disease-Prediction/ddx-dataset/model and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tc.evaluate()

INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...




INFO:disease_prediction.models.classification_helper:Eval loss: 0.96710, Eval accuracy: 58.6814%


In [None]:
# Read the Hugging Face write token from Colab's secret store instead of hard-coding it.
from google.colab import userdata
# NOTE(review): 'HF_TOLEN_WRITE' looks like a typo for 'HF_TOKEN_WRITE', but the name
# has to match the secret as it is stored in Colab — confirm before renaming.
tok = userdata.get('HF_TOLEN_WRITE')

In [None]:
ch.MODEL_ARGS.token

In [None]:
tc.WRAPPER.model.push_to_hub('distilbert-finetuned-medical-diagnosis', token=tok)

tf_model.h5:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [None]:
tc.DATASETS

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'label'],
        num_rows: 13310
    })
    test: Dataset({
        features: ['sentence1', 'label', '__index_level_0__'],
        num_rows: 4262
    })
    validation: Dataset({
        features: ['sentence1', 'label', '__index_level_0__'],
        num_rows: 4262
    })
})

In [None]:
df['train']

Unnamed: 0,sentence1,label
0,"I am a 12-year-old male with an itchy nose, sh...",Allergic sinusitis
1,"I am suffering from eye itching with pain, ski...",Allergic sinusitis
2,I am a 20-year-old female suffering from an it...,Allergic sinusitis
3,I am a 68-year-old male with eye itching as th...,Allergic sinusitis
4,I am a 24-year-old female with runny nose and ...,Allergic sinusitis
...,...,...
13305,"My main symptom is vomiting after coughing, al...",Whooping cough
13306,I am a 3-year-old male with wheezing on inhale...,Whooping cough
13307,"Having vomiting cough at 40 years old, with fe...",Whooping cough
13308,As a 50-year-old male experiencing wheezing on...,Whooping cough


In [None]:
df['test'].reset_index(drop=True, inplace=True)

In [None]:
df['test']

Unnamed: 0,sentence1,label
0,"As a newborn female, I have lesions, swollen l...",SLE
1,"I am a 56-year-old female with swelling, pain,...",Localized edema
2,"At 42 years old, I have an itchy nose, forehea...",Allergic sinusitis
3,I have muscle pain along with sharp pain in my...,Influenza
4,"At 77, I, a female, am experiencing swelling a...",Anaphylaxis
...,...,...
4257,"I have vaginal discharge along with pain, skin...",Sarcoidosis
4258,I am an 81-year-old female who has gained weig...,Localized edema
4259,I am experiencing a runny nose as my main symp...,Influenza
4260,"I am a 32-year-old female with chills, experie...",Influenza


In [None]:
df['train'].to_json(DRIVE + 'final_train.json', orient='records')
df['test'].to_json(DRIVE + 'final_test.json', orient='records')

In [None]:
tc.WRAPPER.tokenizer.push_to_hub('distilbert-finetuned-medical-diagnosis', token=tok)

README.md:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ninaa510/distilbert-finetuned-medical-diagnosis/commit/b9df3b68aedccd68c21b97997a7fa8f86564363e', commit_message='Upload tokenizer', commit_description='', oid='b9df3b68aedccd68c21b97997a7fa8f86564363e', pr_url=None, pr_revision=None, pr_num=None)

## A different model?

In [None]:
ch.MODEL_ARGS.model_name_or_path = "FacebookAI/roberta-base"
ch.MODEL_ARGS.tokenizer_name_or_path = "FacebookAI/roberta-base"

In [None]:
tc.setup_from_scratch()

INFO:disease_prediction.models.classification_helper:Loading a local file for train: drive/MyDrive/Disease-Prediction/ddx-dataset/train_and_validate.json
INFO:disease_prediction.models.classification_helper:Loading a local file for validation: drive/MyDrive/Disease-Prediction/ddx-dataset/test_try_1.json
INFO:disease_prediction.models.classification_helper:Loading a local file for test: drive/MyDrive/Disease-Prediction/ddx-dataset/test_try_1.json


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

Map:   0%|          | 0/38568 [00:00<?, ? examples/s]

Map:   0%|          | 0/4049 [00:00<?, ? examples/s]

Map:   0%|          | 0/4049 [00:00<?, ? examples/s]

In [None]:
tc.train()



## Different dataset: Symptom2Disease

In [None]:
!mv 'archive (3).zip' 'archive.zip'

In [None]:
!unzip archive.zip

Archive:  archive.zip
  inflating: Symptom2Disease.csv     


In [None]:
newset = pd.read_csv('Symptom2Disease.csv')

In [None]:
# Remove the 'Unnamed: 0' column read from the CSV. Reassigning (rather than
# inplace=True) with errors='ignore' keeps this cell safe to re-run once the
# column is already gone.
newset = newset.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
newset.rename(columns={'text': 'sentence1'}, inplace=True)

In [None]:
X = newset['sentence1'].copy()
y = newset['label'].copy()

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
newset_train = pd.DataFrame({'sentence1': X_train, 'label': y_train})
newset_test = pd.DataFrame({'sentence1': X_test, 'label': y_test})

In [None]:
newset_train.to_json('train_symp.json', orient='records')
newset_test.to_json('test_symp.json', orient='records')

In [None]:
ch.DATA_ARGS.train_file = 'train_symp.json'
ch.DATA_ARGS.validation_file = 'test_symp.json'
ch.DATA_ARGS.test_file = 'test_symp.json'

In [None]:
ch.TRAINING_ARGS.do_eval = True
ch.TRAINING_ARGS.do_train = True
ch.TRAINING_ARGS.do_predict = False

In [None]:
# Derive the caps from the frames written to train_symp.json / test_symp.json
# instead of hard-coding the row counts; the same test frame backs validation and test.
n_train_rows = len(newset_train)
n_heldout_rows = len(newset_test)

ch.DATA_ARGS.max_train_samples = n_train_rows
ch.DATA_ARGS.max_val_samples = n_heldout_rows
ch.DATA_ARGS.max_test_samples = n_heldout_rows
# NOTE(review): the DATA_ARGS repr shown earlier lists only max_train/val/test_samples;
# the two attributes below may not be read by anything — confirm before relying on them.
ch.DATA_ARGS.max_eval_samples = n_heldout_rows
ch.DATA_ARGS.max_predict_samples = n_heldout_rows

In [None]:
tc.LOGGER = ch.setup_logging()

In [None]:
tc.setup_from_scratch()

INFO:disease_prediction.models.classification_helper:Loading a local file for train: train_symp.json
INFO:disease_prediction.models.classification_helper:Loading a local file for validation: test_symp.json
INFO:disease_prediction.models.classification_helper:Loading a local file for test: test_symp.json
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification

Map:   0%|          | 0/960 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [None]:
tc.evaluate()

INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...




INFO:disease_prediction.models.classification_helper:Eval loss: 3.18293, Eval accuracy: 4.1667%


In [None]:
train_pd = pd.DataFrame(tc.TF_DATA['train'])

In [None]:
tc.WRAPPER.model.compile(optimizer=tc.WRAPPER.model.optimizer, metrics=['accuracy'])

In [None]:
[d['input_ids'].shape for d in train_pd[0]]

[TensorShape([8, 73]),
 TensorShape([8, 56]),
 TensorShape([8, 68]),
 TensorShape([8, 47]),
 TensorShape([8, 57]),
 TensorShape([8, 60]),
 TensorShape([8, 61]),
 TensorShape([8, 58]),
 TensorShape([8, 55]),
 TensorShape([8, 60]),
 TensorShape([8, 67]),
 TensorShape([8, 56]),
 TensorShape([8, 65]),
 TensorShape([8, 51]),
 TensorShape([8, 56]),
 TensorShape([8, 50]),
 TensorShape([8, 52]),
 TensorShape([8, 66]),
 TensorShape([8, 54]),
 TensorShape([8, 64]),
 TensorShape([8, 53]),
 TensorShape([8, 60]),
 TensorShape([8, 63]),
 TensorShape([8, 59]),
 TensorShape([8, 63]),
 TensorShape([8, 55]),
 TensorShape([8, 55]),
 TensorShape([8, 50]),
 TensorShape([8, 54]),
 TensorShape([8, 57]),
 TensorShape([8, 63]),
 TensorShape([8, 72]),
 TensorShape([8, 63]),
 TensorShape([8, 63]),
 TensorShape([8, 55]),
 TensorShape([8, 61]),
 TensorShape([8, 50]),
 TensorShape([8, 71]),
 TensorShape([8, 68]),
 TensorShape([8, 48]),
 TensorShape([8, 64]),
 TensorShape([8, 66]),
 TensorShape([8, 61]),
 TensorShap

In [None]:
tc.WRAPPER.train_and_validate(tc.TF_DATA['train'], tc.TF_DATA['validation'])



INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...




INFO:disease_prediction.models.classification_helper:Eval loss: 2.14500, Eval accuracy: 70.0000%


In [None]:
ch.TRAINING_ARGS.num_train_epochs = 5

In [None]:
tc.WRAPPER.train_and_validate(tc.TF_DATA['train'], tc.TF_DATA['validation'])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


INFO:disease_prediction.models.classification_helper:Computing metrics on validation data...




INFO:disease_prediction.models.classification_helper:Eval loss: 2.14500, Eval accuracy: 70.0000%


## Appendix

In [None]:
pd.DataFrame(tc.DATASETS['train'])

Unnamed: 0,label,sentence1
0,HIV (initial infection),I am male and 21 years old. I have been experi...
1,Allergic sinusitis,I am male and 8 years old. I have been experie...
2,Anaphylaxis,I am female and 49 years old. I have been expe...
3,Tuberculosis,I am male and 69 years old. I have been experi...
4,Tuberculosis,I am female and 30 years old. I have been expe...
...,...,...
203003,HIV (initial infection),I am male and 30 years old. I have been experi...
203004,HIV (initial infection),I am female and 7 years old. I have been exper...
203005,HIV (initial infection),I am female and 66 years old. I have been expe...
203006,HIV (initial infection),I am male and 54 years old. I have been experi...


In [None]:
vapd = pd.DataFrame(tc.TF_DATA['validation'])

In [None]:
vapd.loc[0][0]

{'input_ids': <tf.Tensor: shape=(8, 44), dtype=int64, numpy=
 array([[  101,  1249,   170,  5599,   118,  1214,   118,  1385,  2130,
           117,   146,  1821, 13992,  4295,  2489,  1107,  1139,  1286,
          2342,  1105,  1138,  1894,  2241, 26052,  1113,  1139,  3678,
           119,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1249,   170,  1275,   118,  1214,   118,  1385,  2581,
           117,   146,  1138,   170,  1576,  3382,  3678,   117,  4295,
          2489,  1107,  1139, 11030,  1596,  1298,   117,  5325,   187,
         10733,  1113,  1139,  1268, 10845,   117,  1105, 20085,  1113,
          1139,  3678,   119,   102,     0,     0,     0,     0],
        [  101,  1335,  4376,  1201,  1385,   117,   146,  1821,  6705,
          1114, 16287,  1158,  2489,  1120,  1103,  1171,  1104,  1139,
          1246,   117,  5325,   187, 10733,  1113,  1103,  1286,  1334,
          1104,

In [None]:
tc.WRAPPER.config

DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Allergic sinusitis",
    "1": "Anaphylaxis",
    "2": "Chagas",
    "3": "Ebola",
    "4": "HIV (initial infection)",
    "5": "Influenza",
    "6": "Localized edema",
    "7": "SLE",
    "8": "Sarcoidosis",
    "9": "Tuberculosis",
    "10": "Whooping cough"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Allergic sinusitis": 0,
    "Anaphylaxis": 1,
    "Chagas": 2,
    "Ebola": 3,
    "HIV (initial infection)": 4,
    "Influenza": 5,
    "Localized edema": 6,
    "SLE": 7,
    "Sarcoidosis": 8,
    "Tuberculosis": 9,
    "Whooping cough": 10
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif