## Imports

In [2]:
#from cw_pierre_fct import ProteinClassifier, ProteinDataModule, ProteinSequenceDataset
import os
import torch
from torch import nn
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.accelerators import MPSAccelerator
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torchmetrics.classification import MulticlassAUROC, MulticlassAccuracy

from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import Trainer, seed_everything
import datetime
from datetime import datetime
#from pytorch_lightning.metrics.sklearns import Accuracy

import torchvision

%load_ext autoreload
%autoreload 2

In [3]:
import platform
platform.processor()

'arm'

In [5]:
from model_fct import ProteinClassifier, ProteinDataModule, ProteinSequenceDataset

In [6]:
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
val_df = pd.read_pickle('val_df.pkl')
blind_df = pd.read_pickle('blind_df.pkl')

## Logger and checkpoint

In [6]:
def setup_testube_logger() -> CSVLogger:
    """ Function that sets the TestTubeLogger to be used. """
    now = datetime.now()
    dt_string = now.strftime("%d-%m-%Y--%H-%M-%S")

    return CSVLogger(
        save_dir="experiments/",
        version=dt_string,
        name="lightning_logs",
    )

logger = setup_testube_logger()

In [7]:
ckpt_path = os.path.join(
    logger.save_dir,
    logger.name,
    f"version_{logger.version}",
    "checkpoints",
)

c = ModelCheckpoint(
    dirpath=ckpt_path + "/" + "{epoch}-{val_loss:.2f}-{val_acc:.2f}",
    verbose=True,
    monitor='val_acc',
    mode="max",
)

## Set up experiment

In [11]:
TARGETS = ['cyto', 'mito', 'nucleus','other', 'secreted']
PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd_localization'
#PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=False)

EPOCHS = 2
BATCH_SIZE = 1
MAX_LENGTH = 1500

In [12]:
dm = ProteinDataModule(
    train_df, 
    test_df,
    val_df,
    blind_df,
    tokenizer, 
    target_list=TARGETS,
    batch_size=BATCH_SIZE,
    max_len = MAX_LENGTH
)

model = ProteinClassifier(
    n_classes=5,
    target_list=TARGETS,
    steps_per_epoch=len(train_df)//BATCH_SIZE, 
    n_epochs=EPOCHS
)

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
trainer = pl.Trainer(max_epochs=EPOCHS,
                     logger=logger,
                     accelerator='mps',
                     #callbacks = checkpoint_callback
                     default_root_dir='experiments/lightning_logs'
                    )

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [12]:
trainer.fit(model, dm)


  | Name       | Type               | Params
--------------------------------------------------
0 | bert       | BertModel          | 419 M 
1 | classifier | Sequential         | 5.1 K 
2 | criterion  | CrossEntropyLoss   | 0     
3 | metric_acc | MultilabelAccuracy | 0     
--------------------------------------------------
419 M     Trainable params
0         Non-trainable params
419 M     Total params
1,679.745 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  output, inverse_indices, counts = torch._unique2(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [13]:
trainer.test(dataloaders=dm)

  rank_zero_warn(
Restoring states from the checkpoint path at experiments/lightning_logs/18-02-2023--13-33-00/checkpoints/epoch=1-step=4.ckpt
Loaded model weights from checkpoint at experiments/lightning_logs/18-02-2023--13-33-00/checkpoints/epoch=1-step=4.ckpt


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           1.6335856914520264
         val_acc            0.20000000298023224
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 1.6335856914520264, 'val_acc': 0.20000000298023224}]

## Testing and predicting

In [25]:
#change for best one - manually check which one is the best
best_checkpoint_path = '/Users/pierredemetz/UCL_work/COMP0082-CW/code/experiments/lightning_logs/18-02-2023--00-13-33/checkpoints/epoch=1-step=4.ckpt'

In [14]:
trainer = Trainer(resume_from_checkpoint=best_checkpoint_path)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
trainer.test(model, dataloaders=dm)

Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           1.6638630628585815
         val_acc            0.20000000298023224
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 1.6638630628585815, 'val_acc': 0.20000000298023224}]

In [19]:
outputs = trainer.predict(model, dm)
results = []
for item in outputs:
    tensor = item[1]
    max_prob, max_target_idx = torch.max(tensor, dim=1)
    max_target = TARGETS[max_target_idx]
    results.append((max_prob.item(), max_target))

print(results)

Predicting: 0it [00:00, ?it/s]

[(0, tensor([[0.4520, 0.5345, 0.4608, 0.5594, 0.5244]])),
 (0, tensor([[0.4512, 0.5352, 0.4602, 0.5605, 0.5250]])),
 (0, tensor([[0.4516, 0.5348, 0.4605, 0.5599, 0.5247]])),
 (0, tensor([[0.4540, 0.5329, 0.4623, 0.5567, 0.5231]])),
 (0, tensor([[0.4521, 0.5344, 0.4609, 0.5592, 0.5243]])),
 (0, tensor([[0.4516, 0.5349, 0.4605, 0.5599, 0.5247]])),
 (0, tensor([[0.4524, 0.5342, 0.4611, 0.5589, 0.5242]])),
 (0, tensor([[0.4522, 0.5344, 0.4610, 0.5591, 0.5243]])),
 (0, tensor([[0.4519, 0.5346, 0.4607, 0.5596, 0.5245]])),
 (0, tensor([[0.4525, 0.5341, 0.4612, 0.5587, 0.5241]])),
 (0, tensor([[0.4505, 0.5358, 0.4597, 0.5614, 0.5255]])),
 (0, tensor([[0.4536, 0.5332, 0.4621, 0.5572, 0.5233]])),
 (0, tensor([[0.4517, 0.5348, 0.4606, 0.5598, 0.5247]])),
 (0, tensor([[0.4541, 0.5328, 0.4625, 0.5565, 0.5229]])),
 (0, tensor([[0.4521, 0.5345, 0.4609, 0.5592, 0.5244]])),
 (0, tensor([[0.4534, 0.5334, 0.4619, 0.5575, 0.5235]])),
 (0, tensor([[0.4531, 0.5337, 0.4617, 0.5579, 0.5237]])),
 (0, tensor([[

In [17]:
target_list = ['cyto', 'mito', 'nucleus','other', 'secreted']
n_classes = 5

protein_classifier = ProteinClassifier(n_classes, target_list)
protein_classifier = protein_classifier.load_from_checkpoint(
    checkpoint_path=best_checkpoint_path,
    n_classes=n_classes,
    target_list=target_list
)

protein_classifier.eval()
protein_classifier.freeze()

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FileNotFoundError: [Errno 2] No such file or directory: '/Users/pierredemetz/UCL_work/COMP0082-CW/code/experiments/lightning_logs/18-02-2023--00-13-33/checkpoints/epoch=1-step=4.ckpt'

In [111]:
sample = {
  "seq": "M S T D T G V S L P S Y E E D Q G S K L I R K A K E A P F V P V G I A G F A A I V A Y G L Y K L K S R G N T K M S I H L I H M R V A A Q G F V V G A M T V G M G Y S M Y R E F W A K P K P",
}

predictions = protein_classifier.predict_step(sample, batch_idx=0)

print("Sequence Localization Ground Truth is: {} - prediction is: {}".format('Mitochondrion',predictions['predicted_label']))



TypeError: forward() missing 1 required positional argument: 'attention_mask'

## MISC

In [59]:
import re
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")
sequence_Example = "A E T C Z A O"
sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
encoded_input = tokenizer(sequence_Example, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


  nonzero_finite_vals = torch.masked_select(


In [60]:
accelerator_registry=torch.device("mps")

In [61]:
accelertorch.backends.mps

NameError: name 'accelertorch' is not defined

In [62]:
MPSAccelerator.register_accelerators(device='mps')

TypeError: register_accelerators() got an unexpected keyword argument 'device'

In [59]:
!pip install tensorflow-metal

Collecting tensorflow-metal
  Downloading tensorflow_metal-0.7.1-cp38-cp38-macosx_12_0_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tensorflow-metal
Successfully installed tensorflow-metal-0.7.1


In [63]:
import transformers

In [66]:
!exit

In [67]:
!arch

i386
