## Imports

In [70]:
from model_fct import ProteinClassifier, ProteinDataModule, ProteinSequenceDataset
import os
import torch
from torch import nn
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader, RandomSampler, TensorDataset
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.accelerators import MPSAccelerator
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torchmetrics.classification import MulticlassAUROC, MulticlassAccuracy, MultilabelF1Score
from torchmetrics import Recall, Precision

from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning import Trainer, seed_everything
import datetime
from datetime import datetime
#from pytorch_lightning.metrics.sklearns import Accuracy

import torchvision

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
import platform
platform.processor()

'arm'

In [72]:
train_df = pd.read_pickle('train_df.pkl')
test_df = pd.read_pickle('test_df.pkl')
val_df = pd.read_pickle('val_df.pkl')
blind_df = pd.read_pickle('blind_df.pkl')

## Logger and checkpoint

In [73]:
def setup_testube_logger() -> CSVLogger:
    """ Function that sets the TestTubeLogger to be used. """
    now = datetime.now()
    dt_string = now.strftime("%d-%m-%Y--%H-%M-%S")

    return CSVLogger(
        save_dir="experiments/",
        version=dt_string,
        name="lightning_logs",
    )

logger = setup_testube_logger()

In [74]:
ckpt_path = os.path.join(
    logger.save_dir,
    logger.name,
    f"version_{logger.version}",
    "checkpoints",
)

c = ModelCheckpoint(
    dirpath=ckpt_path + "/" + "tanh_3epochs",
    verbose=True,
    monitor='val_acc',
    mode="max",
)

## Set up experiment

In [75]:
TARGETS = ['cyto', 'mito', 'nucleus','other', 'secreted']
PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd_localization'
#PRE_TRAINED_MODEL_NAME = 'Rostlab/prot_bert_bfd'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=False)

EPOCHS = 1
BATCH_SIZE = 1
MAX_LENGTH = 1500

In [76]:
dm = ProteinDataModule(
    train_df, 
    test_df,
    val_df,
    blind_df,
    tokenizer, 
    target_list=TARGETS,
    batch_size=BATCH_SIZE,
    max_len = MAX_LENGTH
)

model = ProteinClassifier(
    n_classes=5,
    target_list=TARGETS,
    steps_per_epoch=len(train_df)//BATCH_SIZE, 
    n_epochs=EPOCHS
)

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [77]:
trainer = pl.Trainer(max_epochs=EPOCHS,
                     logger=logger,
                     accelerator='mps',
                     #callbacks = checkpoint_callback
                     default_root_dir='experiments/lightning_logs'
                    )

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [78]:
trainer.fit(model, dm)


  | Name       | Type             | Params
------------------------------------------------
0 | bert       | BertModel        | 419 M 
1 | classifier | Sequential       | 5.1 K 
2 | criterion  | CrossEntropyLoss | 0     
------------------------------------------------
419 M     Trainable params
0         Non-trainable params
419 M     Total params
1,679.745 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  input = module(input)


the accuracy is 0.00
the precision is 0.00
the recall is 0.00
the f1 is 0.00
   precision  recall   f1  accuracy  num_samples
0        0.0     0.0  0.0       1.0            0
1        0.0     0.0  0.0       0.0            1
2        0.0     0.0  0.0       0.0            1
[[0 0 0]
 [1 0 0]
 [1 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  input = module(input)


Validation: 0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


the accuracy is 0.14
the precision is 0.03
the recall is 0.15
the f1 is 0.05
   precision  recall    f1  accuracy  num_samples
0       0.00    0.00  0.00      0.00            5
1       0.15    0.75  0.25      0.75            4
2       0.00    0.00  0.00      0.00            4
3       0.00    0.00  0.00      0.00            6
4       0.00    0.00  0.00      0.00            3
[[0 5 0 0 0]
 [1 3 0 0 0]
 [0 4 0 0 0]
 [1 5 0 0 0]
 [0 3 0 0 0]]


`Trainer.fit` stopped: `max_epochs=1` reached.


In [80]:
trainer.test(dataloaders=dm)

  rank_zero_warn(
Restoring states from the checkpoint path at experiments/lightning_logs/22-02-2023--01-15-11/checkpoints/epoch=0-step=20.ckpt
Loaded model weights from checkpoint at experiments/lightning_logs/22-02-2023--01-15-11/checkpoints/epoch=0-step=20.ckpt


Testing: 0it [00:00, ?it/s]

  input = module(input)


tensor([[0.2219, 0.2301, 0.1904, 0.1860, 0.1717]], device='mps:0')
tensor([[0.2220, 0.2290, 0.1904, 0.1864, 0.1722]], device='mps:0')
tensor([[0.2217, 0.2274, 0.1908, 0.1870, 0.1731]], device='mps:0')
tensor([[0.2218, 0.2301, 0.1901, 0.1861, 0.1719]], device='mps:0')
tensor([[0.2223, 0.2305, 0.1903, 0.1857, 0.1713]], device='mps:0')
tensor([[0.2223, 0.2302, 0.1904, 0.1857, 0.1714]], device='mps:0')
tensor([[0.2217, 0.2277, 0.1906, 0.1870, 0.1729]], device='mps:0')
tensor([[0.2221, 0.2297, 0.1906, 0.1858, 0.1718]], device='mps:0')
tensor([[0.2222, 0.2306, 0.1903, 0.1856, 0.1713]], device='mps:0')
tensor([[0.2224, 0.2308, 0.1902, 0.1856, 0.1711]], device='mps:0')
tensor([[0.2225, 0.2307, 0.1901, 0.1855, 0.1711]], device='mps:0')
tensor([[0.2216, 0.2266, 0.1911, 0.1872, 0.1736]], device='mps:0')
tensor([[0.2218, 0.2272, 0.1908, 0.1870, 0.1731]], device='mps:0')
tensor([[0.2221, 0.2302, 0.1902, 0.1859, 0.1716]], device='mps:0')
tensor([[0.2218, 0.2284, 0.1907, 0.1864, 0.1727]], device='mps

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[{}]

In [58]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support


# example target and output lists
targets = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
outputs = [0, 1, 1, 3, 4, 0, 2, 2, 3, 4]

def classification_metrics(targets, outputs):
    # compute confusion matrix
    cm = confusion_matrix(targets, outputs)
    
    # compute total number of samples for each class
    total_per_class = np.sum(cm, axis=1)
    
    # compute number of correctly classified samples for each class
    correct_per_class = np.diagonal(cm)
    
    # compute precision, recall, and f1 score for each class
    p, r, f1, _ = precision_recall_fscore_support(targets, outputs, average=None)
    
    # compute accuracy for each class
    accuracy_per_class = np.divide(correct_per_class, total_per_class, where=total_per_class!=0)
    
    # create a dataframe to hold the results
    df = pd.DataFrame({
        'precision': p,
        'recall': r,
        'f1': f1,
        'accuracy': accuracy_per_class,
        'num_samples': total_per_class
    })
    
    print(df)
    print(cm)
    #return df

In [59]:
classification_metrics(targets, outputs)

   precision  recall   f1  accuracy  num_samples
0        1.0     1.0  1.0       1.0            2
1        0.5     0.5  0.5       0.5            2
2        0.5     0.5  0.5       0.5            2
3        1.0     1.0  1.0       1.0            2
4        1.0     1.0  1.0       1.0            2
[[2 0 0 0 0]
 [0 1 1 0 0]
 [0 1 1 0 0]
 [0 0 0 2 0]
 [0 0 0 0 2]]


In [26]:
from sklearn.metrics import confusion_matrix, classification_report

# Define your target and output lists
targets = [0, 1, 2, 3, 4]
outputs = [1, 1, 2, 3, 4]

# Create a confusion matrix
cm = confusion_matrix(targets, outputs)

# Print the confusion matrix
print("Confusion Matrix:\n", cm)

# Calculate classification report
report = classification_report(targets, outputs)

# Print classification report
print("Classification Report:\n", report)

Confusion Matrix:
 [[0 1 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      1.00      0.67         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1

    accuracy                           0.80         5
   macro avg       0.70      0.80      0.73         5
weighted avg       0.70      0.80      0.73         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
df.groupby('target')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x104ac80a0>

## Testing and predicting

In [15]:
#change for best one - manually check which one is the best
best_checkpoint_path = '/Users/pierredemetz/UCL_work/COMP0082-CW/code/experiments/lightning_logs/20-02-2023--21-58-19/checkpoints/epoch=1-step=14366.ckpt'




In [17]:
trainer = Trainer(resume_from_checkpoint=best_checkpoint_path)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [53]:
outputs = trainer.predict(model, dm)
results = []
for item in outputs:
    tensor = item[1]
    max_prob, max_target_idx = torch.max(tensor, dim=1)
    max_target = TARGETS[max_target_idx]
    results.append((max_prob.item(), max_target))

print(results)

Predicting: 20it [00:00, ?it/s]

  input = module(input)


[(0.23777472972869873, 'secreted'), (0.2390023022890091, 'secreted'), (0.23801231384277344, 'secreted'), (0.23591041564941406, 'secreted'), (0.23733216524124146, 'secreted'), (0.23813442885875702, 'secreted'), (0.23768913745880127, 'secreted'), (0.23749719560146332, 'secreted'), (0.23838873207569122, 'secreted'), (0.23754209280014038, 'secreted'), (0.2393353134393692, 'secreted'), (0.23688319325447083, 'secreted'), (0.23795557022094727, 'secreted'), (0.23580197989940643, 'secreted'), (0.23750881850719452, 'secreted'), (0.2364635318517685, 'secreted'), (0.2366548478603363, 'secreted'), (0.2381429672241211, 'secreted'), (0.23695126175880432, 'secreted'), (0.2360747903585434, 'secreted')]


In [54]:
outputs

[(0, tensor([[0.2052, 0.1628, 0.1809, 0.2133, 0.2378]])),
 (0, tensor([[0.2051, 0.1622, 0.1803, 0.2135, 0.2390]])),
 (0, tensor([[0.2053, 0.1629, 0.1811, 0.2128, 0.2380]])),
 (0, tensor([[0.2055, 0.1642, 0.1823, 0.2121, 0.2359]])),
 (0, tensor([[0.2056, 0.1630, 0.1808, 0.2132, 0.2373]])),
 (0, tensor([[0.2053, 0.1627, 0.1808, 0.2131, 0.2381]])),
 (0, tensor([[0.2051, 0.1631, 0.1810, 0.2131, 0.2377]])),
 (0, tensor([[0.2053, 0.1632, 0.1812, 0.2128, 0.2375]])),
 (0, tensor([[0.2051, 0.1627, 0.1810, 0.2128, 0.2384]])),
 (0, tensor([[0.2053, 0.1631, 0.1812, 0.2130, 0.2375]])),
 (0, tensor([[0.2054, 0.1619, 0.1801, 0.2134, 0.2393]])),
 (0, tensor([[0.2050, 0.1639, 0.1820, 0.2122, 0.2369]])),
 (0, tensor([[0.2054, 0.1628, 0.1807, 0.2132, 0.2380]])),
 (0, tensor([[0.2056, 0.1643, 0.1821, 0.2122, 0.2358]])),
 (0, tensor([[0.2053, 0.1630, 0.1810, 0.2132, 0.2375]])),
 (0, tensor([[0.2055, 0.1638, 0.1819, 0.2123, 0.2365]])),
 (0, tensor([[0.2054, 0.1635, 0.1813, 0.2132, 0.2367]])),
 (0, tensor([[

## LEGACY

In [17]:
target_list = ['cyto', 'mito', 'nucleus','other', 'secreted']
n_classes = 5

protein_classifier = ProteinClassifier(n_classes, target_list)
protein_classifier = protein_classifier.load_from_checkpoint(
    checkpoint_path=best_checkpoint_path,
    n_classes=n_classes,
    target_list=target_list
)

protein_classifier.eval()
protein_classifier.freeze()

Some weights of the model checkpoint at Rostlab/prot_bert_bfd were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


FileNotFoundError: [Errno 2] No such file or directory: '/Users/pierredemetz/UCL_work/COMP0082-CW/code/experiments/lightning_logs/18-02-2023--00-13-33/checkpoints/epoch=1-step=4.ckpt'

In [111]:
sample = {
  "seq": "M S T D T G V S L P S Y E E D Q G S K L I R K A K E A P F V P V G I A G F A A I V A Y G L Y K L K S R G N T K M S I H L I H M R V A A Q G F V V G A M T V G M G Y S M Y R E F W A K P K P",
}

predictions = protein_classifier.predict_step(sample, batch_idx=0)

print("Sequence Localization Ground Truth is: {} - prediction is: {}".format('Mitochondrion',predictions['predicted_label']))



TypeError: forward() missing 1 required positional argument: 'attention_mask'

## MISC

In [59]:
import re
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model = BertModel.from_pretrained("Rostlab/prot_bert")
sequence_Example = "A E T C Z A O"
sequence_Example = re.sub(r"[UZOB]", "X", sequence_Example)
encoded_input = tokenizer(sequence_Example, return_tensors='pt')
output = model(**encoded_input)

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


  nonzero_finite_vals = torch.masked_select(


In [60]:
accelerator_registry=torch.device("mps")

In [61]:
accelertorch.backends.mps

NameError: name 'accelertorch' is not defined

In [62]:
MPSAccelerator.register_accelerators(device='mps')

TypeError: register_accelerators() got an unexpected keyword argument 'device'

In [59]:
!pip install tensorflow-metal

Collecting tensorflow-metal
  Downloading tensorflow_metal-0.7.1-cp38-cp38-macosx_12_0_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: tensorflow-metal
Successfully installed tensorflow-metal-0.7.1


In [63]:
import transformers

In [66]:
!exit

In [67]:
!arch

i386
