In [1]:
import multiprocessing as mp
import pandas as pd
from tifffile import imread,imwrite
import torch
from hydra import initialize, compose
from hydra.utils import instantiate
from torch.utils.data import Dataset
from src.data import DataHandler
# from transformers import Trainer, TrainingArguments
import src.utils as utils
from src.model import ResNet, ConvNext, Dummy, ModelTrainer, ViT, DeiT
from src.data.dataset import CellPaintingDatasetCached

In [2]:
import sys

In [3]:
# !{sys.executable} -m pip install transformers datasets tifffile hydra-core wandb torch torchvision jupyter tqdm --force-reinstall

In [4]:
import os
os.chdir('/Users/maciej.filanowicz/CellPainting/src')
initialize(version_base="1.2", config_path="src/conf", job_name="test_app")
cfg = compose(config_name="config", return_hydra_config=True)
dataset_config = instantiate(cfg.dataset)
train_config = instantiate(cfg.train)

In [5]:
data_handler = DataHandler(dataset_config)
# data_loaders = data_handler.get_data_loaders()

In [6]:
from transformers import DeiTForImageClassification

In [7]:
model = DeiT()

Some weights of the model checkpoint at facebook/deit-base-distilled-patch16-224 were not used when initializing DeiTForImageClassification: ['cls_classifier.bias', 'cls_classifier.weight', 'distillation_classifier.weight', 'distillation_classifier.bias']
- This IS expected if you are initializing DeiTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DeiTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DeiTForImageClassification were not initialized from the model checkpoint at facebook/deit-base-distilled-patch16-224 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream t

In [8]:
data_handler.cache_dataset()

                                    folder_name  \
0    HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
1    HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
2    HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
3    HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
4    HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
..                                          ...   
859  HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
860  HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
861  HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
862  HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   
863  HepG2_Exp3_Plate1_FX9__2021-04-08T16_16_48   

                         file_name1                      file_name2  \
0    r01c01f01p01-ch1sk1fk1fl1.tiff  r01c01f01p01-ch2sk1fk1fl1.tiff   
1    r01c01f02p01-ch1sk1fk1fl1.tiff  r01c01f02p01-ch2sk1fk1fl1.tiff   
2    r01c01f03p01-ch1sk1fk1fl1.tiff  r01c01f03p01-ch2sk1fk1fl1.tiff   
3    r01c01f04p01-ch1sk1fk1fl1.tiff  r01c01f04p01-ch2sk1fk1fl1.tiff   
4    r01c01f05p01-ch1sk1fk1fl1.t

In [9]:
import pandas as pd
metadata = pd.read_csv("../data/processed/meta_data.csv")
train_dataset = CellPaintingDatasetCached(
    metadata[metadata.folder_name == "train"], dataset_config, dataset_config.train_transforms
)
test_dataset = CellPaintingDatasetCached(
    metadata[metadata.folder_name == "test"], dataset_config, dataset_config.test_transforms
)
val_dataset = CellPaintingDatasetCached(
    metadata[metadata.folder_name == "val"], dataset_config, dataset_config.test_transforms
)

In [10]:
metadata[metadata.folder_name=='train']['compound_name'].value_counts()

DFSO                  279
Berberine Chloride     63
Fluphenazine           63
Latrunculin B          63
Rotenone               63
Tetrandrine            63
Brefeldin A            54
Nocodazole             54
Rapamycin              54
Name: compound_name, dtype: int64

In [12]:
metadata[metadata.folder_name=='test']['compound_name'].value_counts()

Berberine Chloride    9
Latrunculin B         9
Name: compound_name, dtype: int64

In [17]:
from transformers import default_data_collator
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")
data_collator = default_data_collator
def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

ImportError: dlopen(/Users/maciej.filanowicz/opt/anaconda3/envs/cellpaint/lib/python3.9/site-packages/pyarrow/lib.cpython-39-darwin.so, 2): Library not loaded: @rpath/libre2.0.dylib
  Referenced from: /Users/maciej.filanowicz/opt/anaconda3/envs/cellpaint/lib/libarrow.400.1.0.dylib
  Reason: image not found

In [None]:
args = TrainingArguments(
    "test",
    do_train=True,
    do_eval=True,
    logging_steps=20,
    save_steps=20,
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    dataloader_num_workers=24,
    weight_decay=0.01,
    load_best_model_at_end=True,
    prediction_loss_only=True,
    report_to='wandb',
    logging_dir='logs',
)

In [None]:
model = ViT().to('cuda')

trainer = Trainer(
    model,
    args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)