In [1]:
import multiprocessing as mp
import pandas as pd
from tifffile import imread,imwrite
import torch
from hydra import initialize, compose
from hydra.utils import instantiate
from torch.utils.data import Dataset
from src.data import DataHandler
from transformers import Trainer, TrainingArguments
import src.utils as utils
from src.model import ResNet, ConvNext, Dummy, ModelTrainer, ViT
from src.data.dataset import CellPaintingDatasetCached

In [2]:
import os

from hydra.core.global_hydra import GlobalHydra

# Working directory for the rest of the notebook (later cells use paths
# relative to it). The historical location stays the default; set the
# CELLPAINTING_SRC_DIR environment variable to run from another checkout.
PROJECT_SRC_DIR = os.environ.get(
    'CELLPAINTING_SRC_DIR',
    '/Users/maciej.filanowicz/CellPainting/src',
)
os.chdir(PROJECT_SRC_DIR)

# Hydra keeps process-wide state: calling initialize() a second time in the
# same kernel raises "GlobalHydra is already initialized". Clearing it first
# makes this cell safe to re-run; on a fresh kernel the clear is a no-op.
GlobalHydra.instance().clear()
initialize(version_base="1.2", config_path="src/conf", job_name="test_app")

# Compose the root config and instantiate the `dataset` and `train` nodes.
cfg = compose(config_name="config", return_hydra_config=True)
dataset_config = instantiate(cfg.dataset)
train_config = instantiate(cfg.train)

In [3]:
# Build the project's DataHandler from the instantiated dataset config.
data_handler = DataHandler(dataset_config)
# NOTE(review): disabled call kept from an earlier iteration; nothing in the
# visible cells uses `data_loaders` — confirm whether it can be deleted.
# data_loaders = data_handler.get_data_loaders()

In [4]:
# Run the handler's caching step.
# NOTE(review): presumably this materialises the data that
# CellPaintingDatasetCached reads later — confirm against src.data.
data_handler.cache_dataset()

In [None]:
import pandas as pd

# Metadata table; the path is resolved against the current working directory.
metadata = pd.read_csv('../data/processed/meta_data.csv')


def _dataset_for_split(split_label):
    """Return a cached dataset over the metadata rows whose `subset` equals `split_label`."""
    split_rows = metadata[metadata.subset == split_label]
    return CellPaintingDatasetCached(split_rows, dataset_config, dataset_config.transforms)


# One dataset per split, built in the same order as before: train, test, val.
train_dataset = _dataset_for_split('TRAIN')
test_dataset = _dataset_for_split('TEST')
val_dataset = _dataset_for_split('VAL')

In [7]:
from transformers import default_data_collator
import numpy as np
from datasets import load_metric

# Load the "accuracy" metric object through `datasets.load_metric`.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the pinned version.
metric = load_metric("accuracy")
# Use the stock transformers collator for batching.
data_collator = default_data_collator
def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``(predictions, labels)`` pair.

    The accuracy is computed directly with numpy, so the function does not
    depend on any module-level metric object.

    Args:
        eval_pred: Two-item sequence ``(predictions, labels)``.
            ``predictions`` holds per-class scores with the class axis at
            position 1; when it is a tuple (several model outputs), its first
            element is taken as the score array. ``labels`` holds the class
            indices to compare against.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is a Python float, the
        fraction of samples whose arg-max class equals the label.
    """
    predictions, labels = eval_pred
    # Models that emit extra outputs hand back a tuple; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1).reshape(-1)
    # Flatten so a column-vector of labels cannot broadcast against the
    # predictions into an (N, N) comparison.
    references = np.asarray(labels).reshape(-1)
    return {"accuracy": float(np.mean(predicted_classes == references))}

ModuleNotFoundError: No module named 'transformers'

In [None]:
# Training configuration for the Hugging Face Trainer.
args = TrainingArguments(
    "test",  # output directory for checkpoints
    do_train=True,
    do_eval=True,
    # Log and checkpoint every 20 optimisation steps; evaluation is also
    # step-based.
    logging_steps=20,
    save_steps=20,
    evaluation_strategy = "steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    dataloader_num_workers=24,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # Was True. With True the Trainer keeps only the loss during evaluation
    # and discards the logits, so the `compute_metrics` callback given to the
    # Trainer is never invoked and no accuracy is reported.
    prediction_loss_only=False,
    report_to='wandb',
    logging_dir='logs',
)

In [None]:
# Choose the device at run time: a hard-coded 'cuda' raises on machines
# without a CUDA GPU, so fall back to the CPU there.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ViT().to(device)

# Wire the model, training arguments, train/validation datasets, collator and
# metric callback into the Trainer.
trainer = Trainer(
    model,
    args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)