In [3]:
import cosmos
from cosmos.execution_types import TRAINING_MODEL

# Host handed to cosmos.initialization; the captured log below shows cosmos
# opening an SSH connection to it and verifying the remote executions path.
login_host = "alogin1.bsc.es"
cosmos.initialization(host=login_host)

[cosmos.initialization] Creating SSH connection to alogin1.bsc.es
[cosmos.initialization] Connection established and remote path '/gpfs/projects/bsc14/executions' verified.


In [None]:
# Run `get_dataset` from `trans_cpt.preprocessing` through cosmos.
# The keyword arguments forwarded to the function are kept in their own dict
# so the dataset being requested is easy to spot and change.
dataset_kwargs = {
    "repository": "huggingface",
    "dataset_name": "DT4H/the_chilean_waiting_list_corpus",
}
dataset_requirements = ["python-dotenv", "datasets", "fsspec"]

# NOTE(review): execute_with_slurm=False presumably bypasses the Slurm queue
# even though queue/partition are still supplied — confirm against cosmos docs.
result = cosmos.run(
    module_path="trans_cpt.preprocessing",
    function_name="get_dataset",
    queue="acc_debug",
    user="bsc14",
    kwargs=dataset_kwargs,
    requirements=dataset_requirements,
    modules=[],
    partition="debug",
    nodes=1,
    cpus=20,
    gpus=1,
    venv_path="/gpfs/projects/bsc14/environments/trans_cpt",
    watch=True,
    execute_with_slurm=False,
)

In [4]:
# Submit `training_pipeline` from `trans_cpt.training` through cosmos, wrapped
# in the `accelerate launch` custom command. The return value is stored in
# `job`, which the status cell further down passes to cosmos.check_status.
training_args = {
    "data_path": "/gpfs/projects/bsc14/abecerr1/datasets/DT4H___wikipedia_cardiology_es/default/0.0.0/b20f70bf02ea8c0f5e0181e333b7b9ab3c610c4f",
}
training_requirements = [
    "datasets",
    "transformers",
    "torch",
    "accelerate",
    "tqdm",
    "tensorboard",
]
training_modules = ["cuda/12.6"]

job = cosmos.run(
    module_path="trans_cpt.training",
    function_name="training_pipeline",
    queue="acc_debug",
    user="bsc14",
    args=[training_args],  # single positional argument: the settings dict
    requirements=training_requirements,
    modules=training_modules,
    partition="debug",
    nodes=1,
    cpus=80,
    gpus=4,
    venv_path="/gpfs/projects/bsc14/environments/trans_cpt",
    custom_command="accelerate launch --config_file ./trans_cpt/accelerate_config.yaml",
    execution_type=TRAINING_MODEL,
    training_logs_path="training_logs",
    watch=True,
)


[cosmos.run] Preparing configuration to execute 'training_pipeline'
[cosmos.run] Creating virtual environment
[cosmos.run] All requirements already installed
[cosmos.run] Environment ready in /gpfs/projects/bsc14/environments/trans_cpt. Requirements: ['datasets', 'transformers', 'torch', 'accelerate', 'tqdm', 'tensorboard']
[cosmos.run] Output remote logs: /gpfs/projects/bsc14/executions/job_20250205_152231/job_20250205_152231.out
[cosmos.run] Error remote logs: /gpfs/projects/bsc14/executions/job_20250205_152231/job_20250205_152231.err
[cosmos.run] Submitted batch job 15232383
[cosmos.run] Job job_20250205_152231 (ID: 15232383) sent.

[job.out_file] === Dependencies installed ===                                                       
[job.out_file] absl-py==2.1.0
[job.out_file] accelerate==1.3.0
[job.out_file] aiohappyeyeballs==2.4.4
[job.out_file] aiohttp==3.11.11
[job.out_file] aiosignal==1.3.2
[job.out_file] attrs==24.3.0
[job.out_file] certifi==2024.12.14
[job.out_file] charset-no

In [None]:
# Pass the `job` returned by the training `cosmos.run` cell to
# cosmos.check_status; as the cell's last expression, its result is displayed.
cosmos.check_status(job)
# Other calls that take the same `job`, left disabled — uncomment as needed:
# cosmos.print_logs(job)
# cosmos.cancel_job(job)

In [None]:
# Submit `inference_pipeline` from `trans_cpt.training` through cosmos with a
# single text containing a `<mask>` token.
#
# The text is assembled with implicit string-literal concatenation, which
# inserts nothing between adjacent literals: every fragment that ends a word
# must carry its own trailing space. Two fragments previously lacked it and
# produced "fragilis,se comenzó" and "tratamientodiurético" in the input text.
# NOTE(review): "senegativizaron" and "favorables" look like typos in the
# source text itself; they are left untouched here — confirm before editing.
masked_text = (
    "Con el diagnóstico de endocarditis infecciosa sobre válvula protésica por Bacteroides fragilis, "
    "se comenzó tratamiento con metronidazol 500 mg/8 horas y amoxicilina-clavulánico 1000 mg/200mg/8 "
    "horas intravenoso. La paciente permaneció <mask> durante todo el ingreso, senegativizaron los hemocultivos "
    "de forma precoz y evolucionó de forma favorables de su ligera descompensación cardiaca con tratamiento "
    "diurético. Tras 6 semanas de tratamiento antibiótico intravenoso dirigido, estando estable hemodinámicamente "
    "y en buena clase funcional se dio de alta hospitalaria."
)

result = cosmos.run(
    module_path="trans_cpt.training",
    function_name="inference_pipeline",
    queue="acc_debug",
    user="bsc14",
    args=[{
        "model_path": "/gpfs/projects/bsc14/storage/models/transcpt/CardioBERTa_2025-01-17_15-27-01",
        "text": masked_text,
    }],
    requirements=[
        "datasets",
        "transformers",
        "torch",
        "accelerate",
        "tqdm",
        "tensorboard"
    ],
    modules=[
        "cuda/12.6"
    ],
    partition="debug",
    nodes=1,
    cpus=80,
    gpus=4,
    venv_path="/gpfs/projects/bsc14/environments/trans_cpt",
    custom_command="accelerate launch --config_file ./trans_cpt/accelerate_config.yaml",
    watch=True,
)