In [12]:
import sys
sys.path.insert(0, "../..")

In [13]:
from src.datasets import get_arabic
from src.settings import presets

In [14]:
presets

Settings(datadir=PosixPath('/home/admindme/code/MLopdracht/data/raw'), testurl=HttpUrl('https://archive.ics.uci.edu/ml/machine-learning-databases/00195/Test_Arabic_Digit.txt', ), trainurl=HttpUrl('https://archive.ics.uci.edu/ml/machine-learning-databases/00195/Train_Arabic_Digit.txt', ), testfile=PosixPath('ArabicTest.txt'), trainfile=PosixPath('ArabicTrain.txt'), modeldir=PosixPath('/home/admindme/code/MLopdracht/models'), logdir=PosixPath('/home/admindme/code/MLopdracht/logs'), modelname='model.pt', batchsize=64)

In [15]:
trainstreamer, teststreamer = get_arabic(presets)

2023-06-03 13:47:45.351 | INFO     | src.data_tools:get_file:61 - File /home/admindme/code/MLopdracht/data/raw/ArabicTrain.txt already exists, skip download
2023-06-03 13:47:45.352 | INFO     | src.data_tools:get_file:61 - File /home/admindme/code/MLopdracht/data/raw/ArabicTest.txt already exists, skip download
2023-06-03 13:47:45.353 | INFO     | src.datasets:get_arabic:33 - Loading data from /home/admindme/code/MLopdracht/data/raw/ArabicTrain.txt and /home/admindme/code/MLopdracht/data/raw/ArabicTest.txt
2023-06-03 13:47:46.216 | INFO     | src.datasets:get_arabic:49 - Returning trainstreamer, teststreamer


In [16]:
x, y = next(iter(trainstreamer.stream()))
x.shape, y.shape

(torch.Size([64, 59, 13]), torch.Size([64]))

In [1]:
from ray import tune
import os
import shutil
import time
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from ray.tune import JupyterNotebookReporter

%matplotlib inline
plt.style.use('dark_background')
import plotly.io as pio
pio.renderers.default = 'plotly_mimetype+notebook'

import sys
sys.path.insert(0, "../../")

DELETE = True # to delete the tunedir at the end of the notebook

In [2]:
NUM_SAMPLES = 40 # we run 40 experiments
NUM_DATA = 200 # our data has 200 observations
MAX_ITER = 15 # we run every experiment for a max of 15 epochs
MODE = "min" # we want to maximize the mean. This can also be "min"

In [3]:
from hypertune import train

In [4]:
from pathlib import Path

tune_dir = Path("../../models/ray/")
tune_dir.exists(), tune_dir.resolve()

(True, PosixPath('/home/admindme/code/MLopdracht/models/ray'))

In [None]:
timer = {}
best_config = {}

In [9]:
from src import datasets
from src.models import rnn_models, metrics, train_model
from src.settingshypertuning import SearchSpace
from pathlib import Path
from ray.tune import JupyterNotebookReporter
from ray import tune
import torch
import ray
from typing import Dict
from ray.tune import CLIReporter
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.schedulers.hb_bohb import HyperBandForBOHB
from ray.tune.search.bohb import TuneBOHB
from loguru import logger
from filelock import FileLock


def train(config: Dict, checkpoint_dir=None):
    """
    The train function should receive a config file, which is a Dict
    ray will modify the values inside the config before it is passed to the train
    function.
    """

    # we lock the datadir to avoid parallel instances trying to
    # access the datadir
    data_dir = config["data_dir"]
    with FileLock(data_dir / ".lock"):
        trainloader, testloader = datasets.get_arabic(
            data_dir=data_dir, split=0.8, batchsize=64
        )

    # we set up the metric
    accuracy = metrics.Accuracy()
    # and create the model with the config
    model = rnn_models.GRUmodel(config)

    # and we start training.
    # because we set tunewriter=True
    # the trainloop wont try to report back to tensorboard,
    # but will report back with tune.report
    # this way, ray will know whats going on,
    # and can start/pause/stop a loop
    model = train_model.trainloop(
        epochs=50,
        model=model,
        optimizer=torch.optim.Adam,
        learning_rate=1e-3,
        loss_fn=torch.nn.CrossEntropyLoss(),
        metrics=[accuracy],
        train_dataloader=trainloader,
        test_dataloader=testloader,
        log_dir=".",
        train_steps=len(trainloader),
        eval_steps=len(testloader),
        patience=5,
        factor=0.5,
        tunewriter=["ray"],
    )


if __name__ == "__main__":
    if ray.is_initialized():
        ray.shutdown()
    ray.init()

    # have a look in src.settings to see how SearchSpace is created.
    # If you want to search other ranges, you change this in the settings file.
    config = SearchSpace(
        input_size=3,
        output_size=20,
        tune_dir=Path("models/ray").resolve(),
        data_dir=Path("data/raw/").resolve(),
    )

    reporter = CLIReporter()
    reporter.add_metric_column("Accuracy")

    bohb_hyperband = HyperBandForBOHB(
        time_attr="training_iteration",
        max_t=50,
        reduction_factor=3,
        stop_last_trials=False,
    )

    bohb_search = TuneBOHB()

    analysis = tune.run(
        train,
        config=config.dict(),
        metric="test_loss",
        mode="min",
        progress_reporter=reporter,
        local_dir=config.tune_dir,
        num_samples=50,
        search_alg=bohb_search,
        scheduler=bohb_hyperband,
        verbose=1,
    )

    ray.shutdown()

2023-06-05 17:53:51,614	INFO worker.py:1625 -- Started a local Ray instance.


FileNotFoundError: Make sure the datadir exists.
 Found /home/admindme/code/MLopdracht/dev/notebooks/data/raw to be non-existing.