In [None]:
import pandas as pd
import os
import numpy as np
import wandb 
import torch
import yaml
import import_ipynb
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig   
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import set_seed, enable_full_determinism
from datasets import Dataset, DatasetDict, disable_caching

In [None]:
# Turn off the `datasets` on-disk cache so every `map` call below is recomputed
# instead of being reloaded from cache files written by an earlier run.
disable_caching()

In [None]:
import helpers

In [None]:
# The W&B API key is a secret and must not be stored in the notebook.
# Export WANDB_API_KEY in the shell that launches Jupyter (or run `wandb login`
# once on this machine); if no key is found, wandb.login() prompts for it.
wandb.login()

## Import config 

In [None]:
# Parse the experiment configuration. The context manager guarantees the file
# handle is closed as soon as parsing finishes.
with open('config.yaml') as config_file:
    conf = yaml.safe_load(config_file)

In [None]:
# Model-related settings for the sweep; read further down for the checkpoint
# name ('model') and the number of output classes ('num_labels').
config_model = conf['sweep_model']

## Reproducibility

In [None]:
# Seed taken from config.yaml and handed to the project helper.
# NOTE(review): the helper lives in the local `helpers` notebook and is not visible
# here; presumably it seeds the Python/NumPy/torch RNGs — confirm.
REPRO_SEED = conf['seeds']['repro_seed']
helpers.enable_reproducability(REPRO_SEED)

## Setup Torch Device

In [None]:
# Expose only the GPU with physical index 1 to this process. The variable has to be
# set before CUDA is first initialised, so keep this cell ahead of any CUDA call.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# Query CUDA availability once, report it, and fall back to the CPU when no GPU is usable.
cuda_available = torch.cuda.is_available()
print("GPU is available: ", cuda_available)
device = torch.device('cuda') if cuda_available else torch.device('cpu')

## Prepare Data

In [None]:
# Checkpoint identifier passed to the tokenizer, config and model `from_pretrained` calls.
MODEL = config_model['model']
# Seed forwarded to helpers.downsample whenever a downsampled training set is built.
SAMPLING_SEED = conf['seeds']['sampling_seed']

In [None]:
# Load the training and validation frames.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
# NOTE(review): the name `train` is rebound by the `train` function defined further
# down; after that definition has run, this cell must be re-executed before any of the
# data-preparation cells can be run again.
train = pd.read_pickle('path_to_data/train.pkl')
val = pd.read_pickle('path_to_data/val.pkl')

In [None]:
# Tokenizer belonging to the configured checkpoint; it is used by `tokenize` below and
# is also handed to the Trainer in the training function.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def tokenize(batch):
    """Encode the ``"text"`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Truncation is enabled and ``padding=True`` is requested, i.e. padding up to
    the longest sequence contained in this single call.

    Args:
        batch: Mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        The encoding object produced by the tokenizer.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

def prepare_data(train, val, remove_footer, remove_emojis, downsampling, sampling_seed):
    """Build the tokenized train/validation datasets for one preprocessing variant.

    Args:
        train: Training DataFrame; after ``helpers.select_text`` it must provide
            the columns ``"text"`` and ``"label"``.
        val: Validation DataFrame with the same requirements.
        remove_footer: Forwarded to ``helpers.select_text``.
        remove_emojis: Forwarded to ``helpers.select_text``.
        downsampling: When truthy, the training frame is first passed through
            ``helpers.downsample`` (the validation frame is never downsampled).
        sampling_seed: Seed forwarded to ``helpers.downsample``.

    Returns:
        A ``DatasetDict`` with the splits ``"train"`` and ``"validation"``
        containing the label and the tokenizer outputs. The raw text, the pandas
        index column and ``token_type_ids`` are removed if they are present.
    """
    if downsampling:
        train = helpers.downsample(train, sampling_seed)

    train = helpers.select_text(train, remove_footer, remove_emojis)
    val = helpers.select_text(val, remove_footer, remove_emojis)

    ds = DatasetDict({
        "train": Dataset.from_pandas(train[["text", "label"]]),
        "validation": Dataset.from_pandas(val[["text", "label"]]),
    })
    # No `batched=True`: `tokenize` is invoked once per example.
    ds_encoded = ds.map(tokenize)

    # `__index_level_0__` only exists when the DataFrame carried a non-default index,
    # and `token_type_ids` is only produced by some tokenizers. `remove_columns`
    # raises on unknown names, so drop just the columns that really exist.
    unwanted = ("text", "__index_level_0__", "token_type_ids")
    for split in ("train", "validation"):
        present = [name for name in unwanted if name in ds_encoded[split].column_names]
        if present:
            ds_encoded[split] = ds_encoded[split].remove_columns(present)

    return ds_encoded

In [None]:
# Variant "tt": footer removed, emojis removed, full training set.
t_tmp, v_tmp = train.copy(), val.copy()
tt = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=True,
    remove_emojis=True,
    downsampling=False,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "ttd": footer removed, emojis removed, downsampled training set.
t_tmp, v_tmp = train.copy(), val.copy()
ttd = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=True,
    remove_emojis=True,
    downsampling=True,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "ff": footer kept, emojis kept, full training set.
t_tmp, v_tmp = train.copy(), val.copy()
ff = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=False,
    remove_emojis=False,
    downsampling=False,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "ffd": footer kept, emojis kept, downsampled training set.
t_tmp, v_tmp = train.copy(), val.copy()
ffd = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=False,
    remove_emojis=False,
    downsampling=True,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "tf": footer removed, emojis kept, full training set.
t_tmp, v_tmp = train.copy(), val.copy()
tf = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=True,
    remove_emojis=False,
    downsampling=False,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "tfd": footer removed, emojis kept, downsampled training set.
t_tmp, v_tmp = train.copy(), val.copy()
tfd = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=True,
    remove_emojis=False,
    downsampling=True,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "ft": footer kept, emojis removed, full training set.
t_tmp, v_tmp = train.copy(), val.copy()
ft = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=False,
    remove_emojis=True,
    downsampling=False,
    sampling_seed=SAMPLING_SEED,
)

In [None]:
# Variant "ftd": footer kept, emojis removed, downsampled training set.
t_tmp, v_tmp = train.copy(), val.copy()
ftd = prepare_data(
    t_tmp,
    v_tmp,
    remove_footer=False,
    remove_emojis=True,
    downsampling=True,
    sampling_seed=SAMPLING_SEED,
)

## Sweep config

### For bayesian hyperparameter tuning:
- Change configuration to 'sweep_config_bayes' 
- Adapt the training-argument settings in the `train` function

In [None]:
# Sweep definition selected from config.yaml (the "grid" variant).
sweep_config = conf["sweep_config_grid"]

# TrainingArguments values that stay fixed across trials; read by the training function.
sweep_trainingargs = sweep_config["sweep_trainingargs"]

# Expose the search space under the key "parameters" by aliasing the config's
# "sweep_parameters" section (this writes into the dict held by `conf`).
sweep_config["parameters"] = sweep_config["sweep_parameters"]

## Initialize sweep

In [None]:
# Register the sweep with W&B; the returned id is what the agent at the end of the
# notebook uses to fetch trials.
sweep_id = wandb.sweep(sweep_config, project='project name')

In [None]:
def train(config=None):
    """Run a single sweep trial: select a prepared dataset variant and fine-tune on it.

    The function is handed to ``wandb.agent`` as the sweep's run function.
    ``config`` is forwarded to ``wandb.init``; inside the run all hyperparameters
    are read from ``wandb.config``.

    NOTE(review): this definition rebinds the notebook-level name ``train``, which
    the data-loading cell uses for the training DataFrame. Re-running the
    data-preparation cells after this cell has executed fails, because ``train``
    is then this function.
    """
    with wandb.init(config=config, resume=True):
        # Hyperparameters of the current trial.
        config = wandb.config

        # Choose the pre-tokenized dataset. Variable names encode
        # <remove_footer><remove_emojis>[d = downsampled], with t/f for True/False.
        remove_footer = config.remove_footer
        remove_emojis = config.remove_emojis
        downsampling = config.downsampling

        if remove_footer and remove_emojis:
            ds_encoded = ttd if downsampling else tt
        elif remove_footer:
            ds_encoded = tfd if downsampling else tf
        elif remove_emojis:
            ds_encoded = ftd if downsampling else ft
        else:
            ds_encoded = ffd if downsampling else ff

        # Arguments tuned by the sweep.
        tuned_args = dict(
            num_train_epochs=config.epochs,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=config.batch_size,
        )

        # Arguments held constant for every trial, copied from the YAML config.
        fixed_arg_names = (
            'per_device_eval_batch_size',
            'save_strategy',
            'evaluation_strategy',
            'metric_for_best_model',
            'load_best_model_at_end',
            'remove_unused_columns',
            'logging_strategy',
            'log_level',
            'disable_tqdm',
        )
        fixed_args = {name: sweep_trainingargs[name] for name in fixed_arg_names}

        # NOTE(review): the output_dir literal ends with a space — confirm this is intended.
        training_args = TrainingArguments(
            output_dir="output_path ",
            report_to='wandb',
            **tuned_args,
            **fixed_args,
        )

        def model_init():
            """Create a fresh classifier configured with this trial's dropout rates."""
            hf_config = AutoConfig.from_pretrained(MODEL)
            hf_config.num_labels = config_model['num_labels']
            hf_config.hidden_dropout_prob = config.hidden_dropout_prob
            hf_config.attention_probs_dropout_prob = config.attention_probs_dropout_prob

            classifier = AutoModelForSequenceClassification.from_pretrained(MODEL, config=hf_config)
            return classifier.to(device)

        # Assemble the trainer; the model is built via `model_init`.
        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=ds_encoded["train"],
            eval_dataset=ds_encoded["validation"],
            tokenizer=tokenizer,
            compute_metrics=helpers.compute_metrics,
        )

        # Run the fine-tuning loop.
        trainer.train()

In [None]:
# Start the sweep agent; it calls `train` for every trial it receives.
wandb.agent(sweep_id, function=train)  # optionally pass count=<n> to limit the number of runs