In [53]:
import argparse
import glob
import json
import os
from itertools import chain
import boto3

import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold
from collections import defaultdict
import sys 
from IPython.display import Image, display, HTML

In [54]:
# Expose the modules under code/utils and code/model to the import cells below.
sys.path.extend(['/root/code/utils', '/root/code/model'])

In [55]:
#---------------------------------------------------
# UTILS
#---------------------------------------------------

from metric_utils import compute_metrics
from train_utils import (EMA, AverageMeter, as_minutes, get_lr,
                                   print_line, init_wandb
                                  )
from dotdict import dotdict


#---------------------------------------------------
# DATASET, MODEL, AND DATALOADER
#---------------------------------------------------


from benetech_dataset import (TOKEN_MAP, BenetechDataset,
                                     create_train_transforms)
from benetech_dataloader import BenetechCollator
from benetech_model import BenetechModel

In [None]:
def num2str(num):
    """Render a value in scientific notation with two decimals.

    :param num: value to format
    :return: the formatted string (e.g. ``"1.23e+04"``); if the value cannot be
        formatted this way it is returned unchanged
    """
    template = "{:.2e}"
    try:
        formatted = template.format(num)
    except Exception:
        # Best effort: anything that does not support the "e" format code
        # (strings, None, ...) is handed back untouched.
        return num
    return formatted


def _process_json(fp):
    """process JSON files with annotations

    :param fp: file path
    :type fp: str
    :return: parsed annotation
    :rtype: dict
    """

    # read annotations ---
    with open(fp, "r") as f:
        anno = json.load(f)

    # store necessary data for labels ---
    chart_id = fp.split("/")[-1].split(".")[0]
    chart_source = anno["source"]
    chart_type = anno['chart-type']

    labels = []

    labels.append(
        {
            "id": chart_id,
            "source": chart_source,
            "chart_type": chart_type,
        }
    )

    return labels

# Create Folds Dataframe

In [None]:
# Root of the local copy of the dataset; annotation files are read from
# <local_data_path>/train/annotations/*.json.
local_data_path = '/root/data'

In [None]:
def process_annotations(anno_paths, num_jobs=8):
    """Parse annotation files in parallel into a single labels dataframe.

    :param anno_paths: either the dataset root directory (annotation files are
        then looked up under ``<root>/train/annotations/*.json``) or an
        iterable of annotation file paths
    :type anno_paths: str | os.PathLike | Iterable[str]
    :param num_jobs: number of parallel workers, defaults to 8
    :type num_jobs: int, optional
    :return: one row per annotation file with the columns produced by
        ``_process_json``
    :rtype: pd.DataFrame
    """
    # Use the argument instead of silently falling back to the notebook-level
    # `local_data_path`; a single path is interpreted as the dataset root.
    if isinstance(anno_paths, (str, os.PathLike)):
        anno_paths = glob.glob(f"{anno_paths}/train/annotations/*.json")

    # glob order depends on the filesystem; sorting keeps the row order, and
    # therefore the seeded fold split built from it, reproducible.
    anno_paths = sorted(anno_paths)

    annotations = Parallel(n_jobs=num_jobs, verbose=1, backend="threading")(
        delayed(_process_json)(file_path) for file_path in anno_paths
    )
    # Each worker returns a list of records; flatten them into one table.
    labels_df = pd.DataFrame(list(chain.from_iterable(annotations)))
    return labels_df

In [None]:
# Parse the annotation files under the local data root into one labels table.
print("creating folds ...")
fold_df = process_annotations(local_data_path)

In [None]:
fold_df.head()

In [None]:
# Keep only the columns needed for fold assignment, drop repeated rows and
# renumber the index from 0.
fold_df = (
    fold_df[["id", "source", "chart_type"]]
    .copy()
    .drop_duplicates()
    .reset_index(drop=True)
)
fold_df.head()

In [None]:
len(fold_df)

In [None]:
# Four shuffled, stratified splits; the fixed seed keeps the shuffle repeatable.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

In [None]:
# Tag every row with the index of the split in which it is a validation
# sample, stratifying on chart type.
for fold_idx, (_, valid_idx) in enumerate(skf.split(fold_df, fold_df["chart_type"].values)):
    fold_df.loc[valid_idx, "kfold"] = fold_idx
# The column is created with NaNs (float) by the first assignment; make it integer.
fold_df["kfold"] = fold_df["kfold"].astype(int)

# Only charts whose source is "extracted" keep their fold index; every other
# row gets the sentinel 99, which is not one of the split indices. Vectorized
# `where` replaces the row-wise apply with positional x[0] / x[1] lookups,
# which newer pandas deprecates on label-indexed Series.
fold_df["kfold"] = fold_df["kfold"].where(fold_df["source"] == "extracted", 99)

# Upload Fold Data to S3

In [None]:
# Persist only the chart-id -> fold mapping.
fold_df_final = fold_df[["id", "kfold"]].copy().reset_index(drop=True)
fold_df_final.to_parquet('cv_map_4_folds.parquet')

In [None]:
# Upload the fold map written by the previous cell to the project bucket.
file_name = 'cv_map_4_folds.parquet'
s3 = boto3.resource('s3')
s3.meta.client.upload_file(
    Filename=file_name,
    Bucket='sagemaker-benetech',
    Key='cv_map_4_folds.parquet',
)

# Wandb Login

In [None]:
# Log this notebook session in to Weights & Biases.
import wandb
wandb.login()

In [None]:
# NOTE(review): presumably stores the W&B credentials inside the training
# source directory so the SageMaker job can authenticate — confirm against
# the wandb documentation and keep the generated file out of version control.
wandb.sagemaker_auth(path="/root/benetech-aws/")

# Make Sure Images and Annotations Match - Inspect the IDs That Don't

In [None]:
import s3fs
# NOTE: `s3` is rebound here from the boto3 resource created earlier to an
# s3fs filesystem, and `anno_paths` now holds S3 keys rather than local paths.
s3 = s3fs.S3FileSystem(anon=False)
# List everything under the annotation and image prefixes of the training bucket.
anno_paths = s3.glob('s3://sagemaker-benetech/train/annotations/*')
img_paths =  s3.glob('s3://sagemaker-benetech/train/images/*')

In [None]:
img_paths[0].split('/')[-1].split('.')[0]

In [None]:
# Reduce every listed key to its bare ID: the last path component, cut at the
# first ".".
img_id = [key.rsplit('/', 1)[-1].partition('.')[0] for key in img_paths]
anno_id = [key.rsplit('/', 1)[-1].partition('.')[0] for key in anno_paths]

In [None]:
# IDs that appear in only one of the two listings (an image without an
# annotation, or an annotation without an image).
list(set(img_id).symmetric_difference(anno_id))

# Hyperparameters

In [227]:
# Settings forwarded to the training entry point through the estimator's
# `hyperparameters` argument. Each key appears exactly once ('all_data' used
# to be listed twice, so editing the first occurrence had no effect).
hyperparams = {
    # model / input sizes
    'backbone_path': "google/matcha-base",
    'max_length': 1024,
    'max_patches': 2048,
    'patch_size': 16,
    # batching / schedule
    'train_bs': 1,
    'valid_bs': 1,
    'num_epochs': 2,
    'use_augmentations': True,
    # NOTE(review): the zero-valued tokenizer settings below look like
    # placeholders that the training script overwrites — confirm.
    'len_tokenizer': 0,
    'pad_token_id': 0,
    'decoder_start_token_id': 0,
    'bos_token_id': 0,
    # optimisation
    'optimizer_lr': 2e-5,
    'optimizer_weight_decay': 1e-5,
    'grad_accumulation': 8,
    'warmup_pct': 0.05,
    # data mix / cross-validation (n_folds matches the 4 splits built above)
    'extracted_multiplier': 16,
    'original_multiplier': 3,
    'n_folds': 4,
    'all_data': False,
    'use_random_seed': True,
    'fold': 0,
    # checkpointing / evaluation
    'save_trigger': -1.0,
    'use_ema': False,
    'decay_rate': 0.9925,
    'grad_clip_value': 5.0,
    'eval_frequency': 30000,
    'max_length_generation': 16,
    'patience': 100,
    # experiment tracking
    'use_wandb': True,
    'project': 'benetech-aws',
    'run_name': 'final-v1',
}

In [228]:
# Wrap the plain dict in the project's dotdict helper; the next cell reads a
# key through attribute access (hyperparams.n_folds).
hyperparams = dotdict(hyperparams)

In [229]:
type(hyperparams.n_folds)

int

# Train

In [225]:
from sagemaker.pytorch import PyTorch
import sagemaker
# Execution role of the current SageMaker session; handed to the estimator below.
role = sagemaker.get_execution_role()

In [230]:
# Single-instance PyTorch training job that runs train.py from the
# benetech-aws source directory with the hyperparameters defined above.
estimator = PyTorch(
    entry_point="train.py",
    source_dir="benetech-aws",
    role=role,
    instance_type="ml.g5.2xlarge",
    instance_count=1,
    framework_version="1.13.1",
    py_version="py39",
    hyperparameters=hyperparams,
)

In [None]:
# Start the training job with the S3 training prefix as its input; the job's
# log is streamed into the cell output below.
estimator.fit('s3://sagemaker-benetech/train/')

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-07-26-16-25-00-324


2023-07-26 16:25:01 Starting - Starting the training job...
2023-07-26 16:25:21 Starting - Preparing the instances for training......
2023-07-26 16:26:32 Downloading - Downloading input data............
2023-07-26 16:28:33 Training - Downloading the training image......
2023-07-26 16:29:34 Training - Training image download completed. Training in progress.....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-07-26 16:29:57,623 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-07-26 16:29:57,638 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-07-26 16:29:57,648 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-07-26 16:29:57,649 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2023-07-26 16:29:57,848 s

# Inspect Output

In [None]:
import boto3

# NOTE: `s3` is rebound once more, this time to a low-level boto3 client.
s3 = boto3.client('s3')
# Fetch the output archive of a finished training job.
# NOTE(review): the job name is hard-coded (2023-07-25 run) and the archive is
# saved relative to the current working directory, while the extraction cell
# reads '/root/outputs.tar.gz' — this only lines up when the notebook runs
# from /root.
s3.download_file('sagemaker-us-east-1-905609232955', 'pytorch-training-2023-07-25-16-53-39-256/output/output.tar.gz', 'outputs.tar.gz')

In [None]:
import tarfile

fname = '/root/outputs.tar.gz'

# Pick the tarfile mode from the file extension; any other extension is
# skipped without extracting, as before.
if fname.endswith("tar.gz"):
    tar_mode = "r:gz"
elif fname.endswith("tar"):
    tar_mode = "r:"
else:
    tar_mode = None

if tar_mode is not None:
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths inside the archive;
    # only use it on archives produced by our own training jobs.
    with tarfile.open(fname, tar_mode) as tar:
        tar.extractall(path='/root/outputs/')

In [None]:
# NOTE(review): makes /root/outputs importable, but nothing in the visible
# cells imports from it — confirm this is still needed.
sys.path.append('/root/outputs')

In [None]:
# Load the fold-0 results table from the extracted training output.
# NOTE(review): relative path — resolves to the extraction directory
# /root/outputs/ only when the working directory is /root.
output_df = pd.read_csv('outputs/result_df_fold_0_best.csv')