# Fine-tune GPT-J on SageMaker Training

In [14]:
!pip install sagemaker boto3 --upgrade

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


## Build container

In [15]:
%%writefile ../Finetune_GPTNEO_GPTJ6B/train
#!/bin/bash
# Training entry point written into the image build context.
#
# Every $variable below is expected to be present in the process environment;
# the notebook supplies them through the Estimator's `environment` argument.
# The train/validation CSV paths are the fixed /opt/ml/input/data/<channel>
# locations used on the command line below.

# Log the available disk space before training starts.
df -h

# Stop right away if the repository directory is missing, instead of letting
# deepspeed start from the wrong working directory.
cd finetuning_repo || exit 1

# Expansions are quoted so that each value is passed as exactly one argument.
deepspeed --num_gpus="$num_gpus" run_clm.py \
    --deepspeed "$deepspeed" \
    --model_name_or_path EleutherAI/gpt-j-6B \
    --train_file /opt/ml/input/data/train/train.csv \
    --validation_file /opt/ml/input/data/validation/validation.csv \
    --do_train \
    --do_eval \
    --fp16 \
    --overwrite_cache \
    --evaluation_strategy="$evaluation_strategy" \
    --output_dir "$output_dir" \
    --num_train_epochs "$num_train_epochs" \
    --eval_steps "$eval_steps" \
    --gradient_accumulation_steps "$gradient_accumulation_steps" \
    --per_device_train_batch_size "$per_device_train_batch_size" \
    --use_fast_tokenizer "$use_fast_tokenizer" \
    --learning_rate "$learning_rate" \
    --warmup_steps "$warmup_steps" \
    --save_total_limit "$save_total_limit" \
    --save_steps "$save_steps" \
    --save_strategy "$save_strategy" \
    --tokenizer_name "$tokenizer_name" \
    --load_best_model_at_end="$load_best_model_at_end" \
    --block_size="$block_size" \
    --weight_decay="$weight_decay"

Overwriting ../Finetune_GPTNEO_GPTJ6B/train


In [16]:
%%sh
# Mark the generated `train` entry point as executable.
# The file is addressed by its relative path rather than after a separate
# `cd`, so a failed directory change can never make chmod act on an unrelated
# `train` file in the current working directory.
chmod +x ../Finetune_GPTNEO_GPTJ6B/train


In [17]:
%%sh
# Run the project's build_push_image.sh from inside the build-context directory
# (presumably builds the Docker image and pushes it to ECR - see the script).
# `set -e` aborts the cell as soon as a command fails, so a failed `cd` is
# reported as such instead of surfacing as a confusing "script not found".
set -e
cd ../Finetune_GPTNEO_GPTJ6B
./build_push_image.sh

Login Succeeded
Sending build context to Docker daemon  65.28MB
Step 1/20 : FROM nvidia/cuda:11.7.1-devel-ubuntu20.04
 ---> c4f8d3c928f4
Step 2/20 : ARG DEBIAN_FRONTEND=noninteractive
 ---> Using cache
 ---> 88306a3cab77
Step 3/20 : SHELL [ "/bin/bash","-c" ]
 ---> Using cache
 ---> 823a73990dba
Step 4/20 : ENV NVIDIA_VISIBLE_DEVICES all
 ---> Using cache
 ---> 72cf8e27e0ab
Step 5/20 : ENV NVIDIA_DRIVER_CAPABILITIES compute,video,utility
 ---> Using cache
 ---> 9490e3109300
Step 6/20 : RUN apt update -y && apt upgrade -y
 ---> Using cache
 ---> 969f84ed6f63
Step 7/20 : RUN apt install wget -y && apt install git -y && apt install libaio-dev -y && apt install libaio1 -y
 ---> Using cache
 ---> f42adf5187a7
Step 8/20 : RUN apt install python3.9 -y && apt install python3-pip -y && apt install python-is-python3 -y
 ---> Using cache
 ---> 896beed91d20
Step 9/20 : RUN pip install torch torchvision torchaudio
 ---> Using cache
 ---> e18593d3bfb5
Step 10/20 : RUN pip install datasets && pip ins

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [18]:
import boto3
import sagemaker 
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

from sagemaker import local

# Session, execution role and the account/region used to address the ECR image.
# To debug in local mode instead, create `local.LocalSession()`, pass it as
# `sagemaker_session` below and set `instance_type='local_gpu'`.
sagemaker_session = sagemaker.Session()

role = get_execution_role()

account = sagemaker_session.boto_session.client('sts').get_caller_identity()['Account']
region = sagemaker_session.boto_session.region_name

# URI of the training image in this account's ECR registry.
image = '{}.dkr.ecr.{}.amazonaws.com/gptj-finetune:latest'.format(account, region)

# Environment variables handed to the training container. The names match the
# $variables consumed by the `train` script, which forwards them to
# `deepspeed ... run_clm.py`. All values must be strings.
training_environment = {
    "num_gpus": "8",
    "deepspeed": "ds_config_stage3.json",
    "evaluation_strategy": "steps",
    "output_dir": "/opt/ml/model/finetune",
    "num_train_epochs": "12",
    "eval_steps": "20",
    "gradient_accumulation_steps": "1",
    "per_device_train_batch_size": "4",
    "use_fast_tokenizer": "False",
    "learning_rate": "5e-06",
    "warmup_steps": "10",
    "save_total_limit": "1",
    "save_steps": "20",
    "save_strategy": "steps",
    "tokenizer_name": "gpt2",
    "load_best_model_at_end": "True",
    "block_size": "2048",
    "weight_decay": "0.1",
}

# Regexes that extract evaluation metrics from the training log.
# Raw strings are required here: `\.` inside a normal string literal is an
# invalid escape sequence and triggers a Deprecation/SyntaxWarning.
# Each pattern captures a decimal number such as `2.345`.
training_metric_definitions = [
    {'Name': 'eval:loss', 'Regex': r"'eval_loss': ([0-9]+\.[0-9]+)"},
    {'Name': 'eval:runtime', 'Regex': r"'eval_runtime': ([0-9]+\.[0-9]+)"},
    {'Name': 'eval:samples_per_second', 'Regex': r"'eval_samples_per_second': ([0-9]+\.[0-9]+)"},
    {'Name': 'eval:eval_steps_per_second', 'Regex': r"'eval_steps_per_second': ([0-9]+\.[0-9]+)"},
]

# Estimator for a single-instance training job that runs the custom image.
sm_model = sagemaker.estimator.Estimator(
    image_uri=image,
    role=role,
    instance_count=1,
    instance_type='ml.g5.48xlarge',
    sagemaker_session=sagemaker_session,
    environment=training_environment,
    metric_definitions=training_metric_definitions,
)
    



UnknownServiceError: Unknown service: 'sagemaker'. Valid service names are: accessanalyzer, account, acm, acm-pca, alexaforbusiness, amp, amplify, amplifybackend, amplifyuibuilder, apigateway, apigatewaymanagementapi, apigatewayv2, appconfig, appconfigdata, appflow, appintegrations, application-autoscaling, application-insights, applicationcostprofiler, appmesh, apprunner, appstream, appsync, arc-zonal-shift, athena, auditmanager, autoscaling, autoscaling-plans, backup, backup-gateway, backupstorage, batch, billingconductor, braket, budgets, ce, chime, chime-sdk-identity, chime-sdk-media-pipelines, chime-sdk-meetings, chime-sdk-messaging, chime-sdk-voice, cleanrooms, cloud9, cloudcontrol, clouddirectory, cloudformation, cloudfront, cloudhsm, cloudhsmv2, cloudsearch, cloudsearchdomain, cloudtrail, cloudtrail-data, cloudwatch, codeartifact, codebuild, codecatalyst, codecommit, codedeploy, codeguru-reviewer, codeguruprofiler, codepipeline, codestar, codestar-connections, codestar-notifications, cognito-identity, cognito-idp, cognito-sync, comprehend, comprehendmedical, compute-optimizer, config, connect, connect-contact-lens, connectcampaigns, connectcases, connectparticipant, controltower, cur, customer-profiles, databrew, dataexchange, datapipeline, datasync, dax, detective, devicefarm, devops-guru, directconnect, discovery, dlm, dms, docdb, docdb-elastic, drs, ds, dynamodb, dynamodbstreams, ebs, ec2, ec2-instance-connect, ecr, ecr-public, ecs, efs, eks, elastic-inference, elasticache, elasticbeanstalk, elastictranscoder, elb, elbv2, emr, emr-containers, emr-serverless, es, events, evidently, finspace, finspace-data, firehose, fis, fms, forecast, forecastquery, frauddetector, fsx, gamelift, gamesparks, glacier, globalaccelerator, glue, grafana, greengrass, greengrassv2, groundstation, guardduty, health, healthlake, honeycode, iam, identitystore, imagebuilder, importexport, inspector, inspector2, iot, iot-data, iot-jobs-data, iot-roborunner, iot1click-devices, 
iot1click-projects, iotanalytics, iotdeviceadvisor, iotevents, iotevents-data, iotfleethub, iotfleetwise, iotsecuretunneling, iotsitewise, iotthingsgraph, iottwinmaker, iotwireless, ivs, ivschat, kafka, kafkaconnect, kendra, kendra-ranking, keyspaces, kinesis, kinesis-video-archived-media, kinesis-video-media, kinesis-video-signaling, kinesis-video-webrtc-storage, kinesisanalytics, kinesisanalyticsv2, kinesisvideo, kms, lakeformation, lambda, lex-models, lex-runtime, lexv2-models, lexv2-runtime, license-manager, license-manager-linux-subscriptions, license-manager-user-subscriptions, lightsail, location, logs, lookoutequipment, lookoutmetrics, lookoutvision, m2, machinelearning, macie, macie2, managedblockchain, marketplace-catalog, marketplace-entitlement, marketplacecommerceanalytics, mediaconnect, mediaconvert, medialive, mediapackage, mediapackage-vod, mediastore, mediastore-data, mediatailor, memorydb, meteringmarketplace, mgh, mgn, migration-hub-refactor-spaces, migrationhub-config, migrationhuborchestrator, migrationhubstrategy, mobile, mq, mturk, mwaa, neptune, network-firewall, networkmanager, nimble, oam, omics, opensearch, opensearchserverless, opsworks, opsworkscm, organizations, outposts, panorama, personalize, personalize-events, personalize-runtime, pi, pinpoint, pinpoint-email, pinpoint-sms-voice, pinpoint-sms-voice-v2, pipes, polly, pricing, privatenetworks, proton, qldb, qldb-session, ram, rbin, rds, rds-data, redshift, redshift-data, redshift-serverless, rekognition, resiliencehub, resource-explorer-2, resource-groups, resourcegroupstaggingapi, robomaker, rolesanywhere, route53, route53-recovery-cluster, route53-recovery-control-config, route53-recovery-readiness, route53domains, route53resolver, rum, s3, s3control, s3outposts, sagemaker-a2i-runtime, sagemaker-edge, sagemaker-featurestore-runtime, sagemaker-geospatial, sagemaker-metrics, sagemaker-runtime, savingsplans, scheduler, schemas, sdb, secretsmanager, securityhub, securitylake, 
serverlessrepo, service-quotas, servicecatalog, servicecatalog-appregistry, servicediscovery, ses, sesv2, shield, signer, simspaceweaver, sms, sms-voice, snow-device-management, snowball, sns, sqs, ssm-contacts, ssm-incidents, ssm-sap, sso, sso-admin, sso-oidc, stepfunctions, storagegateway, sts, support, support-app, swf, synthetics, textract, timestream-query, timestream-write, transcribe, transfer, translate, voice-id, waf, waf-regional, wafv2, wellarchitected, wisdom, workdocs, worklink, workmail, workmailmessageflow, workspaces, workspaces-web, xray

In [None]:
!aws s3 cp ../Finetune_GPTNEO_GPTJ6B/quotes_dataset/train.csv s3://karpmar-random/gptj/train/train.csv
!aws s3 cp ../Finetune_GPTNEO_GPTJ6B/quotes_dataset/validation.csv s3://karpmar-random/gptj/val/validation.csv


In [None]:

from sagemaker.session import TrainingInput

# Describe the two CSV input channels stored in S3.
train_input = TrainingInput(
    s3_data="s3://karpmar-random/gptj/train/train.csv",
    content_type="csv",
)
validation_input = TrainingInput(
    s3_data="s3://karpmar-random/gptj/val/validation.csv",
    content_type="csv",
)

# Channel names are the keys of this mapping; the `train` script reads the
# files from /opt/ml/input/data/train and /opt/ml/input/data/validation.
data_channels = {"train": train_input, "validation": validation_input}

# Start the training job and block until it finishes.
sm_model.fit(inputs=data_channels, wait=True)
