## Introduction to Fine-Tuning on SageMaker

In [None]:
!pip install s3fs

## 1. Preprocessing

We will use the `datasets` library to download and preprocess the IMDb dataset. After preprocessing, the dataset is uploaded to our SageMaker S3 bucket so that the training job can read it. The IMDb dataset consists of 25,000 training and 25,000 test movie reviews, all highly polar.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load every split of the IMDb dataset.
dataset = load_dataset("imdb")

# Tokenizer for the distilbert-base-uncased checkpoint, capped at 512 tokens per input.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    model_max_length=512,
)

In [None]:
# Load the train and test splits as two separate datasets.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

# Limit the test dataset to 10k samples. The shuffle is seeded so that the same
# subset is selected on every run (Restart Kernel -> Run All stays reproducible).
test_dataset = test_dataset.shuffle(seed=42).select(range(10000))

def tokenize(batch):
    """Run the notebook-level ``tokenizer`` over the ``'text'`` entries of ``batch``.

    Inputs are padded with ``padding='max_length'`` and truncated when too long.
    The tokenizer's output is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded
# Columns that are exposed as torch tensors once the format is set.
torch_columns = ['input_ids', 'attention_mask', 'labels']

# Tokenize each split in batches, then rename "label" to "labels"
# (presumably the column name the training script expects — confirm against train.py).
train_dataset = train_dataset.map(tokenize, batched=True).rename_column("label", "labels")
test_dataset = test_dataset.map(tokenize, batched=True).rename_column("label", "labels")

# set_format works in place, so both splits can be handled in one loop.
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=torch_columns)

## 2. Upload Training and Test Data to S3

In [None]:
import sagemaker

# Target S3 bucket for the preprocessed datasets.
# Set this to the name of your own bucket, or leave it as None to use the
# default bucket of the current SageMaker session.
bucket = None
if bucket is None:
    bucket = sagemaker.Session().default_bucket()

# save train_dataset to s3
training_input_path = f's3://{bucket}/imdb/train'
train_dataset.save_to_disk(training_input_path)

# save test_dataset to s3
test_input_path = f's3://{bucket}/imdb/test'
test_dataset.save_to_disk(test_input_path)

## 3. Start SageMaker Training Job

In [None]:
# `import sagemaker` is required here: importing HuggingFace from the
# sagemaker.huggingface submodule does not bind the name `sagemaker` itself.
import sagemaker
from sagemaker.huggingface import HuggingFace

# Execution role handed to the estimator for the training job.
role = sagemaker.get_execution_role()

# hyperparameters, which are passed into the training job
hyperparameters={
    'epochs': 1,
    'train_batch_size': 32,
    'model_name':'distilbert-base-uncased'
}

In [None]:
# Estimator describing the training job: code location, hardware, and framework versions.
huggingface_estimator = HuggingFace(
    # training code: train.py inside ./train
    entry_point='train.py',
    source_dir='./train',
    # a single ml.p3.2xlarge instance
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    # framework / Python versions requested for the training job
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    hyperparameters=hyperparameters,
)

In [None]:
# Start the training job, passing the uploaded train and test datasets as its inputs.
data_channels = {'train': training_input_path, 'test': test_input_path}
huggingface_estimator.fit(data_channels)

## 4. Deploying Trained Model

In [None]:
# Deploy the trained model on one ml.g4dn.xlarge instance.
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

In [None]:
# Request payload: the text to classify is stored under the "inputs" key.
sentiment_input = {
    "inputs": "I love using SageMaker for training.",
}

# As the last expression of the cell, the prediction is shown as the cell output.
predictor.predict(sentiment_input)

## 5. Cleanup

# Remove the model and the endpoint that belong to `predictor`.
# NOTE(review): the model is deleted before the endpoint; presumably delete_model()
# resolves the model through the still-existing endpoint — confirm before reordering.
# NOTE(review): these lines follow the heading without a code-cell marker; verify
# they live in a code cell so that cleanup actually runs.
predictor.delete_model()
predictor.delete_endpoint()