# LLM Fine-Tuning for Job Description Classification with SageMaker & MLflow

This notebook orchestrates a SageMaker Pipeline to fine-tune Llama 3 for job description classification. 
It uses pre-generated raw data from S3 (created by `scripts/python/generate_and_upload_raw_data.py`), 
a dedicated training script (`scripts/python/finetune_entrypoint.py`), and MLflow for experiment tracking.

## 1. Setup and Dependencies

**Important:** If you encounter `OSError: [Errno 2] No such file or directory: '/opt/conda/lib/python3.10/site-packages/fsspec-XYZ.dist-info/METADATA'`, run the following in a new cell first, then **RESTART THE KERNEL** and run this cell again:
```python
# !pip install --ignore-installed --no-deps --no-cache-dir fsspec==2023.6.0 # Or the problematic version
```

In [None]:
!pip install --ignore-installed --no-deps --no-cache-dir fsspec==2023.6.0

In [1]:
!pip install sagemaker  datasets transformers mlflow sagemaker-mlflow --quiet

In [None]:
# !pip install "sagemaker>=2.200.0" "datasets>=2.18.0" "transformers>=4.38.0,<4.41.0" "mlflow>=2.9.0" "sagemaker-mlflow>=0.1.0" --quiet
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet # Ensure cuXXX matches your training instance GPU CUDA version
# !pip install "accelerate>=0.28.0" "bitsandbytes>=0.41.0,<0.43.0" "scikit-learn>=1.0.0" "pandas" "matplotlib" "seaborn" "huggingface_hub" "s3fs" "peft>=0.9.0,<0.12.0" --quiet # Pin peft for transformers 4.36-4.40 compatibility

In [4]:
%load_ext autoreload
%autoreload 2

import sagemaker
import boto3
import os
import json
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.function_step import step
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat
from sagemaker.workflow.execution_variables import ExecutionVariables

# --- Add project root to sys.path for local module imports ---
import sys
notebook_dir = os.getcwd() 
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
if project_root not in sys.path:
    print(f"Adding project root to sys.path: {project_root}")
    sys.path.insert(0, project_root)
else:
    print(f"Project root already in sys.path: {project_root}")
# --- End Path Setup ---

# Import functions from your scripts
from steps.preprocess_job_descriptions import preprocess_data
from steps.finetune_llama3_classifier import launch_hf_training_job # This is your LAUNCHER script's main function
from steps.evaluation_classifier import evaluate_model

os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Project root already in sys.path: /home/sagemaker-user


2025-05-29 21:57:35.573356: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. SageMaker Session and IAM Role

In [7]:
import os
print(os.environ.get("AWS_ROLE_ARN"))

import boto3
try:
    client = boto3.client('sts')
    identity = client.get_caller_identity()
    print(identity)
except Exception as e:
    print(f"Error getting caller identity: {e}")

None
{'UserId': 'AROASRK2CX7WPM2ML6UZA:SageMaker', 'Account': '174671970284', 'Arn': 'arn:aws:sts::174671970284:assumed-role/AmazonSageMaker-ExecutionRole-20240216T153805/SageMaker', 'ResponseMetadata': {'RequestId': 'deb030f4-3e00-42c0-90c7-3bc16f70f630', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'deb030f4-3e00-42c0-90c7-3bc16f70f630', 'content-type': 'text/xml', 'content-length': '470', 'date': 'Thu, 29 May 2025 22:00:48 GMT'}, 'RetryAttempts': 0}}


In [8]:
# try:
#     role = sagemaker.get_execution_role()
#     print(f"SageMaker Execution Role: {role}")
# except ValueError:
#     iam = boto3.client("iam")
#     print("Could not automatically get SageMaker execution role. Please ensure it's configured or specify manually.")
#     raise ValueError("SageMaker execution role not found.")

role = "arn:aws:iam::174671970284:role/service-role/AmazonSageMaker-ExecutionRole-20240216T153805"
sess = sagemaker.Session()
region = sess.boto_region_name
default_bucket = sess.default_bucket()
print(f"SageMaker Session region: {region}, bucket: {default_bucket}")

ValueError: Unable to load the config file from the location: /home/sagemaker-user/job-classification-sagemaker-mlflowProvide a valid file path

## 3. Configuration

In [None]:
pipeline_name = "JobDescClassification-Llama3-Pipeline-V5" 
base_job_prefix = "job-desc-classify" 

mlflow_tracking_server_arn = "arn:aws:sagemaker:your-region:your-aws-account-id:mlflow-tracking-server/your-tracking-server-name" # <--- REPLACE THIS
mlflow_experiment_name = "JobDescriptionClassification-Llama3-FineTuning"

model_id_default = "meta-llama/Meta-Llama-3-8B" 

processed_data_s3_prefix = f"{base_job_prefix}/processed_data/v3" 

default_raw_data_s3_uri = f"s3://{default_bucket}/raw_job_data/poc_multilingual_01/raw_jds_translated.jsonl" 

preprocess_instance_type = "ml.m5.large"
finetune_launcher_instance_type = "ml.m5.large" 
default_training_instance_type = "ml.g5.12xlarge" 
evaluation_instance_type = "ml.g5.2xlarge" 

default_hf_training_image_uri = f'763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04'

if "your-region" in mlflow_tracking_server_arn:
    print("ERROR: MLflow Tracking Server ARN is a placeholder. Please replace it.")
if "raw_job_data/poc_multilingual_01/raw_jds_translated.jsonl" in default_raw_data_s3_uri:
    # This is a very basic check and might not be accurate for all S3 path styles
    print(f"WARNING: Default RawDatasetS3URI is set to '{default_raw_data_s3_uri}'. Ensure this S3 URI points to your generated raw dataset or override this parameter when starting the pipeline.")

## 4. Pipeline Parameters

In [None]:
param_raw_data_s3_uri = ParameterString(name="RawDatasetS3URI", default_value=default_raw_data_s3_uri)
param_job_desc_column = ParameterString(name="JobDescriptionColumn", default_value="job_description_text")
param_category_column = ParameterString(name="CategoryColumn", default_value="category_label")
param_test_split_fraction = ParameterFloat(name="TestSplitFraction", default_value=0.15)
param_validation_split_fraction = ParameterFloat(name="ValidationSplitFraction", default_value=0.15)
param_max_samples_per_split = ParameterInteger(name="MaxSamplesPerSplit", default_value=-1)

param_model_id = ParameterString(name="ModelIdentifier", default_value=model_id_default)
param_training_instance_type = ParameterString(name="TrainingInstanceType", default_value=default_training_instance_type)
param_training_instance_count = ParameterInteger(name="TrainingInstanceCount", default_value=1)
param_hf_training_image_uri = ParameterString(name="HFTrainingImageURI", default_value=default_hf_training_image_uri)

param_finetune_epochs = ParameterInteger(name="FineTuneEpochs", default_value=1)
param_per_device_train_batch_size = ParameterInteger(name="PerDeviceTrainBatchSize", default_value=1)
param_learning_rate = ParameterFloat(name="LearningRate", default_value=0.0002)
param_lora_r = ParameterInteger(name="LoraR", default_value=8)
param_lora_alpha = ParameterInteger(name="LoraAlpha", default_value=16)
param_lora_dropout = ParameterFloat(name="LoraDropout", default_value=0.05)
param_lora_target_modules = ParameterString(name="LoraTargetModules", default_value="q_proj,v_proj,k_proj,o_proj")
param_merge_weights = ParameterString(name="MergeWeights", default_value="True")
param_hf_token = ParameterString(name="HuggingFaceToken", default_value="OPTIONAL_HF_TOKEN_PLACEHOLDER")

param_eval_batch_size = ParameterInteger(name="EvaluationBatchSize", default_value=4)

## 5. Define Pipeline Steps using `@step` decorator

In [None]:
# A. Preprocessing Step
@step(
    name="PreprocessJobData",
    instance_type=preprocess_instance_type,
    keep_alive_period_in_seconds=300
)
def sm_pipeline_preprocess_data_step(
    raw_s3_uri: str,
    s3_bucket: str,
    s3_prefix: str,
    jd_column: str,
    cat_column: str,
    test_frac_val: float,
    val_frac_val: float,
    max_samples_val: int,
    mlflow_arn_val: str,
    mlflow_exp_val: str,
    exec_id: str,
):
    actual_max_samples = None if max_samples_val < 0 else max_samples_val
    return preprocess_data(
        raw_dataset_identifier=raw_s3_uri,
        s3_output_bucket=s3_bucket,
        s3_output_prefix=s3_prefix,
        job_desc_column=jd_column,
        category_column=cat_column,
        test_split_fraction=test_frac_val,
        validation_from_train_fraction=val_frac_val,
        max_samples_per_split=actual_max_samples,
        mlflow_arn=mlflow_arn_val,
        experiment_name=mlflow_exp_val,
        run_name=exec_id,
    )

# B. Fine-tuning Launcher Step
@step(
    name="LaunchHFFineTuning",
    instance_type=finetune_launcher_instance_type, 
    keep_alive_period_in_seconds=300 
)
def sm_pipeline_finetune_launcher_step(
    processed_data_info_dict: dict,
    sagemaker_iam_role: str,
    train_instance_type_str: str,
    train_instance_count_int: int,
    hf_image_uri_str: str,
    model_id_str: str,
    epochs_int: int,
    batch_size_int: int,
    lr_float: float,
    lora_r_int: int,
    lora_alpha_int: int,
    lora_dropout_float: float,
    lora_targets_str: str,
    merge_weights_str: str,
    hf_token_str: str,
    mlflow_arn_str: str,
    mlflow_exp_str: str,
    pipeline_exec_id_str: str,
):
    merge_weights_bool = merge_weights_str.lower() == 'true'
    actual_hf_token = hf_token_str if hf_token_str and hf_token_str != "OPTIONAL_HF_TOKEN_PLACEHOLDER" else None
    
    # launch_hf_training_job is from steps.finetune_llama3_classifier (your launcher script)
    # Ensure this launcher script internally sets source_dir="scripts" and entry_point="python/finetune_entrypoint.py"
    return launch_hf_training_job(
        role=sagemaker_iam_role,
        image_uri=hf_image_uri_str,
        instance_type=train_instance_type_str,
        instance_count=train_instance_count_int,
        train_s3_uri=processed_data_info_dict['train'],
        validation_s3_uri=processed_data_info_dict['validation'],
        # The following two are now expected to be hardcoded/managed within your launcher script:
        entry_point_script="python/finetune_entrypoint.py", # This should match what your launcher expects or uses internally
        source_directory="scripts", # This should match what your launcher expects or uses internally
        model_id_hf=model_id_str,
        epochs_val=epochs_int,
        per_device_train_batch_size_val=batch_size_int,
        learning_rate_val=lr_float,
        lora_r_val=lora_r_int,
        lora_alpha_val=lora_alpha_int,
        lora_dropout_val=lora_dropout_float,
        lora_target_modules_val=lora_targets_str,
        merge_weights_val=merge_weights_bool,
        hf_token_val=actual_hf_token,
        mlflow_tracking_arn=mlflow_arn_str,
        mlflow_experiment=mlflow_exp_str,
        pipeline_run_id=pipeline_exec_id_str,
        base_job_name_prefix=f"job-clf-{model_id_str.split('/')[-1].replace('_','-')}"
    )

# C. Evaluation Step
@step(
    name="EvaluateClassifier",
    instance_type=evaluation_instance_type,
    image_uri=default_hf_training_image_uri, 
    keep_alive_period_in_seconds=600
)
def sm_pipeline_evaluate_model_step(
    finetune_launcher_output_dict: dict, 
    processed_data_info_dict: dict, 
    eval_batch_size_int: int, 
    mlflow_arn_str: str,
    mlflow_exp_str: str,
    pipeline_exec_id_str: str 
):
    mlflow_model_uri = f"runs:/{pipeline_exec_id_str}/fine_tuned_classifier_model" 
    
    return evaluate_model(
        model_s3_path_or_mlflow_uri=mlflow_model_uri, 
        test_data_s3_path=processed_data_info_dict['test'],
        poc_categories_s3_path=processed_data_info_dict['categories_s3_path'],
        batch_size=eval_batch_size_int,
        mlflow_arn=mlflow_arn_str,
        experiment_name=mlflow_exp_str,
        run_id=pipeline_exec_id_str 
    )

## 6. Construct the Pipeline

In [None]:
preprocess_step_output_data = sm_pipeline_preprocess_data_step(
    raw_s3_uri=param_raw_data_s3_uri,
    s3_bucket=default_bucket, 
    s3_prefix=processed_data_s3_prefix,
    jd_column=param_job_desc_column,
    cat_column=param_category_column,
    test_frac_val=param_test_split_fraction,
    val_frac_val=param_validation_split_fraction,
    max_samples_val=param_max_samples_per_split, 
    mlflow_arn_val=mlflow_tracking_server_arn, 
    mlflow_exp_val=mlflow_experiment_name, 
    exec_id=ExecutionVariables.PIPELINE_EXECUTION_ID
)

finetune_step_output_data = sm_pipeline_finetune_launcher_step(
    processed_data_info_dict=preprocess_step_output_data, 
    sagemaker_iam_role=role, 
    train_instance_type_str=param_training_instance_type,
    train_instance_count_int=param_training_instance_count,
    hf_image_uri_str=param_hf_training_image_uri,
    model_id_str=param_model_id, 
    epochs_int=param_finetune_epochs,
    batch_size_int=param_per_device_train_batch_size,
    lr_float=param_learning_rate,
    lora_r_int=param_lora_r,
    lora_alpha_int=param_lora_alpha,
    lora_dropout_float=param_lora_dropout,
    lora_targets_str=param_lora_target_modules,
    merge_weights_str=param_merge_weights,
    hf_token_str=param_hf_token,
    mlflow_arn_str=mlflow_tracking_server_arn,
    mlflow_exp_str=mlflow_experiment_name,
    pipeline_exec_id_str=ExecutionVariables.PIPELINE_EXECUTION_ID
)

evaluate_step_output_data = sm_pipeline_evaluate_model_step(
    finetune_launcher_output_dict=finetune_step_output_data, 
    processed_data_info_dict=preprocess_step_output_data, 
    eval_batch_size_int=param_eval_batch_size,
    mlflow_arn_str=mlflow_tracking_server_arn,
    mlflow_exp_str=mlflow_experiment_name,
    pipeline_exec_id_str=ExecutionVariables.PIPELINE_EXECUTION_ID
)

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        param_raw_data_s3_uri,
        param_job_desc_column,
        param_category_column,
        param_test_split_fraction,
        param_validation_split_fraction,
        param_max_samples_per_split,
        param_model_id,
        param_training_instance_type,
        param_training_instance_count,
        param_hf_training_image_uri,
        param_finetune_epochs,
        param_per_device_train_batch_size,
        param_learning_rate,
        param_lora_r,
        param_lora_alpha,
        param_lora_dropout,
        param_lora_target_modules,
        param_merge_weights,
        param_hf_token,
        param_eval_batch_size
    ],
    steps=[evaluate_step_output_data],
    sagemaker_session=sess
)

## 7. Upsert and Execute Pipeline

In [None]:
if "your-region" in mlflow_tracking_server_arn:
    print("ERROR: MLflow Tracking Server ARN is a placeholder. Update it in cell [3].")
else:
    print("\nUpserting the pipeline...")
    try:
        pipeline.upsert(role_arn=role)
        print(f"Pipeline '{pipeline_name}' upserted successfully.")

        print("\nStarting pipeline execution...")
        execution = pipeline.start(
            parameters={}
        )
        print(f"Pipeline execution started with ARN: {execution.arn}")
        execution.describe()
    except Exception as e:
        print(f"An error occurred during pipeline upsert or start: {e}")

## 8. Clean up (Optional)

In [None]:
# To delete the pipeline definition from SageMaker:
# try:
#     pipeline.delete()
#     print(f"Pipeline '{pipeline_name}' deleted.")
# except Exception as e:
#     print(f"Error deleting pipeline '{pipeline_name}': {e}")