# LLM Fine-Tuning for Job Description Classification with SageMaker & MLflow

This notebook demonstrates fine-tuning Llama 3 for job description classification using SageMaker Pipelines and MLflow for experiment tracking.

## 1. Setup and Dependencies

In [None]:
!pip install "sagemaker>=2.200.0" "datasets>=2.18.0" "transformers>=4.38.0" "mlflow>=2.9.0" "sagemaker-mlflow>=0.1.0" --quiet
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet # Adjust cuXXX version if needed
!pip install "accelerate>=0.28.0" "bitsandbytes>=0.42.0" "scikit-learn" "pandas" "matplotlib" "seaborn" "huggingface_hub" "s3fs" --quiet # s3fs for datasets from s3

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sagemaker
import boto3
import os
import json
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep 
from sagemaker.workflow.function_step import step
from sagemaker.workflow.parameters import ParameterString, ParameterInteger, ParameterFloat
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.huggingface import HuggingFace

# Ensure these .py files are in a 'steps' subdirectory relative to the notebook,
# or adjust the path accordingly. Python sys.path might need adjustment if running locally
# and 'steps' isn't automatically discoverable.
# For SageMaker pipeline execution, the source_dir parameter for @step or ScriptProcessor handles this.
from steps.preprocess_job_descriptions import preprocess_data
from steps.finetune_llama3_classifier import finetune_model
from steps.evaluation_classifier import evaluate_model

os.environ["SAGEMAKER_USER_CONFIG_OVERRIDE"] = os.getcwd() # For custom images/config if needed

## 2. SageMaker Session and IAM Role

In [None]:
# Resolve the IAM role the pipeline will run under.
# Inside SageMaker (Studio / notebook instances) the execution role is discoverable;
# elsewhere `get_execution_role()` raises ValueError and we fall back to an IAM lookup.
try:
    role = sagemaker.get_execution_role()
    print(f"SageMaker Execution Role: {role}")
except ValueError:
    iam = boto3.client("iam")
    # Option 1: Replace with your specific role name if not using the default pattern
    # role_name = "YourSageMakerExecutionRoleName"
    # role = iam.get_role(RoleName=role_name)["Role"]["Arn"]

    # Option 2: Try to find a role with 'AmazonSageMaker-ExecutionRole' in its name (less reliable)
    print("Could not automatically get SageMaker execution role. Please ensure it's configured or specify manually.")
    print("Attempting to find a suitable role...")

    # `list_roles` is paginated; a single call only sees the first page, so an
    # account with many roles could miss the SageMaker role. Walk every page and
    # stop at the first match (the generator is consumed lazily by `next`).
    role_pages = iam.get_paginator("list_roles").paginate()
    role = next(
        (
            candidate["Arn"]
            for page in role_pages
            for candidate in page["Roles"]
            if "AmazonSageMaker-ExecutionRole" in candidate["RoleName"]
        ),
        None,
    )
    if role is None:
        raise ValueError("SageMaker execution role not found. Please create one or specify its ARN.")
    print(f"Found and using role: {role}")

# Session-derived values reused by the configuration and pipeline cells below.
sess = sagemaker.Session()
region = sess.boto_region_name
default_bucket = sess.default_bucket()
print(f"SageMaker Session region: {region}, bucket: {default_bucket}")

## 3. Configuration

In [None]:
# --- Naming -----------------------------------------------------------------
# Prefix shared by the SageMaker jobs; the pipeline name carries a V2 suffix to
# distinguish it from an earlier definition.
base_job_prefix = "job-desc-classify"
pipeline_name = "JobDescClassification-Llama3-Pipeline-V2"

# --- MLflow -----------------------------------------------------------------
# **REPLACE THE ARN BELOW WITH YOUR MLFLOW TRACKING SERVER ARN**
mlflow_experiment_name = "JobDescriptionClassification-Llama3-FineTuning"
mlflow_tracking_server_arn = (
    "arn:aws:sagemaker:your-region:your-aws-account-id:"
    "mlflow-tracking-server/your-tracking-server-name"
)  # <--- REPLACE THIS

# --- Model ------------------------------------------------------------------
model_id = "meta-llama/Meta-Llama-3-8B"

# --- Data -------------------------------------------------------------------
# S3 prefix under the session's default bucket (used by the preprocess script).
output_s3_data_prefix = "s3://" + "/".join([default_bucket, base_job_prefix, "data"])

# --- Instance types, one per pipeline step ------------------------------------
preprocess_instance_type = "ml.m5.large"
finetune_instance_type = "ml.g5.12xlarge"  # For Llama 3 8B QLoRA
evaluation_instance_type = "ml.g5.2xlarge"

# Warn early when the tracking server ARN above has not been filled in yet.
if "your-region" in mlflow_tracking_server_arn:
    print("WARNING: MLflow Tracking Server ARN is a placeholder. Please replace it with your actual ARN.")

## 4. Pipeline Parameters

In [None]:
# Data-generation knobs consumed by the preprocessing step.
param_num_samples_per_category = ParameterInteger(
    name="NumSamplesPerCategory",
    default_value=75,
)
param_languages = ParameterString(
    name="Languages",
    default_value="en,es",  # comma-separated list, split inside the preprocessing step
)

# Fine-tuning hyperparameters consumed by the fine-tuning step.
param_finetune_epochs = ParameterInteger(name="FineTuneEpochs", default_value=1)
param_lora_r = ParameterInteger(name="LoraR", default_value=8)
param_lora_alpha = ParameterInteger(name="LoraAlpha", default_value=16)
param_learning_rate = ParameterFloat(name="LearningRate", default_value=2e-4)

# Optional Hugging Face token. The default is a sentinel that the step functions
# treat as "no token supplied"; the token can also be provided through the
# HF_TOKEN environment variable instead.
# NOTE(review): a pipeline parameter value is stored with the execution, so be
# mindful of security before passing a real token this way.
param_hf_token = ParameterString(
    name="HuggingFaceToken",
    default_value="OPTIONAL_HF_TOKEN_PLACEHOLDER",
)

## 5. Define Pipeline Steps using `@step` decorator

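The `@step` decorator packages the Python functions and their dependencies so that each function runs as its own pipeline step. Ensure the `steps` directory (containing `preprocess_job_descriptions.py`, `finetune_llama3_classifier.py`, and `evaluation_classifier.py`) is in the same directory as this notebook so the imports in the setup cell resolve. Note that `@step` does not take a `source_dir` argument; whether local modules are shipped to the remote job is controlled through the SageMaker SDK configuration file (`config.yaml`, located via the `SAGEMAKER_USER_CONFIG_OVERRIDE` environment variable set in the setup cell), for example with its `IncludeLocalWorkDir` setting.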

For steps requiring specific libraries not in the default SageMaker image (especially GPU libraries like `bitsandbytes`), a custom Docker image or a pre-built SageMaker Deep Learning Container (DLC) image specified via `image_uri` in `@step` is crucial.

In [None]:
# A. Preprocessing Step
@step(
    name="PreprocessJobDescriptions",
    instance_type=preprocess_instance_type,
    keep_alive_period_in_seconds=300,
    # `preprocess_data` is imported from the local `steps` package at the top of
    # the notebook, so that package must also be importable inside the remote job.
)
def pipeline_preprocess_step(
    output_s3_bucket_name: str,
    num_samples: int,
    langs: str,
    mlflow_arn_tracking: str,
    mlflow_exp_name: str,
    pipeline_exec_id: str,
):
    """Pipeline step that runs `preprocess_data` and returns its output.

    Args:
        output_s3_bucket_name: Bucket name forwarded as `output_s3_bucket`.
        num_samples: Forwarded as `num_samples_per_category`.
        langs: Comma-separated language codes (e.g. "en,es"). Surrounding
            whitespace is stripped and empty entries are ignored.
        mlflow_arn_tracking: MLflow tracking server ARN, forwarded as `mlflow_arn`.
        mlflow_exp_name: MLflow experiment name.
        pipeline_exec_id: Pipeline execution ID, forwarded as the MLflow `run_name`.

    Returns:
        Whatever `preprocess_data` returns. The downstream steps index this
        value with the keys 'train', 'validation' and 'test'.

    Raises:
        ValueError: If `langs` contains no non-empty language code.
    """
    print(f"Executing preprocessing. Output bucket: {output_s3_bucket_name}, Samples/cat: {num_samples}, Langs: {langs}")
    print(f"MLflow ARN: {mlflow_arn_tracking}, Experiment: {mlflow_exp_name}, Run: {pipeline_exec_id}")

    # Drop blank entries so inputs such as "en,,es" or "en," do not hand an
    # empty-string language to the preprocessing code.
    stripped_codes = (part.strip() for part in langs.split(','))
    languages_list = [code for code in stripped_codes if code]
    if not languages_list:
        raise ValueError(f"No languages specified; got langs={langs!r}")

    s3_paths = preprocess_data(
        output_s3_bucket=output_s3_bucket_name,
        num_samples_per_category=num_samples,
        mlflow_arn=mlflow_arn_tracking,
        experiment_name=mlflow_exp_name,
        run_name=pipeline_exec_id,
        languages=languages_list
    )
    print(f"Preprocessing output S3 paths: {s3_paths}")
    return s3_paths

# B. Fine-tuning Step
# Resolve the Hugging Face training container used by the fine-tuning and
# evaluation steps.
# `image_uris.retrieve` expects the framework key ("huggingface"), not the ECR
# repository name, and Hugging Face images additionally need the underlying
# framework version via `base_framework_version`.
# NOTE(review): confirm that this transformers / PyTorch / Python combination is
# available in your installed SageMaker SDK and region, and pick the newest pair
# that satisfies the model's requirements.
hf_pytorch_image_uri = sagemaker.image_uris.retrieve(
    framework="huggingface",
    region=region,
    version="4.36.0",  # transformers version of the container
    base_framework_version="pytorch2.1.0",
    py_version="py310",
    instance_type=finetune_instance_type,
    image_scope="training",
)
print(f"Using HuggingFace PyTorch image for fine-tuning: {hf_pytorch_image_uri}")

@step(
    name="FineTuneLlama3Classifier",
    instance_type=finetune_instance_type,
    image_uri=hf_pytorch_image_uri,
    keep_alive_period_in_seconds=3600,  # keep the instance warm for up to 1 hour
)
def pipeline_finetune_step(
    processed_data_s3_paths: dict, 
    model_identifier: str,
    epochs_ft: int,
    lora_r_val: int,
    lora_alpha_val: int,
    lr_val: float,
    mlflow_arn_tracking: str,
    mlflow_exp_name: str,
    pipeline_exec_id: str,
    hf_auth_token: str # Parameter for HF token
):
    """Pipeline step that fine-tunes the model by delegating to `finetune_model`.

    Args:
        processed_data_s3_paths: Output of the preprocessing step; the 'train'
            and 'validation' entries are used here.
        model_identifier: Model ID forwarded as `model_id`.
        epochs_ft: Number of fine-tuning epochs.
        lora_r_val: LoRA rank, forwarded as `lora_r`.
        lora_alpha_val: LoRA alpha, forwarded as `lora_alpha`.
        lr_val: Learning rate.
        mlflow_arn_tracking: MLflow tracking server ARN.
        mlflow_exp_name: MLflow experiment name.
        pipeline_exec_id: Pipeline execution ID, forwarded as `run_id`.
        hf_auth_token: Hugging Face token, or the placeholder sentinel when no
            token was supplied through the pipeline parameter.

    Returns:
        A single-entry dict whose value is exactly what `finetune_model`
        returned. NOTE(review): the key name mentions S3, but the value is
        logged below as a local output path; confirm what `finetune_model`
        actually returns before relying on it as an S3 URI.
    """
    print(f"Executing fine-tuning. Model: {model_identifier}, Epochs: {epochs_ft}, LoraR: {lora_r_val}")
    print(f"MLflow ARN: {mlflow_arn_tracking}, Experiment: {mlflow_exp_name}, Run: {pipeline_exec_id}")

    # Token precedence: an explicit, non-placeholder parameter wins; otherwise
    # fall back to a non-empty HF_TOKEN environment variable; otherwise None.
    token_was_supplied = bool(hf_auth_token) and hf_auth_token != "OPTIONAL_HF_TOKEN_PLACEHOLDER"
    if token_was_supplied:
        resolved_token = hf_auth_token
    else:
        resolved_token = os.environ.get("HF_TOKEN") or None

    if "meta-llama" in model_identifier and not resolved_token:
        print("WARNING: Hugging Face token not provided for a gated model. Fine-tuning might fail.")

    finetune_kwargs = {
        "model_id": model_identifier,
        "train_data_s3_path": processed_data_s3_paths['train'],
        "eval_data_s3_path": processed_data_s3_paths['validation'],
        "epochs": epochs_ft,
        # Kept small for Llama3-8B on g5.12xl with QLoRA.
        "per_device_train_batch_size": 1,
        "learning_rate": lr_val,
        "lora_r": lora_r_val,
        "lora_alpha": lora_alpha_val,
        "mlflow_arn": mlflow_arn_tracking,
        "experiment_name": mlflow_exp_name,
        "run_id": pipeline_exec_id,
        "hf_token": resolved_token,
    }
    saved_model_path = finetune_model(**finetune_kwargs)
    print(f"Fine-tuning script completed. Local model output path: {saved_model_path}")

    # Wrapped in a dict so the evaluation step receives a named value.
    return {"model_s3_path_from_opt_ml_model": saved_model_path}

# C. Evaluation Step
@step(
    name="EvaluateFineTunedClassifier",
    instance_type=evaluation_instance_type,
    image_uri=hf_pytorch_image_uri, # Reuse image
    keep_alive_period_in_seconds=600,
)
def pipeline_evaluate_step(
    finetune_output: dict, # Output from finetune_step
    processed_data_s3_paths: dict, 
    mlflow_arn_tracking: str,
    mlflow_exp_name: str,
    pipeline_exec_id: str,
    hf_auth_token: str # Parameter for HF token, might be needed if model loading requires auth
):
    """Pipeline step that evaluates the fine-tuned model via `evaluate_model`.

    Args:
        finetune_output: Return value of the fine-tuning step. It is only
            logged here; passing it also makes this step depend on fine-tuning.
        processed_data_s3_paths: Output of the preprocessing step; the 'train'
            entry locates `poc_categories.json` and 'test' is the evaluation set.
        mlflow_arn_tracking: MLflow tracking server ARN.
        mlflow_exp_name: MLflow experiment name.
        pipeline_exec_id: Pipeline execution ID, used as `run_id` and to build
            the MLflow model URI.
        hf_auth_token: Hugging Face token, or the placeholder sentinel. When a
            real token is given it is exported as the HF_TOKEN environment
            variable before the model is loaded.

    Returns:
        Whatever `evaluate_model` returns.
    """
    # Local import: S3 URIs always use "/" separators, so compose them with
    # posixpath rather than the host-dependent os.path.
    import posixpath

    print(f"Executing evaluation. Finetune output: {finetune_output}")
    print(f"MLflow ARN: {mlflow_arn_tracking}, Experiment: {mlflow_exp_name}, Run: {pipeline_exec_id}")

    # Previously the token parameter was accepted but never used. Export it so
    # Hugging Face Hub downloads during model loading can authenticate. An
    # explicit parameter takes precedence over a pre-existing HF_TOKEN, matching
    # the precedence used by the fine-tuning step.
    # NOTE(review): assumes `evaluate_model` loads through huggingface_hub, which
    # reads HF_TOKEN from the environment; confirm against the steps module.
    if hf_auth_token and hf_auth_token != "OPTIONAL_HF_TOKEN_PLACEHOLDER":
        os.environ["HF_TOKEN"] = hf_auth_token

    # Construct MLflow model URI. Assumes finetune_model logged it as 'fine_tuned_classifier_model'.
    # NOTE(review): the `runs:/<run_id>/...` scheme expects an MLflow run ID; verify
    # that `finetune_model` really creates/uses a run whose ID equals the pipeline
    # execution ID, otherwise this URI will not resolve.
    mlflow_model_uri_to_load = f"runs:/{pipeline_exec_id}/fine_tuned_classifier_model"
    print(f"Attempting to load model for evaluation from MLflow URI: {mlflow_model_uri_to_load}")

    # Assumes preprocess_data wrote 'poc_categories.json' under the same S3
    # prefix as the training dataset; verify against the preprocessing code.
    base_s3_dir_for_data = posixpath.dirname(processed_data_s3_paths['train'])
    poc_categories_s3 = posixpath.join(base_s3_dir_for_data, "poc_categories.json")
    print(f"Path to poc_categories.json for evaluation: {poc_categories_s3}")

    eval_results = evaluate_model(
        model_s3_path_or_mlflow_uri=mlflow_model_uri_to_load, 
        test_data_s3_path=processed_data_s3_paths['test'],
        poc_categories_s3_path=poc_categories_s3,
        mlflow_arn=mlflow_arn_tracking,
        experiment_name=mlflow_exp_name,
        run_id=pipeline_exec_id 
    )
    print(f"Evaluation results: {eval_results}")
    return eval_results

## 6. Construct the Pipeline

In [None]:
# Keyword arguments that every step receives unchanged: the MLflow target and
# the ID of the running pipeline execution (resolved by SageMaker at run time).
shared_step_kwargs = {
    "mlflow_arn_tracking": mlflow_tracking_server_arn,
    "mlflow_exp_name": mlflow_experiment_name,
    "pipeline_exec_id": ExecutionVariables.PIPELINE_EXECUTION_ID,
}

# Step 1: generate and upload the datasets.
preprocess_step_outputs = pipeline_preprocess_step(
    output_s3_bucket_name=default_bucket,
    num_samples=param_num_samples_per_category,
    langs=param_languages,
    **shared_step_kwargs,
)

# Step 2: fine-tune on the preprocessed data. Passing the previous step's
# output is what chains the steps together.
finetune_step_outputs = pipeline_finetune_step(
    processed_data_s3_paths=preprocess_step_outputs,
    model_identifier=model_id,
    epochs_ft=param_finetune_epochs,
    lora_r_val=param_lora_r,
    lora_alpha_val=param_lora_alpha,
    lr_val=param_learning_rate,
    hf_auth_token=param_hf_token,
    **shared_step_kwargs,
)

# Step 3: evaluate; consumes the outputs of both earlier steps.
evaluate_step_outputs = pipeline_evaluate_step(
    finetune_output=finetune_step_outputs,
    processed_data_s3_paths=preprocess_step_outputs,
    hf_auth_token=param_hf_token,
    **shared_step_kwargs,
)

pipeline_parameters = [
    param_num_samples_per_category,
    param_languages,
    param_finetune_epochs,
    param_lora_r,
    param_lora_alpha,
    param_learning_rate,
    param_hf_token,
]

# Only the final step is listed because the earlier steps are chained to it
# through their outputs.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=[evaluate_step_outputs],
    sagemaker_session=sess,
)

## 7. Upsert and Execute Pipeline

**Important:** Before executing, ensure your MLflow Tracking Server ARN is correctly set. If it is still the placeholder, the cell below prints an error and skips both the upsert and the execution, because MLflow logging would otherwise fail or target a non-existent server.

In [None]:
# Guard: refuse to create or start the pipeline while the MLflow ARN is still
# the placeholder from the configuration section.
if "your-region" in mlflow_tracking_server_arn:
    print("ERROR: MLflow Tracking Server ARN is still a placeholder. Pipeline execution will likely fail at MLflow logging steps.")
    print("Please update the 'mlflow_tracking_server_arn' variable in the Configuration cell (section 3) before proceeding.")
else:
    print("Upserting the pipeline...")
    pipeline.upsert(role_arn=role)
    print(f"Pipeline '{pipeline_name}' upserted.")

    print("Starting pipeline execution with default parameters...")
    # You can override default parameter values here if needed:
    # execution = pipeline.start(
    #     parameters={
    #         "NumSamplesPerCategory": 100, 
    #         "FineTuneEpochs": 2,
    #         "Languages": "en",
    #         "HuggingFaceToken": "your_actual_hf_token_if_needed_and_not_using_env_vars_or_secrets"
    #     }
    # )
    execution = pipeline.start()

    print(f"Pipeline execution started with ARN: {execution.arn}")
    # A bare `execution.describe()` inside this block displays nothing (only a
    # cell's last top-level expression is rendered), so print the status instead.
    execution_status = execution.describe().get("PipelineExecutionStatus")
    print(f"Current pipeline execution status: {execution_status}")

You can monitor the pipeline execution in the SageMaker console.

To run with different parameters (e.g., for a second experiment):
```python
# execution2 = pipeline.start(
#     parameters={
#         "NumSamplesPerCategory": 150,
#         "FineTuneEpochs": 1,
#         "LoraR": 16,
#         "LoraAlpha": 32,
#         "Languages": "en,fr",
#         # "HuggingFaceToken": "your_hf_token_if_needed_for_this_run"
#     }
# )
# print(f"Second pipeline execution started with ARN: {execution2.arn}")
```

## 8. Clean up (Optional)

In [None]:
# This cell is intentionally inert: everything below is a commented-out
# template. Uncomment the part you need.

# To delete the pipeline definition from SageMaker after you're done:
# try:
#     pipeline.delete()
#     print(f"Pipeline '{pipeline_name}' deleted.")
# except Exception as e:
#     print(f"Error deleting pipeline: {e}")

# To stop a pipeline execution that is still running, you can use boto3:
# sm_client = boto3.client("sagemaker")
# if 'execution' in locals() and execution:
#     try:
#         sm_client.stop_pipeline_execution(PipelineExecutionArn=execution.arn)
#         print(f"Pipeline execution {execution.arn} stopped (if running).")
#     except Exception as e:
#         print(f"Error managing pipeline execution {execution.arn}: {e}")
#
# NOTE: `sm_client.delete_pipeline` takes `PipelineName` (the pipeline's name,
# not anything derived from an execution ARN) and deletes the pipeline
# definition itself, so do not use it to clean up a single execution.
# Deletion of execution records is usually managed by SageMaker or done via
# UI/CLI for specific executions.
pass # Placeholder to avoid empty cell issues