In [None]:
import subprocess
import json
import wandb
import os
import pandas as pd
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.identity import DefaultAzureCredential
import time

# Blob storage connection settings used for storing and reading result files
account_url = '<ACCOUNT_URL>'
sas_token = '<SAS_TOKEN>'
container_name = '<CONTAINER_NAME>'

# Bind the name before trying to build the client so that a failure leaves an
# explicit None instead of an undefined name (or a stale client left over from
# an earlier run of this notebook cell).
blob_service_client = None
try:
    # Create the BlobServiceClient object using the account URL and SAS token.
    # NOTE(review): building the client presumably does not contact the service,
    # so the success message does not prove the account is reachable - confirm.
    blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
    print("Successfully connected to the Blob Service Client.")
except Exception as e:
    print(f"Failed to connect to the Blob Service Client: {e}")


def _extract_metric_rows(output_data, dataset, metric_used, filter_used):
    """Return one ``wandb.log`` payload per requested dataset found in a results document."""
    if not isinstance(output_data, dict):
        return []

    metric_key = metric_used + ',' + filter_used
    stderr_key = metric_used + '_stderr,' + filter_used

    rows = []
    for dataset_name, metrics in (output_data.get('results') or {}).items():
        if dataset_name not in dataset or metric_key not in metrics:
            continue
        row = {str(metric_used): metrics[metric_key]}
        # The stderr entry is optional: log it only when the results contain it.
        if stderr_key in metrics:
            row[str(metric_used + "_stderr")] = metrics[stderr_key]
        rows.append(row)
    return rows


def run_wandb(output_file_path, container_name, project_name, _run_name, dataset, metric_used,filter_used,sample_main_segment,options_dict):
    """Log evaluation results stored in blob storage to a Weights & Biases run.

    Starts a W&B run, downloads every ``.json`` blob whose name starts with
    ``output_file_path`` into a local directory of the same name, logs the
    requested metric (and its stderr when present) for each dataset of
    interest, then logs every ``.jsonl`` blob under the same prefix as a W&B
    table. The run is always finished before returning.

    Authentication relies on the module-level ``blob_service_client``.

    Args:
        output_file_path: Blob-name prefix of the folder that contains the
            output results and sample logs; also used as the local download
            directory.
        container_name: Name of the container that contains the output files.
        project_name: Name of the W&B project to log to.
        _run_name: Run name, used as the W&B grouping criterion.
        dataset: List of dataset/group names of interest in the results.
        metric_used: Metric used to assess the model on this dataset.
        filter_used: Name of the filter applied to the evaluation output; the
            results are looked up under the key ``"<metric_used>,<filter_used>"``.
        sample_main_segment: Label of the main input field in each sample.
        options_dict: Labels of the options/multiple-choice answers in the data.

    Returns:
        None. Listing failures are printed and end the run early.
    """
    # Initialize the W&B run within the specific project, grouped by the run name
    wandb.init(project=project_name, name='_'.join(dataset), group=_run_name)

    try:
        container_client = blob_service_client.get_container_client(container_name)

        try:
            all_blobs = [blob.name for blob in container_client.list_blobs()]
            # Debug: Print all blob names
            print("All blobs:", all_blobs)
        except Exception as e:
            print(f"Error listing blobs: {e}")
            return

        folder_path = f"{output_file_path}"

        # Keep only the blobs that live under the requested folder
        json_files = [name for name in all_blobs if name.startswith(folder_path) and name.endswith('.json')]
        jsonl_files = [name for name in all_blobs if name.startswith(folder_path) and name.endswith('.jsonl')]

        # Debug: Print filtered blob names
        print("Filtered JSON files:", json_files)
        print("Filtered JSONL files:", jsonl_files)

        for blob_name in json_files:
            download_file_path = os.path.join(output_file_path, os.path.basename(blob_name))
            local_dir = os.path.dirname(download_file_path)
            if local_dir:
                # os.makedirs('') raises, so only create a directory when there is one
                os.makedirs(local_dir, exist_ok=True)

            # Read the blob once, then both persist it and parse it
            raw_bytes = container_client.download_blob(blob_name).readall()
            with open(download_file_path, "wb") as download_file:
                download_file.write(raw_bytes)
            print(f"Downloaded {blob_name} to {download_file_path}")

            # download_blob() returns a downloader object, not parsed JSON, so
            # the payload must be decoded before the results can be inspected
            try:
                output_data = json.loads(raw_bytes)
            except ValueError as e:
                print(f"Skipping {blob_name}: content is not valid JSON ({e})")
                continue

            # Log the metric values found for the datasets of interest
            for row in _extract_metric_rows(output_data, dataset, metric_used, filter_used):
                wandb.log(row)

        # NOTE(review): the sample logs are not filtered per dataset; every
        # JSONL file under the prefix is logged once for each entry in
        # `dataset`, as in the original code - confirm whether filtering by
        # dataset name is wanted.
        table_label = _run_name + '_' + "_".join(dataset)
        for _ in dataset:
            for jsonl_file in jsonl_files:
                df = process_jsonl_file_from_blob(container_client, jsonl_file, table_label, metric_used, sample_main_segment, options_dict)
                wandb.log({
                    "table": wandb.Table(dataframe=df)
                })
    finally:
        # Close the run on every exit path so an early return or an error does
        # not leave a dangling active run behind
        wandb.finish()


def process_jsonl_file_from_blob(container_client, jsonl_file_path,dataset_name,metric_used,sample_main_segment,options_dict):
    """Build a table of per-sample results from a JSONL sample log stored in a blob.

    Downloads ``jsonl_file_path`` through ``container_client``, decodes it as
    UTF-8 and turns every non-blank line (one JSON record per line) into a row.

    Args:
        container_client: Client exposing ``get_blob_client(path)`` whose
            result supports ``download_blob().readall()``.
        jsonl_file_path: Name of the JSONL blob to read.
        dataset_name: Label written to the ``dataset`` column of every row.
        metric_used: Key of the per-sample metric value in each record.
        sample_main_segment: Key of the main input text inside ``record['doc']``.
        options_dict: Iterable of keys inside ``record['doc']`` that hold the
            options/multiple-choice answers.

    Returns:
        A ``pandas.DataFrame`` with the columns ``dataset``, ``result``,
        ``data_id``, ``question``, ``target``, ``response`` and
        ``filtered_response``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    columns = ['dataset', 'result', 'data_id', 'question', 'target', 'response', 'filtered_response']

    blob_client = container_client.get_blob_client(jsonl_file_path)
    blob_data = blob_client.download_blob().readall().decode('utf-8')

    records = []
    for line in blob_data.splitlines():
        # Blank lines are not records; json.loads('') would raise
        if not line.strip():
            continue
        record = json.loads(line)

        # 'doc' may be missing or null; treat both as an empty document
        doc = record.get('doc') or {}
        context = doc.get(sample_main_segment)
        # A missing context must not break the string concatenation below
        context_text = "" if context is None else str(context)

        raw_resps = record.get('resps')
        resps = raw_resps[0][0] if raw_resps and raw_resps[0] else None
        raw_filtered = record.get('filtered_resps')
        filtered_resps = raw_filtered[0] if raw_filtered else None

        # Keep only the options that are present and non-empty
        choices = {key: doc.get(key, "") for key in options_dict}
        choice_lines = [f"{key}: {value}" for key, value in choices.items() if value]
        context_with_choices = context_text + "\n" + "\n".join(choice_lines)

        records.append({
            'dataset': dataset_name,
            'result': record.get(metric_used),
            'data_id': record.get('doc_id'),
            'question': context_with_choices,
            'target': record.get('target'),
            'response': resps,
            'filtered_response': filtered_resps,
        })

    # Passing the columns keeps the schema stable even when the file is empty
    return pd.DataFrame(records, columns=columns)

# Arguments shared by every evaluation run; the per-model arguments are
# appended to this list in the evaluation loop below.
base_command = [
    "python", "-m", "lm-evaluation-harness/lm_eval.__main__",
    "--device", "cpu",
    "--log_samples",
    "--trust_remote_code",
    "--limit","10"
]

# Folder name used both inside the local output path and as the first part of
# the blob name when results are uploaded
output_file_base_path = '<BASE_PATH>'
# Root of the local output path passed to the harness as --output_path
output_file_full_path = "<FULL_PATH>"
# Task passed to --tasks; also used as the W&B project name
task = '<TASK_NAME>'
# Value passed to --include_path
include_path = '<TASK_PATH>'
# The label of the segment in the data that represents the input to the model, e.g. question/context/segment etc.
sample_main_segment = '<Segment_Label>'
# Labels of the options in the testing data, e.g. 1,2,3,4; A,B,C,D; etc.
# Despite the name this is a list; each entry is looked up as a key in a sample's 'doc'.
options_dict = ['<Label_0>','<label_1>','<LABEL_2>','<LABEL_3>']



# Define the models to evaluate: one dictionary per model, consumed by the
# evaluation loop and the W&B logging loop below
models = [
            {
            # Model name passed to --model, such as azure_open_chat_completions or hf or phi
            "name": "<model_name>",
            # Flag to add when using a chat endpoint, otherwise ""
            "Template": "--apply_chat_template",
            # Run name: used as the blob folder name for uploads and as the W&B group
            "run_name": "<run_name>",
            # Output directory, appended to the local output path
            "path": "<path>",
            # Model arguments for the harness (--model_args); this can include the model deployment name and URL
            "args": "model=<model>,base_url=<base_url>",
            # Extra arguments intended for the lm_eval call, if any
            "extra_args": "",
            # The list of groups/datasets to evaluate from the YAML file; each one gets its own W&B run
            "Datasets": ['<daaset_name>'],
            # Metric to use, e.g. exact match
            "metric": '<metric>',
            # Filter deployed, or none if no filter is used; results are read under the key "<metric>,<filter>"
            'filter': '<Filer>',
            # Name of the environment variable that holds the API key for the model
            'API_KEY': '<Name of API key env varibale>',
            # Sleep time of the call, exported to the subprocess as API_SLEEP; 0 if not needed
            'SLEEP': 4
        },
    ]

# Authenticate against the local W&B server with the key from the environment
wandb.login(host='http://localhost:8080',key= os.getenv('WANDB_API_KEY'))

# Run the evaluation for each model, then upload its output files to blob storage
for model in models:
    filepath = output_file_full_path + "/" + output_file_base_path + "/" + model["path"]

    command = base_command + ["--model", model["name"]]
    # "Template" may be "" for non-chat models; an empty string would be passed
    # to the subprocess as a stray empty argument, so add it only when set
    if model.get("Template"):
        command.append(model["Template"])
    command += ["--model_args", model["args"], "--include_path", include_path, "--tasks", task, "--output_path", filepath]
    # Forward the optional extra arguments (whitespace-separated) to lm_eval
    command += (model.get("extra_args") or "").split()

    env = os.environ.copy()
    api_key_var = model['API_KEY']
    api_key = os.getenv(api_key_var)
    if api_key is None:
        # str(None) would hand the literal text "None" to the harness as a key
        print(f"Warning: environment variable {api_key_var!r} is not set; API_KEY is not passed to the evaluation.")
    else:
        env['API_KEY'] = api_key
    env['API_SLEEP'] = str(model['SLEEP'])

    # Print the command to verify it
    print("Running command:", " ".join(command))

    # Run the command and capture output and errors
    result = subprocess.run(command, env=env, capture_output=True, text=True)

    # Print the output and errors for debugging
    print("Output:", result.stdout)
    print("Errors:", result.stderr)

    # Nothing to upload when the evaluation failed
    if result.returncode != 0:
        print(f"Command failed with return code {result.returncode}")
        continue

    # Blob names always use "/" as separator; os.path.join would insert
    # backslashes on Windows and produce names the prefix filter cannot match
    blob_prefix_parts = [part for part in (output_file_base_path.rstrip("/"), model['run_name']) if part]
    blob_path = "/".join(blob_prefix_parts) + "/"

    listing_file = os.path.join(filepath, model['run_name'])

    print("Blob path:", blob_path)
    print("Full path:", listing_file)

    try:
        files = os.listdir(listing_file)
    except OSError as e:
        print(f"Failed to list files in the directory {listing_file}: {e}")
        continue

    for file in files:
        file_path = os.path.join(listing_file, file)
        # Only regular files can be uploaded as blobs; skip sub-directories
        if not os.path.isfile(file_path):
            continue

        try:
            blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_path + file)
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)
            print(f"Uploaded {file} to {blob_path + file}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file
            print(f"Failed to upload {file} to {blob_path + file}: {e}")
                
# Log the uploaded results of every model/dataset pair to W&B
for model in models:
    # Blob names are relative to the container, and the upload loop stores the
    # results under output_file_base_path joined with the run name, so the
    # prefix used to find them must not start with the container name.
    filepath = output_file_base_path + "/" + model['run_name']
    for dataset in model["Datasets"]:
        # Each dataset gets its own W&B run (run_wandb calls wandb.init)
        run_wandb(filepath, container_name, task, model["run_name"], [dataset],model['metric'],model['filter'],sample_main_segment,options_dict)

# wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Successfully connected to the Blob Service Client.


[34m[1mwandb[0m: Currently logged in as: [33mnawanas[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for localhost to your netrc file: C:\Users\nawanas\_netrc


Running command: python -m lm_eval.__main__ --device cpu --log_samples --trust_remote_code --limit 10 --model azure-openai-chat-completions --apply_chat_template --model_args model=gpt-4o-uks,base_url=https://aoai-aiq-02-uk-south.openai.azure.com/openai/deployments/gpt-4ouks/chat/completions?api-version=2024-02-15-preview --include_path lm_eval/tasks/kobest --tasks kobest_hellaswag_direct --output_path C:/Users/nawanas/source/repos/lm-evaluation-harness-2/kobest_hellaswag_direct/gpt-4o
INFO 12-04 16:06:06 importing.py:10] Triton not installed; certain GPU-related functions will not be available.
azure-openai-chat-completions (model=gpt-4o-uks,base_url=https://aoai-aiq-02-uk-south.openai.azure.com/openai/deployments/gpt-4ouks/chat/completions?api-version=2024-02-15-preview,trust_remote_code=True), gen_kwargs: (None), limit: 10.0, num_fewshot: None, batch_size: 1
|         Tasks         |Version|Filter|n-shot|Total|Effective|  Metric   |   |Value|   |Stderr|
|-----------------------|----

Error listing blobs: The requested URI does not represent any resource on the server.
RequestId:dd9da16d-501e-004f-3ba9-469784000000
Time:2024-12-05T00:09:22.0771516Z
ErrorCode:InvalidUri
Content: <?xml version="1.0" encoding="utf-8"?>
<Error><Code>InvalidUri</Code><Message>The requested URI does not represent any resource on the server.
RequestId:dd9da16d-501e-004f-3ba9-469784000000
Time:2024-12-05T00:09:22.0771516Z</Message></Error>


Error listing blobs: The requested URI does not represent any resource on the server.
RequestId:dd9da3a3-501e-004f-78a9-469784000000
Time:2024-12-05T00:09:24.2479197Z
ErrorCode:InvalidUri
Content: <?xml version="1.0" encoding="utf-8"?>
<Error><Code>InvalidUri</Code><Message>The requested URI does not represent any resource on the server.
RequestId:dd9da3a3-501e-004f-78a9-469784000000
Time:2024-12-05T00:09:24.2479197Z</Message></Error>
