In [1]:
#compare_models.ipynb
#
#by Joe Hahn
#joe.hahn@oracle.com
#2025 March 17
#
#compare outputs generated by tuned and untuned LLMs

In [2]:
#get start time
#clock_start is the wall-clock time (seconds, from time.time()) at which the notebook began;
#the tm alias is reused further down to time the LLM calls
import time as tm
clock_start = tm.time()

In [3]:
#import usual libraries
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams
#import seaborn as sns
color_seq = plt.rcParams['axes.prop_cycle'].by_key()['color']
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
np.set_printoptions(threshold=200)
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [4]:
#check version numbers
#NOTE: the blanket 'ignore' warnings filter is installed before oci is imported,
#so it is already active for the import below and for every later cell
import warnings
warnings.filterwarnings('ignore')
import logging
#root logger reports ERROR and above, formatted as LEVEL:message
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)
import oci
print('oci.__version__ = ', oci.__version__)
#echo two shell environment variables; an unset variable is echoed as empty
!echo CONDA_DEFAULT_ENV=$CONDA_DEFAULT_ENV 
!echo BUILD_DATE=$BUILD_DATE 

oci.__version__ =  2.158.0
CONDA_DEFAULT_ENV=/home/datascience/conda/generalml_p311_cpu_x86_64_v1
BUILD_DATE=


In [5]:
#seed numpy's global random number generator so that random draws are repeatable
random_state = 12
np.random.seed(random_state)

In [6]:
#set max_colwidth for pandas so that long prompts and completions are not truncated when displayed
#use the full option name, like the other display.* options set earlier; the short form
#'max_colwidth' only works through pandas' partial-name matching and would break if
#another option containing 'max_colwidth' were ever added
pd.set_option('display.max_colwidth', 1000)

In [7]:
#load the test sample from a JSON-lines file (lines=True reads one JSON object per line)
file = 'data/test.jsonl'
df_read = pd.read_json(file, lines=True)
#df is the working name; it refers to the same dataframe as df_read
df = df_read
print ('df.shape = ', df.shape)
df_read.head()

df.shape =  (777, 2)


Unnamed: 0,prompt,completion
0,Provide a succinct answer to the following question: How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express parallelism and generates source code or binaries optimized for multicore CPUs and GPUs.
1,Provide a succinct answer to the following question: What is the new feature in CUDA 5.5 version of NVIDIA CUFFT library?,The new feature in CUDA 5.5 version of NVIDIA CUFFT library is the support for the popular FFTW API for FFT acceleration.
2,Provide a succinct answer to the following question: How does EDDY contribute to precision medicine?,EDDY informs doctors with the best options for attacking each individual patient's cancer by analyzing how cells' DNA controls protein production and interactions.
3,Provide a succinct answer to the following question: What requirements does Fraudoscope have similar to traditional polygraph tests?,"Like traditional polygraph tests, Fraudoscope requires a set of calibration questions with well-known answers to detect lies."
4,Provide a succinct answer to the following question: What is the role of the racecheck tool in debugging CUDA applications?,"The racecheck tool in CUDA is used to detect and fix race conditions, which can occur when multiple threads access shared resources simultaneously."


In [8]:
#endpoints for untuned and tuned models
#both URLs are the /predict route of a model deployment in eu-frankfurt-1;
#they differ only in the deployment OCID embedded in the path
endpoint_untuned = "https://modeldeployment.eu-frankfurt-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaawe6j4fqawr3lnipvkecvoorvn656xb4dzw737x66w4kzyhazsotq/predict"
endpoint_tuned = "https://modeldeployment.eu-frankfurt-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaawe6j4fqahspxixfinhnz3wf7s6xwyo3qa7oo37wvyk45ersrukia/predict"

In [9]:
#illustrate call to untuned model's endpoint per
#https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/ai-quick-actions/model-deployment-tips.md#using-langchain-for-completion-endpoint
import ads
from langchain_community.llms import OCIModelDeploymentLLM

#authenticate via resource principal
ads.set_auth("resource_principal")

#model name and generation settings; the tuned-model cell below reuses these
model = "odsc-llm"
model_kwargs = dict(
    max_tokens=500,
    temperature=0.7,
    top_k=50,
    top_p=0.99,
    frequency_penalty=0,
    presence_penalty=0,
)
streaming = True

#example question sent to the model
prompt = """
Provide a succinct answer to the following question: 
How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?
"""

#build the client for the untuned deployment and query it
endpoint = endpoint_untuned
llm_untuned = OCIModelDeploymentLLM(
    endpoint=endpoint,
    model=model,
    model_kwargs=model_kwargs,
    streaming=streaming,
)
llm = llm_untuned
response = llm.invoke(prompt)
print ('prompt = ', prompt)
print ('response = ', response)

prompt =  
Provide a succinct answer to the following question: 
How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?

response =  
RAPIDS cuDF, a GPU-accelerated dataframe library, integrated GPU hash maps to provide fast and efficient key-value mapping operations on large datasets. This integration leverages NVIDIA's CUDA technology to perform hash map operations on the GPU, which can significantly reduce memory usage and execution time compared to CPU-based implementations.

The benefits of integrating GPU hash maps in RAPIDS cuDF include:

1. Reduced memory usage: By performing hash map operations on the GPU, RAPIDS cuDF can reduce the memory footprint of key-value mapping operations, as the GPU has a much larger memory capacity compared to the CPU.

2. Improved performance: GPU hash maps can process large amounts of data much faster than CPU hash maps, as they can perform parallel operations on the GPU's many cores. This results in significant speedups for 

In [10]:
#call tuned model, using the same model name, generation settings, and prompt as the untuned call
endpoint = endpoint_tuned
llm_tuned = OCIModelDeploymentLLM(
    endpoint=endpoint,
    model=model,
    model_kwargs=model_kwargs,
    streaming=streaming,
)
llm = llm_tuned
response = llm.invoke(prompt)
print ('prompt = ', prompt)
print ('response = ', response)

prompt =  
Provide a succinct answer to the following question: 
How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?

response =  ## Answer
RAPIDS cuDF integrated GPU hash maps, enabling parallel hash table updates on the GPU. This integration allowed users to perform complex operations on hash tables, offering performance benefits and facilitating efficient data processing.


In [11]:
#for N_records random records in test sample, call tuned and untuned LLM
N_records = 5
#pass the seed explicitly: without it, sample() draws from numpy's global generator, so
#re-running this cell would pick different records than the ones scored below
df = df_read.sample(n=N_records, random_state=random_state)
t_start = tm.time()
#NOTE(review): headers is not used in this cell; kept in case a later cell reads it
headers = {"route":"/v1/chat/completions"}
#collect responses in lists and attach them as columns once, after the loop,
#rather than writing into the dataframe while iterating over it
completions_tuned = []
completions_untuned = []
for idx, row in df.iterrows():
    prompt = row.prompt
    #get tuned model's prediction
    llm = llm_tuned
    response = llm.invoke(prompt)
    completions_tuned.append(response)
    #get untuned model's prediction
    llm = llm_untuned
    response = llm.invoke(prompt)
    completions_untuned.append(response)
    print ('idx=' + str(idx) + '\t' + 'prompt=' + prompt)
df_predict = df.assign(completion_tuned=completions_tuned, completion_untuned=completions_untuned)
#keep df and df_predict referring to the same dataframe
df = df_predict
t_finish = tm.time()
dt_seconds = t_finish - t_start
#two LLM calls (tuned + untuned) are made per record
mean_time_to_predict = dt_seconds/(2*N_records)
print ('mean_time_to_predict = ', mean_time_to_predict)

idx=167	prompt=Provide a succinct answer to the following question: What enhancements were made to the Uncoalesced Memory Access rules in Nsight Compute?
idx=10	prompt=Provide a succinct answer to the following question: What is the role of Thrust in GPU programming?
idx=320	prompt=Provide a succinct answer to the following question: What is the main advantage of using NVIDIA Warp?
idx=351	prompt=Provide a succinct answer to the following question: What are the key features of CUDA 9 libraries?
idx=504	prompt=Provide a succinct answer to the following question: How do the DGL containers help developers avoid using homegrown software?
mean_time_to_predict =  3.538807511329651


In [12]:
#show each sampled prompt with its test-set completion and the tuned and untuned models' completions
df_predict.head(n=5)

Unnamed: 0,prompt,completion,completion_tuned,completion_untuned
167,Provide a succinct answer to the following question: What enhancements were made to the Uncoalesced Memory Access rules in Nsight Compute?,"When the Uncoalesced Memory Access rules are triggered in Nsight Compute, they show a table of the five most valuable instances, making it easier to inspect and resolve them on the Source page.",< The Uncoalesced Memory Access rules in Nsight Compute were enhanced to provide insights into memory coalescing. The rules now highlight uncoalesced access patterns and suggest potential optimizations.,"\n\nIn Nsight Compute, the Uncoalesced Memory Access rules were enhanced to include a new option for ""Uncoalesced Access in Kernel"". This option allows developers to explicitly enable uncoalesced memory accesses in their kernels, which can potentially improve performance in certain cases. This is particularly useful for accessing large, contiguous data structures that don't fit into the local memory of a GPU device. The enhanced rules also provide better diagnostics and guidance for resolving uncoalesced memory access issues, making it easier to optimize code for efficient GPU utilization."
10,Provide a succinct answer to the following question: What is the role of Thrust in GPU programming?,"Thrust is a parallel algorithms library inspired by the C++ Standard Template Library. Its primary role is to provide a set of building blocks for parallel computing tasks, such as sorting, scans, transforms, and reductions. Thrust supports multiple system back-ends including NVIDIA GPUs, OpenMP, and Intel's Threading Building Blocks, enabling developers to harness parallel processing power.","� Thrust is a C++ library for GPU programming, providing a set of high-level, template-based algorithms for accelerating computations on GPUs. It simplifies GPU programming by abstracting device-side operations and providing a C++-like programming model.","\n\nThrust is a C++ library for accelerating numerical computations on modern GPUs (Graphics Processing Units) and CPUs (Central Processing Units). It is designed to be easy to use, powerful, and flexible. In GPU programming, Thrust plays a key role in managing and executing parallel operations, such as data sorting, reduction, and transformation, by taking advantage of the massive parallelism available in a GPU. It abstracts the complexity of CUDA, NVIDIA's parallel computing platform, making it possible to write efficient and efficient GPU code without the need to manually handle low-level details like memory management and kernel launches. By using Thrust, programmers can focus on the algorithmic aspects of their problems instead of the intricacies of GPU programming."
320,Provide a succinct answer to the following question: What is the main advantage of using NVIDIA Warp?,"NVIDIA Warp provides a way to write high-performance simulation code using Python, an interpreted language, thus combining performance with productivity.",NVIDIA Warp allows developers to create applications that adapt to varying hardware configurations and provide optimal performance.,"\n\nThe main advantage of using NVIDIA Warp is that it allows for efficient and seamless synchronization of multiple graphics processing operations (pixels or threads) across multiple GPU processors, improving overall performance and reducing latency in complex, parallel computations, particularly in graphics rendering and machine learning applications."
351,Provide a succinct answer to the following question: What are the key features of CUDA 9 libraries?,"CUDA 9 libraries include optimizations for Volta architecture, performance improvements in cuBLAS, redesigned NPP for image and signal processing, improved cuFFT, and new algorithms in nvGRAPH.","6 CUDA 9 libraries offer new features such as the CUDA Graph Analytics SDK, enhancements to cuDNN, and new capabilities for cuBLAS. These libraries provide developers with powerful tools for optimizing algorithms and achieving higher performance.","\n\nCUDA 9 libraries, primarily designed for GPU acceleration, include the following key features:\n\n1. CUDA Runtime API: This is the core library that manages GPU resources, handles memory allocation, synchronization, and launching kernels.\n\n2. cuBLAS: A GPU-accelerated implementation of basic linear algebra subprograms (BLAS), enabling efficient matrix operations.\n\n3. cuFFT: A library for fast Fourier Transform (FFT) operations on the GPU, optimized for a variety of use cases, including signal processing, physics simulations, and more.\n\n4. cuSOLVER: A library that provides optimized linear algebra routines for solving system of equations, eigenvalue problems, and more, utilizing GPU resources.\n\n5. NCCL: A library for collective communication primitives, designed for high-performance distributed deep learning and HPC applications.\n\n6. cuDNN: A deep neural network library providing highly tuned and optimized implementations of common convolutional neural network (CNN) op..."
504,Provide a succinct answer to the following question: How do the DGL containers help developers avoid using homegrown software?,"DGL containers offer tested, validated, and supported end-to-end GNN solutions, allowing developers to avoid the use of expensive-to-maintain homegrown software.","The DGL containers provide the necessary libraries and development tools to build deep learning applications, eliminating the need for developers to create their own software.","\n\nDGL (Dynamic, Graph-centric, and Lightweight) containers help developers by offering pre-built deep learning (DL) modules for graph-based tasks. This means developers can leverage these pre-built modules instead of creating their own from scratch, thus avoiding the need for homegrown software. DGL containers provide a streamlined and efficient solution for building, training, and deploying graph-based DL models, saving time and resources. Additionally, DGL containers are based on PyTorch, a popular DL framework, ensuring compatibility and a familiar development experience for many developers."


In [13]:
#SCORECARD: manual assessment of the tuned and untuned completions for the 5 sampled records shown above
#record 167:    tuned:   incorrect                      untuned: incorrect and more verbose
#record  10:    tuned:   slightly correct               untuned: partly correct
#record 320:    tuned:   incorrect                      untuned: incorrect and more verbose
#record 351:    tuned:   slightly correct               untuned: incorrect and too verbose
#record 504:    tuned:   partly correct                 untuned: partly correct and too verbose