In [1]:
#compare_models.ipynb
#
#by Joe Hahn
#joe.hahn@oracle.com
#2025 March 17
#
#compare outputs generated by tuned and untuned LLMs

In [2]:
#get start time
import time as tm
#wall-clock time, in seconds since the epoch, when this cell was executed
#NOTE(review): clock_start is not read by any cell visible here; presumably used later to report total run time
clock_start = tm.time()

In [3]:
#import usual libraries
#render matplotlib figures inline, in the output of the cell that creates them
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams
#import seaborn as sns
#list of colors taken from matplotlib's current axes property cycle
color_seq = plt.rcParams['axes.prop_cycle'].by_key()['color']
import pandas as pd
import numpy as np
#let pandas display up to 200 columns and up to 200 rows
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
#numpy summarizes, rather than fully prints, arrays having more than 200 elements
np.set_printoptions(threshold=200)
#register pandas' converters with matplotlib
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [4]:
#check version numbers
#silence every python warning from here on
import warnings
warnings.filterwarnings('ignore')
#only emit log messages of level ERROR or higher, formatted as LEVEL:message
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)
import oci
print('oci.__version__ = ', oci.__version__)
#echo two environment variables from the shell: the conda environment and the build date
!echo CONDA_DEFAULT_ENV=$CONDA_DEFAULT_ENV 
!echo BUILD_DATE=$BUILD_DATE 

oci.__version__ =  2.160.1
CONDA_DEFAULT_ENV=/home/datascience/conda/generalml_p311_cpu_x86_64_v1
BUILD_DATE=


In [5]:
#seed numpy's global random number generator with a fixed value
random_state = 12
np.random.seed(random_state)

In [6]:
#set max_colwidth for pandas, so that up to 1000 characters of each cell are displayed
#the option is named in full, like the other display.* options set in this notebook,
#rather than relying on pandas matching the abbreviated key
pd.set_option('display.max_colwidth', 1000)

In [7]:
#load the test sample, which stores one json record per line
file = 'data/test.jsonl'
df_read = pd.read_json(file, lines=True)
#df and df_read are two names for the same dataframe
df = df_read
print ('df.shape = ', df_read.shape)
df_read.head()

df.shape =  (777, 2)


Unnamed: 0,prompt,completion
0,Provide a succinct answer to the following question: How does Hybridizer generate optimized code?,Hybridizer uses decorated symbols to express parallelism and generates source code or binaries optimized for multicore CPUs and GPUs.
1,Provide a succinct answer to the following question: What is the new feature in CUDA 5.5 version of NVIDIA CUFFT library?,The new feature in CUDA 5.5 version of NVIDIA CUFFT library is the support for the popular FFTW API for FFT acceleration.
2,Provide a succinct answer to the following question: How does EDDY contribute to precision medicine?,EDDY informs doctors with the best options for attacking each individual patient's cancer by analyzing how cells' DNA controls protein production and interactions.
3,Provide a succinct answer to the following question: What requirements does Fraudoscope have similar to traditional polygraph tests?,"Like traditional polygraph tests, Fraudoscope requires a set of calibration questions with well-known answers to detect lies."
4,Provide a succinct answer to the following question: What is the role of the racecheck tool in debugging CUDA applications?,"The racecheck tool in CUDA is used to detect and fix race conditions, which can occur when multiple threads access shared resources simultaneously."


In [8]:
#prediction endpoints of the untuned and tuned model deployments
#both share the same host and path, and differ only in the deployment's ocid
_endpoint_template = "https://modeldeployment.eu-frankfurt-1.oci.customer-oci.com/{ocid}/predict"
endpoint_untuned = _endpoint_template.format(
    ocid="ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaawe6j4fqawr3lnipvkecvoorvn656xb4dzw737x66w4kzyhazsotq"
)
endpoint_tuned = _endpoint_template.format(
    ocid="ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaawe6j4fqahspxixfinhnz3wf7s6xwyo3qa7oo37wvyk45ersrukia"
)

In [9]:
#demonstrate a completion request sent to the untuned model's endpoint, following
#https://github.com/oracle-samples/oci-data-science-ai-samples/blob/main/ai-quick-actions/model-deployment-tips.md#using-langchain-for-completion-endpoint
import ads
from langchain_community.llms import OCIModelDeploymentLLM
#have ads authenticate via resource principal
ads.set_auth("resource_principal")
endpoint = endpoint_untuned
model = "odsc-llm"
streaming = True
#generation settings; the cell that calls the tuned model reuses these
model_kwargs = dict(
    max_tokens=500,
    temperature=0.7,
    top_k=50,
    top_p=0.99,
    frequency_penalty=0,
    presence_penalty=0,
)
llm_untuned = OCIModelDeploymentLLM(endpoint=endpoint, model=model, model_kwargs=model_kwargs, streaming=streaming)
llm = llm_untuned
#the prompt begins and ends with a newline, and its first sentence ends with a space
prompt = (
    "\n"
    "Provide a succinct answer to the following question: \n"
    "How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?\n"
)
response = llm_untuned.invoke(prompt)
print ('prompt = ', prompt)
print ('response = ', response)

prompt =  
Provide a succinct answer to the following question: 
How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?

response =  
RAPIDS cuDF integrated GPU hash maps by utilizing NVIDIA's CUDA library and data structures, specifically cuHashTable, a highly optimized hash table for GPU. cuDF's GPU hash maps allow for efficient storage and retrieval of large amounts of data, offering significant performance benefits over CPU-based hash maps. The benefits include:

1. Speed: GPU hash maps can process large amounts of data much faster than traditional CPU hash maps due to the parallel processing capabilities of GPUs.

2. Memory Efficiency: GPU hash maps use less memory compared to traditional hash maps because they store data in a compressed format. This is beneficial when dealing with large datasets that might not fit into the memory of a single GPU.

3. Low Latency: Lookups in GPU hash maps have low latency due to the optimized data structures, making them idea

In [10]:
#send the same prompt, with the same settings, to the tuned model's endpoint
endpoint = endpoint_tuned
llm_tuned = OCIModelDeploymentLLM(endpoint=endpoint, model=model, model_kwargs=model_kwargs, streaming=streaming)
llm = llm_tuned
response = llm_tuned.invoke(prompt)
#print the prompt followed by the tuned model's response
for label, value in (('prompt = ', prompt), ('response = ', response)):
    print (label, value)

prompt =  
Provide a succinct answer to the following question: 
How did RAPIDS cuDF integrate GPU hash maps, and what benefits did it offer?

response =    Answer: RAPIDS cuDF integrated GPU hash maps by using cuHASH_MAP, a device-side hash map optimized for GPU kernels. This integration provided significant performance benefits and scalability for join and filter operations.


In [11]:
#for 5 random records in test sample, call tuned and untuned LLM
N_records = 5
#draw the records with an explicit seed, so that re-running this cell on its own selects
#the same records again instead of advancing numpy's global random number generator
df = df_read.sample(n=N_records, random_state=random_state)
#models queried for every prompt, keyed by the column that receives their responses
llms = {'completion_tuned': llm_tuned, 'completion_untuned': llm_untuned}
#NOTE(review): headers is not passed to any call in this cell; kept in case another cell reads it
headers = {"route":"/v1/chat/completions"}
t_start = tm.time()
for idx, prompt in df['prompt'].items():
    #get each model's prediction, tuned first and then untuned
    for column, llm in llms.items():
        response = llm.invoke(prompt)
        df.loc[idx, column] = response
    print ('idx=' + str(idx) + '\t' + 'prompt=' + prompt)
df_predict = df
t_finish = tm.time()
dt_seconds = t_finish - t_start
#average over the calls actually made, ie one call per model per sampled record
mean_time_to_predict = dt_seconds/(len(llms)*len(df))
print ('mean_time_to_predict = ', mean_time_to_predict)

idx=167	prompt=Provide a succinct answer to the following question: What enhancements were made to the Uncoalesced Memory Access rules in Nsight Compute?
idx=10	prompt=Provide a succinct answer to the following question: What is the role of Thrust in GPU programming?
idx=320	prompt=Provide a succinct answer to the following question: What is the main advantage of using NVIDIA Warp?
idx=351	prompt=Provide a succinct answer to the following question: What are the key features of CUDA 9 libraries?
idx=504	prompt=Provide a succinct answer to the following question: How do the DGL containers help developers avoid using homegrown software?
mean_time_to_predict =  4.043720984458924


In [12]:
#show the sampled prompts with their reference completion and both models' completions
df_predict.head()

Unnamed: 0,prompt,completion,completion_tuned,completion_untuned
167,Provide a succinct answer to the following question: What enhancements were made to the Uncoalesced Memory Access rules in Nsight Compute?,"When the Uncoalesced Memory Access rules are triggered in Nsight Compute, they show a table of the five most valuable instances, making it easier to inspect and resolve them on the Source page.","% In Nsight Compute 2021.1, Uncoalesced Memory Access rules have been enhanced with more advanced metrics to help developers better understand and optimize their GPU code.","\n\nIn Nsight Compute, the Uncoalesced Memory Access rules were enhanced to include a new option for ""Strict Uncoalesced Memory Access"" in the profiling settings. This option allows developers to detect uncoalesced memory accesses and enforce coalesced memory accesses where possible, improving GPU performance. Additionally, a ""Memory Coalescing Analysis"" feature was introduced to help developers identify and optimize memory coalescing issues in their code. This analysis provides detailed information about uncoalesced memory access patterns, allowing developers to optimize their kernel launch configurations for better performance."
10,Provide a succinct answer to the following question: What is the role of Thrust in GPU programming?,"Thrust is a parallel algorithms library inspired by the C++ Standard Template Library. Its primary role is to provide a set of building blocks for parallel computing tasks, such as sorting, scans, transforms, and reductions. Thrust supports multiple system back-ends including NVIDIA GPUs, OpenMP, and Intel's Threading Building Blocks, enabling developers to harness parallel processing power.","Thrust is a CUDA programming library that offers a high-level abstraction for parallel algorithms, making it easier to write efficient GPU code.","\n\nThrust is a C++ library for CUDA programming, which provides a high-level abstraction for parallel programming on NVIDIA GPUs. Thrust's role in GPU programming is to simplify the process of writing parallel algorithms, making them more readable, efficient, and maintainable. It provides a set of functional-style operations, such as sorting, reducing, and transforming data, which are often used in scientific computing, data analysis, and machine learning. Thrust is designed to be highly portable, so that algorithms written with it can be easily ported to other parallel systems. In essence, it helps developers focus on their algorithms, rather than the intricacies of parallel programming and GPU-specific architecture."
320,Provide a succinct answer to the following question: What is the main advantage of using NVIDIA Warp?,"NVIDIA Warp provides a way to write high-performance simulation code using Python, an interpreted language, thus combining performance with productivity.","NVIDIA Warp simplifies the process of optimizing and distributing code to GPUs, making it easier for developers to utilize parallel computing capabilities.","\n\nThe main advantage of using NVIDIA Warp is that it enables efficient GPU processing by grouping threads that can be executed together, minimizing memory traffic, and hiding latency, thereby improving overall performance in parallel computing applications. This technique, known as ""warp-level parallelism,"" allows NVIDIA GPUs to process large amounts of data more quickly and efficiently."
351,Provide a succinct answer to the following question: What are the key features of CUDA 9 libraries?,"CUDA 9 libraries include optimizations for Volta architecture, performance improvements in cuBLAS, redesigned NPP for image and signal processing, improved cuFFT, and new algorithms in nvGRAPH.","> CUDA 9 libraries introduce new capabilities in deep learning, image processing, and high performance computing. The libraries are optimized for CUDA 9 and Volta architecture, offering improved performance and capabilities.","\n\nCUDA 9 libraries, primarily used for accelerating computations on NVIDIA GPUs, include the following key features:\n\n1. New APIs and libraries: CUDA 9 introduced APIs like CUDA Graphs for building dataflow graphs, CUDA DNN for deep learning applications, and CUDA RCM (RoCM) for AMD GPUs.\n\n2. Improved performance: Enhancements to the CUDA compiler, such as better auto-tuning and optimizations, help deliver improved performance.\n\n3. Multi-GPU support: CUDA 9 enables efficient utilization of multiple GPUs for increased computational power through features like Multi-Process Service (MPS) and CUDA Graphs.\n\n4. CUDA Toolkit updates: Improvements to tools like NVIDIA Nsight, CUDA Profiler, and CUDA-gdb help developers optimize their CUDA applications.\n\n5. GPU-accelerated data science: Libraries like cuBLAS, cuFFT, cuSOLVER, and cuSPARSE provide optimized algorithms for linear algebra, Fourier transforms, solving linear systems, and sparse matrices, respectively.\n\n6. Deep le..."
504,Provide a succinct answer to the following question: How do the DGL containers help developers avoid using homegrown software?,"DGL containers offer tested, validated, and supported end-to-end GNN solutions, allowing developers to avoid the use of expensive-to-maintain homegrown software.", The DGL containers eliminate the need for developers to install and manage numerous software packages from various sources. This helps developers avoid the hassle of homegrown software and focus on their primary tasks.,"\n\nDGL containers, specifically the PyTorch DGL suite, help developers avoid using homegrown software by offering a pre-built, optimized, and well-documented deep learning graph library. This library provides essential graph neural network (GNN) building blocks, such as graph convolutional networks (GCNs), message passing neural networks (MPNNs), and graph attention networks (GATs), among others. By using DGL containers, developers can leverage these advanced GNN architectures and focus on their specific tasks, rather than spending time on building and optimizing these components from scratch. Additionally, DGL containers provide a consistent and streamlined development environment, allowing developers to easily integrate their models with other PyTorch-based projects and tools. This not only helps in avoiding the overhead of creating custom solutions but also promotes reusability and collaboration among the developer community."


In [13]:
#SCORECARD
#record 167:    tuned:   incorrect                      untuned: incorrect and more verbose
#record  10:    tuned:   slightly correct               untuned: partly correct
#record 320:    tuned:   incorrect                      untuned: incorrect and more verbose
#record 351:    tuned:   slightly correct               untuned: incorrect and too verbose
#record 504:    tuned:   partly correct                 untuned: partly correct and too verbose