# Multi-model inference with Ray on Databricks

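This notebook demonstrates how to run inference with multiple models (Whisper for speech recognition, a BERT-based NER model, and Phi-4 for text generation) on Databricks using Ray and vLLM. It prepares the LJSpeech audio dataset, downloads the models from Hugging Face to a Unity Catalog volume, and sets up a Ray cluster for the inference workload.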

In [None]:
# vLLM is pinned to 0.7.0 (Whisper is supported from 0.7.0 onwards); numba is pinned manually to work around dependency conflicts
%pip install vllm==0.7.0 pydub numba==0.61.0 databricks-sdk 
# NOTE(review): ray is upgraded to the latest release without a version pin — consider pinning it for reproducible runs
%pip install ray --upgrade
# Restart the Python process so the packages installed above are the ones imported by later cells
%restart_python

## Set catalog and schema

In [None]:
# Catalog and schema that hold the volume and tables created by this notebook
CATALOG, SCHEMA = "marcell", "call_centre_processing"

Create the catalog, schema, and volume if they don't exist, then create directories for the compressed archive, the raw audio files, and the models.

In [None]:
# Create the catalog, schema and volume in that order; each statement is a
# no-op when the object already exists.
for ddl_statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.data",
):
    spark.sql(ddl_statement)

# Directories inside the volume: downloaded archive, extracted audio, saved models.
for volume_subdir in ("compressed/LJSpeech", "raw_audio/LJSpeech", "models"):
    dbutils.fs.mkdirs(f"/Volumes/{CATALOG}/{SCHEMA}/data/{volume_subdir}")

## Download raw audio files

We download the [LJSpeech dataset](https://paperswithcode.com/dataset/ljspeech) and extract it to the raw audio directory. This is a collection of 13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. The files are stored in a tar.bz2 archive, so we first download the archive and then extract it.

In [None]:
# Fetch the LJSpeech archive into the volume's "compressed" directory

import urllib.request

url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2"

# Reuse the archive's own file name (last URL segment) for the local copy.
archive_name = url.rsplit("/", 1)[-1]
target_file_path = f"/Volumes/{CATALOG}/{SCHEMA}/data/compressed/LJSpeech/{archive_name}"

urllib.request.urlretrieve(url, target_file_path)

In [0]:
# Extract the LJSpeech dataset

import tarfile

extract_to_path = f"/Volumes/{CATALOG}/{SCHEMA}/data/raw_audio/LJSpeech"

# The download is a bzip2-compressed tarball (.tar.bz2), not a zip file, so it
# has to be opened with tarfile; zipfile.ZipFile would raise BadZipFile on it.
# Where the running Python provides extraction filters, use the "data" filter
# so unsafe members (absolute paths, "..", device files) are rejected.
extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
with tarfile.open(target_file_path, "r:bz2") as tar_ref:
    tar_ref.extractall(extract_to_path, **extract_kwargs)

## Create reference dataframe

We create a reference dataframe that contains the file paths of the raw audio files. We will use this dataframe to parallelize the inference process.

In [0]:
import pyspark.sql.functions as F

# Directory containing the extracted .wav files. Built from CATALOG/SCHEMA so
# the listing follows the configuration instead of a hard-coded catalog/schema.
wavs_dir = f"/Volumes/{CATALOG}/{SCHEMA}/data/raw_audio/LJSpeech/LJSpeech-1.1/wavs/"

df_file_reference = (
    spark.createDataFrame(dbutils.fs.ls(wavs_dir))
    # Keep everything from the 6th character on, i.e. drop a 5-character
    # "dbfs:" scheme prefix so file_path starts at "/Volumes/...".
    # NOTE(review): assumes dbutils.fs.ls returns paths prefixed with "dbfs:" — confirm.
    .withColumn("file_path", F.expr("substring(path, 6, length(path))"))
)

df_file_reference.display()

Write the dataframe to a Delta table.

In [0]:
# Save the file listing as a table in the configured catalog/schema, replacing
# any existing data and schema. The name is built from CATALOG/SCHEMA so it
# stays in sync with the configuration cell.
df_file_reference.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(
    f"{CATALOG}.{SCHEMA}.recording_file_reference"
)

## Download models from Hugging Face

We download three models from Hugging Face:
- [Whisper-medium](https://huggingface.co/openai/whisper-medium)
- [Bert-large-uncased](https://huggingface.co/bert-large-uncased) (Hugging Face's default model for text classification)
- [Phi-4](https://huggingface.co/microsoft/phi-4)



In [None]:
from transformers import pipeline
import torch

Whisper-medium

In [None]:
# Volume directory that receives the saved Whisper-medium pipeline
WHISPER_MODEL_SAVE_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/data/models/whisper-medium/"
dbutils.fs.mkdirs(WHISPER_MODEL_SAVE_PATH)

# Load the speech-recognition pipeline in float16 on the first GPU, then
# persist it to the volume.
whisper_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium",
    torch_dtype=torch.float16,
    device="cuda:0",
)
whisper_pipeline.save_pretrained(WHISPER_MODEL_SAVE_PATH)

Bert-large-uncased

In [None]:
# Volume directory that receives the saved NER pipeline
NER_MODEL_SAVE_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/data/models/bert-large-uncased/"
dbutils.fs.mkdirs(NER_MODEL_SAVE_PATH)

# No model is named here, so transformers picks its default checkpoint for the
# "ner" task.
# NOTE(review): the directory is called "bert-large-uncased" — confirm that
# matches the default checkpoint, and consider pinning the model explicitly.
ner_pipeline = pipeline(task="ner", device="cuda:0")
ner_pipeline.save_pretrained(NER_MODEL_SAVE_PATH)

Phi-4

In [None]:
# Volume directory that receives the saved Phi-4 pipeline
PHI_MODEL_SAVE_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/data/models/phi-4/"
dbutils.fs.mkdirs(PHI_MODEL_SAVE_PATH)

# Dtype and device placement are both left to the library ("auto").
phi_pipeline = pipeline(
    task="text-generation",
    model="microsoft/phi-4",
    device_map="auto",
    model_kwargs={"torch_dtype": "auto"},
)
phi_pipeline.save_pretrained(PHI_MODEL_SAVE_PATH)

## Run inference

We run inference with the models and save the results to a Delta table.

In [None]:
import ray
import os
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster

import ssl
import time

import pyspark.sql.types as T
import pandas as pd

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
import librosa
import pydub
import numpy as np

### Set up Ray on Databricks

In [None]:

# Resources Ray may use on each worker node
num_cpu_cores_per_worker = 20 # number of cores to allocate to Ray per worker
num_gpu_per_worker = 1 # number of GPUs to allocate to Ray per worker

# Resources Ray may use on the head node
num_cpus_head_node = 10 # number of cores to allocate to Ray on the head node
num_gpus_head_node = 1 # number of GPUs to allocate to Ray on the head node

# Autoscaling bounds; with min == max the cluster has a fixed size of 2 workers
min_worker_nodes = 2 # autoscaling minimum number of workers
max_worker_nodes = 2 # autoscaling maximum number of workers

# Start the Ray-on-Spark cluster. The per-worker resources are passed via
# `num_cpus_worker_node` / `num_gpus_worker_node`: the older
# `num_cpus_per_node` / `num_gpus_per_node` keywords are deprecated aliases in
# current Ray releases, and this notebook installs the latest Ray.
ray_conf = setup_ray_cluster(
    min_worker_nodes=min_worker_nodes,
    max_worker_nodes=max_worker_nodes,
    num_cpus_head_node=num_cpus_head_node,
    num_gpus_head_node=num_gpus_head_node,
    num_cpus_worker_node=num_cpu_cores_per_worker,
    num_gpus_worker_node=num_gpu_per_worker,
)
