In [1]:
import numpy as np
import torch
import whisper

# Name of the Whisper checkpoint to load (you can use "base", "small", "medium", "large")
checkpoint_name = "base"
model = whisper.load_model(checkpoint_name)

# Example: your numpy array (simulate 1-second audio at 16kHz)
# Make sure it's float32 and in the range [-1, 1]
# shape: (num_samples,)
# A seeded generator keeps the synthetic clip identical on every run.
rng = np.random.default_rng(seed=0)
audio_np = rng.standard_normal(16000).astype(np.float32)

# Peak-normalize in place. Skip the division for an all-zero clip, which
# would otherwise be filled with NaNs by the 0/0.
peak = np.max(np.abs(audio_np))
if peak > 0:
    audio_np /= peak

# Convert numpy array to torch tensor (the tensor shares memory with audio_np)
audio_tensor = torch.from_numpy(audio_np)

# Drop singleton dimensions (e.g. a (1, N) array) so the audio ends up 1-D
if audio_tensor.dim() > 1:
    audio_tensor = audio_tensor.squeeze()

# Transcribe: hand the samples to Whisper as a numpy array, then show the text
waveform = audio_tensor.numpy()
result = model.transcribe(waveform)
transcript = result["text"]
print("Transcribed text:", transcript)


100%|███████████████████████████████████████| 139M/139M [00:12<00:00, 11.5MiB/s]


Transcribed text: 


In [None]:
!pip install git+https://github.com/openai/whisper.git torch

import torch
import torch.nn as nn
import torch.optim as optim
import whisper
from whisper.tokenizer import get_tokenizer





In [None]:
# -------------------------------
# 1. Define a simple upstream model
# -------------------------------
class UpstreamModel(nn.Module):
    """Small feed-forward front-end: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of the last dimension of the input.
        hidden_dim: width of the hidden layer.
        output_dim: size of the last dimension of the output.
    """

    def __init__(self, input_dim=80, hidden_dim=512, output_dim=512):
        super().__init__()
        # Layers are created in the same order and kept in one Sequential, so
        # their indices inside `self.net` remain 0, 1 and 2.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map `x` of shape (batch, time, input_dim) to (batch, time, output_dim)."""
        projected = self.net(x)
        return projected

# -------------------------------
# 2. Load Whisper & freeze it
# -------------------------------
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
whisper_model = whisper.load_model("base", device=device)

# Freeze all Whisper params in one call (sets requires_grad=False on each one)
whisper_model.requires_grad_(False)

# Only the decoder is used further down; `whisper_model` itself stays referenced.
decoder = whisper_model.decoder
tokenizer = get_tokenizer(multilingual=True)

# -------------------------------
# 3. Combined model
# -------------------------------
class UpstreamToWhisper(nn.Module):
    """Feeds the output of an upstream module into a decoder.

    Args:
        upstream: module applied to the input `x`.
        decoder: module called as `decoder(tokens, upstream_output)`.
    """

    def __init__(self, upstream, decoder):
        super().__init__()
        # Registered in the same order as before: upstream first, then decoder.
        self.upstream, self.decoder = upstream, decoder

    def forward(self, x, tokens):
        """Return `decoder(tokens, upstream(x))`."""
        # The upstream output takes the place of Whisper's encoder output;
        # the original note gives its shape as (batch, time, 512).
        return self.decoder(tokens, self.upstream(x))

# Instantiate the upstream front-end, wrap it together with the Whisper
# decoder, and place both on `device`.
# NOTE: this rebinds the global `model`, which the first cell used for the
# stock Whisper model.
upstream = UpstreamModel()
upstream = upstream.to(device)
model = UpstreamToWhisper(upstream=upstream, decoder=decoder)
model = model.to(device)



In [None]:
# -------------------------------
# 4. Dummy training example
# -------------------------------
# Random stand-in features: batch of 2, 100 time steps, 80 features per step
dummy_audio_feats = torch.randn(2, 100, 80).to(device)

# Dummy target text, one string per batch item
target_texts = ["My mother is from an italian village", "openai rocks"]
token_ids = list(map(tokenizer.encode, target_texts))

# Token buffer of shape (number of texts, longest sequence),
# pre-filled with tokenizer.eot as the padding value
max_len = max(map(len, token_ids))
tokens = torch.full(
    (len(token_ids), max_len),
    tokenizer.eot,
    dtype=torch.long,
    device=device,
)
