In [1]:
!pip install transformers torch torchaudio librosa jiwer matplotlib seaborn plotly pandas numpy psutil

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidi

In [2]:
import torch
import torchaudio
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import time
import psutil
import os
from pathlib import Path
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer, cer
import warnings
warnings.filterwarnings('ignore')


In [3]:
plt.style.use('default')
sns.set_palette("husl")


In [4]:
class ModelEvaluator:
    def __init__(self):
        self.results = {}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

    def load_audio(self, audio_path, target_sr=16000):
        """Load and preprocess audio file"""
        try:
            # Load audio using librosa (more robust)
            audio, sr = librosa.load(audio_path, sr=target_sr)
            return audio, sr
        except Exception as e:
            print(f"Error loading audio {audio_path}: {e}")
            return None, None

    def evaluate_single_model(self, model_name, audio_files, reference_texts=None, verbose=True):
        """
        Evaluate a single model on given audio files

        Args:
            model_name: HuggingFace model name (e.g., "shivkumarganesh/whisper-small-uz-v1")
            audio_files: List of audio file paths or single audio file path
            reference_texts: List of reference transcriptions (optional, for WER/CER calculation)
            verbose: Print detailed logs
        """
        if isinstance(audio_files, str):
            audio_files = [audio_files]

        if verbose:
            print(f"\n{'='*60}")
            print(f"🔄 EVALUATING MODEL: {model_name}")
            print(f"{'='*60}")

        # Initialize metrics
        metrics = {
            'model_name': model_name,
            'transcriptions': [],
            'load_time': 0,
            'inference_times': [],
            'memory_usage': {'before': 0, 'after': 0, 'peak': 0},
            'model_size_mb': 0,
            'audio_durations': [],
            'processing_speed_ratio': [],  # How many times faster than real-time
            'wer_scores': [],
            'cer_scores': [],
            'errors': []
        }

        try:
            # Memory before loading
            process = psutil.Process()
            metrics['memory_usage']['before'] = process.memory_info().rss / 1024 / 1024

            # Load model and processor
            if verbose:
                print("📥 Loading model and processor...")
            start_load = time.time()

            processor = WhisperProcessor.from_pretrained(model_name)
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            model.to(self.device)

            metrics['load_time'] = time.time() - start_load
            metrics['model_size_mb'] = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024)

            # Memory after loading
            metrics['memory_usage']['after'] = process.memory_info().rss / 1024 / 1024

            if verbose:
                print(f"✅ Model loaded in {metrics['load_time']:.2f}s")
                print(f"📊 Model size: {metrics['model_size_mb']:.1f} MB")
                print(f"💾 Memory usage: {metrics['memory_usage']['after'] - metrics['memory_usage']['before']:.1f} MB")

            # Process each audio file
            for i, audio_path in enumerate(audio_files):
                if verbose:
                    print(f"\n🎵 Processing audio {i+1}/{len(audio_files)}: {Path(audio_path).name}")

                # Load audio
                audio, sr = self.load_audio(audio_path)
                if audio is None:
                    metrics['errors'].append(f"Failed to load {audio_path}")
                    continue

                audio_duration = len(audio) / sr
                metrics['audio_durations'].append(audio_duration)

                # Transcribe
                start_inference = time.time()

                try:
                    # Prepare inputs
                    inputs = processor(audio, sampling_rate=sr, return_tensors="pt").to(self.device)

                    # Generate transcription
                    with torch.no_grad():
                        predicted_ids = model.generate(
                            inputs["input_features"],
                            max_length=448,
                            num_beams=5,
                            do_sample=False,
                            task="transcribe",
                            language="uz"  # Uzbek language code
                        )

                    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                    inference_time = time.time() - start_inference

                    metrics['transcriptions'].append(transcription)
                    metrics['inference_times'].append(inference_time)

                    # Calculate processing speed ratio
                    speed_ratio = audio_duration / inference_time
                    metrics['processing_speed_ratio'].append(speed_ratio)

                    if verbose:
                        print(f"📝 Transcription: '{transcription}'")
                        print(f"⏱️  Inference time: {inference_time:.2f}s")
                        print(f"🚀 Speed ratio: {speed_ratio:.1f}x real-time")

                    # Calculate WER and CER if reference provided
                    if reference_texts and i < len(reference_texts):
                        wer_score = wer([reference_texts[i]], [transcription])
                        cer_score = cer([reference_texts[i]], [transcription])
                        metrics['wer_scores'].append(wer_score)
                        metrics['cer_scores'].append(cer_score)

                        if verbose:
                            print(f"📊 WER: {wer_score:.3f} ({wer_score*100:.1f}%)")
                            print(f"📊 CER: {cer_score:.3f} ({cer_score*100:.1f}%)")

                except Exception as e:
                    error_msg = f"Inference error for {audio_path}: {e}"
                    metrics['errors'].append(error_msg)
                    if verbose:
                        print(f"❌ {error_msg}")

            # Peak memory usage
            metrics['memory_usage']['peak'] = process.memory_info().rss / 1024 / 1024

            # Calculate aggregate metrics
            if metrics['inference_times']:
                metrics['avg_inference_time'] = np.mean(metrics['inference_times'])
                metrics['avg_speed_ratio'] = np.mean(metrics['processing_speed_ratio'])
                metrics['total_audio_duration'] = sum(metrics['audio_durations'])
                metrics['total_processing_time'] = sum(metrics['inference_times'])

            if metrics['wer_scores']:
                metrics['avg_wer'] = np.mean(metrics['wer_scores'])
                metrics['avg_cer'] = np.mean(metrics['cer_scores'])

            if verbose:
                print(f"\n📈 SUMMARY FOR {model_name}:")
                print(f"   Average inference time: {metrics.get('avg_inference_time', 0):.2f}s")
                print(f"   Average speed ratio: {metrics.get('avg_speed_ratio', 0):.1f}x")
                if 'avg_wer' in metrics:
                    print(f"   Average WER: {metrics['avg_wer']:.3f} ({metrics['avg_wer']*100:.1f}%)")
                    print(f"   Average CER: {metrics['avg_cer']:.3f} ({metrics['avg_cer']*100:.1f}%)")

        except Exception as e:
            error_msg = f"Model loading/evaluation error: {e}"
            metrics['errors'].append(error_msg)
            if verbose:
                print(f"❌ {error_msg}")

        finally:
            # Clean up memory
            if 'model' in locals():
                del model
            if 'processor' in locals():
                del processor
            torch.cuda.empty_cache() if torch.cuda.is_available() else None

        # Store results
        self.results[model_name] = metrics
        return metrics

    def compare_models(self, model_names, audio_files, reference_texts=None):
        """Compare multiple models"""
        print(f"\n🏁 STARTING COMPARISON OF {len(model_names)} MODELS")
        print(f"📁 Audio files: {len(audio_files) if isinstance(audio_files, list) else 1}")

        for model_name in model_names:
            self.evaluate_single_model(model_name, audio_files, reference_texts)

        return self.results

    def create_comparison_plots(self, save_plots=True):
        """Create comprehensive comparison plots"""
        if not self.results:
            print("❌ No results to plot. Run evaluation first.")
            return

        # Prepare data for plotting
        plot_data = []
        for model_name, metrics in self.results.items():
            if 'avg_inference_time' in metrics:  # Only include successful evaluations
                plot_data.append({
                    'Model': model_name.split('/')[-1],  # Short name
                    'Full_Model': model_name,
                    'Avg_Inference_Time': metrics.get('avg_inference_time', 0),
                    'Avg_Speed_Ratio': metrics.get('avg_speed_ratio', 0),
                    'Memory_Usage_MB': metrics['memory_usage']['after'] - metrics['memory_usage']['before'],
                    'Model_Size_MB': metrics.get('model_size_mb', 0),
                    'Load_Time': metrics.get('load_time', 0),
                    'Avg_WER': metrics.get('avg_wer', None),
                    'Avg_CER': metrics.get('avg_cer', None),
                    'Accuracy_Percent': (1 - metrics.get('avg_wer', 1)) * 100 if metrics.get('avg_wer') is not None else None
                })

        if not plot_data:
            print("❌ No successful evaluations to plot.")
            return

        df = pd.DataFrame(plot_data)

        # 1. Performance Overview
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('Inference Time Comparison', 'Speed Ratio (Real-time Multiple)',
                          'Memory Usage', 'Model Size'),
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}]]
        )

        # Inference time
        fig.add_trace(
            go.Bar(x=df['Model'], y=df['Avg_Inference_Time'], name='Inference Time (s)',
                   marker_color='lightblue'),
            row=1, col=1
        )

        # Speed ratio
        fig.add_trace(
            go.Bar(x=df['Model'], y=df['Avg_Speed_Ratio'], name='Speed Ratio (x)',
                   marker_color='lightgreen'),
            row=1, col=2
        )

        # Memory usage
        fig.add_trace(
            go.Bar(x=df['Model'], y=df['Memory_Usage_MB'], name='Memory (MB)',
                   marker_color='orange'),
            row=2, col=1
        )

        # Model size
        fig.add_trace(
            go.Bar(x=df['Model'], y=df['Model_Size_MB'], name='Model Size (MB)',
                   marker_color='red'),
            row=2, col=2
        )

        fig.update_layout(height=800, title_text="🔍 Model Performance Comparison", showlegend=False)
        fig.show()

        # 2. Accuracy Comparison (if available)
        if df['Avg_WER'].notna().any():
            fig_acc = go.Figure()

            fig_acc.add_trace(go.Bar(
                x=df['Model'],
                y=df['Accuracy_Percent'],
                name='Accuracy %',
                marker_color='green',
                text=[f'{x:.1f}%' for x in df['Accuracy_Percent']],
                textposition='auto'
            ))

            fig_acc.add_trace(go.Bar(
                x=df['Model'],
                y=df['Avg_WER'] * 100,
                name='WER %',
                marker_color='red',
                text=[f'{x:.1f}%' for x in df['Avg_WER'] * 100],
                textposition='auto'
            ))

            fig_acc.update_layout(
                title='🎯 Accuracy Comparison',
                xaxis_title='Model',
                yaxis_title='Percentage (%)',
                barmode='group',
                height=500
            )
            fig_acc.show()

        # 3. Efficiency Scatter Plot
        if df['Avg_WER'].notna().any():
            fig_scatter = px.scatter(
                df,
                x='Avg_Inference_Time',
                y='Accuracy_Percent',
                size='Model_Size_MB',
                color='Model',
                title='⚡ Efficiency Analysis: Speed vs Accuracy',
                labels={
                    'Avg_Inference_Time': 'Average Inference Time (seconds)',
                    'Accuracy_Percent': 'Accuracy (%)',
                    'Model_Size_MB': 'Model Size (MB)'
                },
                hover_data=['Memory_Usage_MB', 'Avg_Speed_Ratio']
            )
            fig_scatter.update_layout(height=500)
            fig_scatter.show()

        # 4. Summary Table
        print("\n📊 DETAILED COMPARISON TABLE")
        print("="*100)

        display_df = df[['Model', 'Avg_Inference_Time', 'Avg_Speed_Ratio', 'Memory_Usage_MB',
                        'Model_Size_MB', 'Load_Time']]
        if 'Accuracy_Percent' in df.columns and df['Accuracy_Percent'].notna().any():
            display_df = pd.concat([display_df, df[['Avg_WER', 'Avg_CER', 'Accuracy_Percent']]], axis=1)

        print(display_df.round(3).to_string(index=False))

        # 5. Recommendations
        print(f"\n🏆 RECOMMENDATIONS")
        print("="*50)

        if len(df) > 1:
            fastest_model = df.loc[df['Avg_Inference_Time'].idxmin(), 'Model']
            print(f"🚀 Fastest Model: {fastest_model}")

            smallest_model = df.loc[df['Model_Size_MB'].idxmin(), 'Model']
            print(f"💾 Smallest Model: {smallest_model}")

            memory_efficient = df.loc[df['Memory_Usage_MB'].idxmin(), 'Model']
            print(f"🧠 Most Memory Efficient: {memory_efficient}")

            if df['Accuracy_Percent'].notna().any():
                most_accurate = df.loc[df['Accuracy_Percent'].idxmax(), 'Model']
                print(f"🎯 Most Accurate: {most_accurate}")

                best_balance = df.loc[(df['Accuracy_Percent'] / df['Accuracy_Percent'].max() +
                                     (df['Avg_Speed_Ratio'] / df['Avg_Speed_Ratio'].max())).idxmax(), 'Model']
                print(f"⚖️  Best Balance (Speed + Accuracy): {best_balance}")

        return df


In [5]:
def quick_test_single_model(model_name, audio_file_path, reference_text=None):
    """
    Quick test for a single model - just change the model name!

    Usage:
        quick_test_single_model("shivkumarganesh/whisper-small-uz-v1", "audio.wav")
    """
    evaluator = ModelEvaluator()
    reference_texts = [reference_text] if reference_text else None

    result = evaluator.evaluate_single_model(
        model_name=model_name,
        audio_files=[audio_file_path],
        reference_texts=reference_texts,
        verbose=True
    )

    return result, evaluator

In [6]:
def compare_multiple_models(model_list, audio_files, reference_texts=None):
    """
    Compare multiple models easily

    Usage:
        models = ["shivkumarganesh/whisper-small-uz-v1", "GitNazarov/whisper-large-uz"]
        compare_multiple_models(models, ["audio1.wav", "audio2.wav"])
    """
    evaluator = ModelEvaluator()
    results = evaluator.compare_models(model_list, audio_files, reference_texts)

    # Create plots
    comparison_df = evaluator.create_comparison_plots()

    return results, comparison_df, evaluator


In [8]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "GitNazarov/whisper-large-uz",
    "Makhmud/whisper-uzbek"
]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)

Using device: cuda

🏁 STARTING COMPARISON OF 2 MODELS
📁 Audio files: 2

🔄 EVALUATING MODEL: GitNazarov/whisper-large-uz
📥 Loading model and processor...


pytorch_model.bin:   4%|4         | 273M/6.45G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

✅ Model loaded in 317.05s
📊 Model size: 5887.2 MB
💾 Memory usage: 3146.6 MB

🎵 Processing audio 1/2: s_1796.wav


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50359]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


📝 Transcription: ' Qüydagiler suxurta xadisa səsəq soplanməidə. Mekhnad şartnamasını xadimnin təşabosub ilan bekarqalindəndə xadimnin oz meknad vazifələrini muntazam ravisdə buzgəlligə.'
⏱️  Inference time: 11.39s
🚀 Speed ratio: 1.0x real-time
📊 WER: 0.944 (94.4%)
📊 CER: 0.322 (32.2%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: ' Taz versiyaya ikki madar anaqla genetik shiramiz. Menin ismim Nur Muhammad.'
⏱️  Inference time: 3.80s
🚀 Speed ratio: 1.4x real-time
📊 WER: 1.111 (111.1%)
📊 CER: 0.324 (32.4%)

📈 SUMMARY FOR GitNazarov/whisper-large-uz:
   Average inference time: 7.60s
   Average speed ratio: 1.2x
   Average WER: 1.028 (102.8%)
   Average CER: 0.323 (32.3%)

🔄 EVALUATING MODEL: Makhmud/whisper-uzbek
📥 Loading model and processor...


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.84k [00:00<?, ?B/s]

✅ Model loaded in 51.37s
📊 Model size: 922.1 MB
💾 Memory usage: 27.2 MB

🎵 Processing audio 1/2: s_1796.wav
📝 Transcription: 'Quyidagilar sug‘urta hodisasi hisoblanmaydi. Mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda, xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.'
⏱️  Inference time: 1.90s
🚀 Speed ratio: 5.7x real-time
📊 WER: 0.222 (22.2%)
📊 CER: 0.023 (2.3%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Tasvversiya ikki, madr aniqligini tekshiramiz, mening ismi Burmuhammad.'
⏱️  Inference time: 0.87s
🚀 Speed ratio: 6.2x real-time
📊 WER: 0.667 (66.7%)
📊 CER: 0.135 (13.5%)

📈 SUMMARY FOR Makhmud/whisper-uzbek:
   Average inference time: 1.39s
   Average speed ratio: 6.0x
   Average WER: 0.444 (44.4%)
   Average CER: 0.079 (7.9%)



📊 DETAILED COMPARISON TABLE
           Model  Avg_Inference_Time  Avg_Speed_Ratio  Memory_Usage_MB  Model_Size_MB  Load_Time  Avg_WER  Avg_CER  Accuracy_Percent
whisper-large-uz               7.596            1.195         3146.648       5887.241    317.054    1.028    0.323            -2.778
   whisper-uzbek               1.389            5.984           27.195        922.146     51.373    0.444    0.079            55.556

🏆 RECOMMENDATIONS
🚀 Fastest Model: whisper-uzbek
💾 Smallest Model: whisper-uzbek
🧠 Most Memory Efficient: whisper-uzbek
🎯 Most Accurate: whisper-uzbek
⚖️  Best Balance (Speed + Accuracy): whisper-uzbek


In [13]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "aslon1213/whisper-small-uz-with-uzbekvoice"]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)

Using device: cuda

🏁 STARTING COMPARISON OF 1 MODELS
📁 Audio files: 2

🔄 EVALUATING MODEL: aslon1213/whisper-small-uz-with-uzbekvoice
📥 Loading model and processor...
✅ Model loaded in 6.24s
📊 Model size: 922.1 MB
💾 Memory usage: -0.0 MB

🎵 Processing audio 1/2: s_1796.wav
'
⏱️  Inference time: 1.85s
🚀 Speed ratio: 5.9x real-time
📊 WER: 0.222 (22.2%)
📊 CER: 0.023 (2.3%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Test versiya ikki, model aniqligini tekshiramiz, mening ismim Nurmuhammad.'
⏱️  Inference time: 0.79s
🚀 Speed ratio: 6.8x real-time
📊 WER: 0.222 (22.2%)
📊 CER: 0.027 (2.7%)

📈 SUMMARY FOR aslon1213/whisper-small-uz-with-uzbekvoice:
   Average inference time: 1.32s
   Average speed ratio: 6.4x
   Average WER: 0.222 (22.2%)
   Average CER: 0.025 (2.5%)



📊 DETAILED COMPARISON TABLE
                           Model  Avg_Inference_Time  Avg_Speed_Ratio  Memory_Usage_MB  Model_Size_MB  Load_Time  Avg_WER  Avg_CER  Accuracy_Percent
whisper-small-uz-with-uzbekvoice               1.323            6.375           -0.012        922.146      6.243    0.222    0.025            77.778

🏆 RECOMMENDATIONS


In [16]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "mustafoyev202/whisper-uz"]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)

Using device: cuda

🏁 STARTING COMPARISON OF 1 MODELS
📁 Audio files: 2

🔄 EVALUATING MODEL: mustafoyev202/whisper-uz
📥 Loading model and processor...
✅ Model loaded in 3.82s
📊 Model size: 922.1 MB
💾 Memory usage: 0.0 MB

🎵 Processing audio 1/2: s_1796.wav
📝 Transcription: 'Quyidagilar sug‘urta hodisasiz hisoblanmaydi. Mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda, xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi'
⏱️  Inference time: 1.97s
🚀 Speed ratio: 5.6x real-time
📊 WER: 0.333 (33.3%)
📊 CER: 0.034 (3.4%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Tasvirsiya ikki, madir aniqligini tekshiramiz, mening ismi Nur Muhammad.'
⏱️  Inference time: 0.78s
🚀 Speed ratio: 6.9x real-time
📊 WER: 0.778 (77.8%)
📊 CER: 0.162 (16.2%)

📈 SUMMARY FOR mustafoyev202/whisper-uz:
   Average inference time: 1.38s
   Average speed ratio: 6.2x
   Average WER: 0.556 (55.6%)
   Average CER: 0.098 (9.8%)



📊 DETAILED COMPARISON TABLE
     Model  Avg_Inference_Time  Avg_Speed_Ratio  Memory_Usage_MB  Model_Size_MB  Load_Time  Avg_WER  Avg_CER  Accuracy_Percent
whisper-uz               1.375            6.249              0.0        922.146      3.818    0.556    0.098            44.444

🏆 RECOMMENDATIONS


In [17]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "nodirjon/whisper-small-uz",
    "aisha-org/Whisper-Uzbek"]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)

Using device: cuda

🏁 STARTING COMPARISON OF 2 MODELS
📁 Audio files: 2

🔄 EVALUATING MODEL: nodirjon/whisper-small-uz
📥 Loading model and processor...


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

✅ Model loaded in 50.25s
📊 Model size: 922.1 MB
💾 Memory usage: 0.0 MB

🎵 Processing audio 1/2: s_1796.wav
📝 Transcription: 'Quyidagilar sug‘urda xodisasi hisoblanmaydi. Mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda, xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.'
⏱️  Inference time: 1.87s
🚀 Speed ratio: 5.9x real-time
📊 WER: 0.278 (27.8%)
📊 CER: 0.034 (3.4%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Tast versiya ikki, model anaqligini tekshiramiz, mening ismim bur muhammad.'
⏱️  Inference time: 0.81s
🚀 Speed ratio: 6.7x real-time
📊 WER: 0.556 (55.6%)
📊 CER: 0.081 (8.1%)

📈 SUMMARY FOR nodirjon/whisper-small-uz:
   Average inference time: 1.34s
   Average speed ratio: 6.3x
   Average WER: 0.417 (41.7%)
   Average CER: 0.058 (5.8%)

🔄 EVALUATING MODEL: aisha-org/Whisper-Uzbek
📥 Loading model and processor...


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

✅ Model loaded in 157.26s
📊 Model size: 2913.9 MB
💾 Memory usage: -270.0 MB

🎵 Processing audio 1/2: s_1796.wav
📝 Transcription: 'Quyidagilar sug‘urta hodisasi hisoblanmaydi, mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda, xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi'
⏱️  Inference time: 4.80s
🚀 Speed ratio: 2.3x real-time
📊 WER: 0.222 (22.2%)
📊 CER: 0.023 (2.3%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Tasvirsiya ikki, model aniqligini tekshiramiz, mening ismim Burmuhammad'
⏱️  Inference time: 2.17s
🚀 Speed ratio: 2.5x real-time
📊 WER: 0.444 (44.4%)
📊 CER: 0.081 (8.1%)

📈 SUMMARY FOR aisha-org/Whisper-Uzbek:
   Average inference time: 3.48s
   Average speed ratio: 2.4x
   Average WER: 0.333 (33.3%)
   Average CER: 0.052 (5.2%)



📊 DETAILED COMPARISON TABLE
           Model  Avg_Inference_Time  Avg_Speed_Ratio  Memory_Usage_MB  Model_Size_MB  Load_Time  Avg_WER  Avg_CER  Accuracy_Percent
whisper-small-uz               1.342            6.265            0.000        922.146     50.245    0.417    0.058            58.333
   Whisper-Uzbek               3.480            2.396         -270.027       2913.887    157.257    0.333    0.052            66.667

🏆 RECOMMENDATIONS
🚀 Fastest Model: whisper-small-uz
💾 Smallest Model: whisper-small-uz
🧠 Most Memory Efficient: Whisper-Uzbek
🎯 Most Accurate: Whisper-Uzbek
⚖️  Best Balance (Speed + Accuracy): whisper-small-uz


In [19]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "jmshd/whisper-uz",
    "ShakhzoDavronov/whisper-large-lora-uz"]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)

Using device: cuda

🏁 STARTING COMPARISON OF 2 MODELS
📁 Audio files: 2

🔄 EVALUATING MODEL: jmshd/whisper-uz
📥 Loading model and processor...
✅ Model loaded in 3.55s
📊 Model size: 276.9 MB
💾 Memory usage: 1.2 MB

🎵 Processing audio 1/2: s_1796.wav
📝 Transcription: 'Quyidagilar sug'urta hodisasi hisoblanmaydi. Mehnat shartomasini xodimning tashabbusi bilan bekor qilinganda, xodimning o'z mehnat vazifalarini muntazam ravishda buzganligi.'
⏱️  Inference time: 0.87s
🚀 Speed ratio: 12.6x real-time
📊 WER: 0.333 (33.3%)
📊 CER: 0.040 (4.0%)

🎵 Processing audio 2/2: test2.wav
📝 Transcription: 'Tasvirsiya ikki, model aniqligini tekshiramiz, mening ismin bo'lmuhammad.'
⏱️  Inference time: 0.37s
🚀 Speed ratio: 14.6x real-time
📊 WER: 0.556 (55.6%)
📊 CER: 0.149 (14.9%)

📈 SUMMARY FOR jmshd/whisper-uz:
   Average inference time: 0.62s
   Average speed ratio: 13.6x
   Average WER: 0.444 (44.4%)
   Average CER: 0.094 (9.4%)

🔄 EVALUATING MODEL: ShakhzoDavronov/whisper-large-lora-uz
📥 Loading model and 


📊 DETAILED COMPARISON TABLE
     Model  Avg_Inference_Time  Avg_Speed_Ratio  Memory_Usage_MB  Model_Size_MB  Load_Time  Avg_WER  Avg_CER  Accuracy_Percent
whisper-uz               0.619           13.631            1.172        276.924      3.549    0.444    0.094            55.556

🏆 RECOMMENDATIONS


In [None]:
models_to_compare = [
    # 👈 ADD YOUR MODELS HERE!
    "jmshd/whisper-uz",
    "ShakhzoDavronov/whisper-large-lora-uz"]

audio_files = [
    "/content/s_1796.wav",                          # 👈 ADD YOUR AUDIO FILES!
    "/content/test2.wav"
]

reference_texts = [                                 # 👈 OPTIONAL REFERENCES
    "Quyidagilar Sug‘urta hodisasi hisoblanmaydi: mehnat shartnomasini xodimning tashabbusi bilan bekor qilinganda; xodimning o‘z mehnat vazifalarini muntazam ravishda buzganligi.",
    "Test versiya ikki, model aniqligini tekshiramiz, menning ismim Nurmuhammad"
]

results, comparison_df, evaluator = compare_multiple_models(
    models_to_compare,
    audio_files,
    reference_texts  # Can be None if you don't have references
)