# Optimized Atlas Model Inference with Checkpointing

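This notebook demonstrates how to run an Atlas-Chat model efficiently on Kaggle's 2× T4 GPUs, using 4-bit quantization, checkpointing, and batch processing to summarize multiple texts. It was written with Atlas-Chat-27B in mind; the run recorded below uses the smaller Atlas-Chat-9B (see Section 7) for faster inference.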

## 1. Install Required Packages

In [1]:
!pip install transformers accelerate tqdm



In [2]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3
Note: you may need to restart the kernel to use updated packages.


## 2. Import Libraries

In [3]:
import os
import json
import time
import torch
import gc
import csv
import psutil
from tqdm import tqdm
from pathlib import Path
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## 3. Memory Management Functions

In [4]:
def clear_gpu_memory():
    """Run the garbage collector, then release cached CUDA memory if a GPU is present."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    # Release unoccupied cached blocks, then wait for queued GPU work to finish
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    
def print_memory_usage():
    """Print the resident memory of this process and per-GPU CUDA memory, all in MB."""
    bytes_per_mb = 1024 * 1024

    # Host side: resident set size of the current process
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    cpu_mem = rss_bytes / bytes_per_mb
    print(f"CPU Memory Usage: {cpu_mem:.2f} MB")

    # Device side: nothing to report when CUDA is unavailable
    if not torch.cuda.is_available():
        return
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / bytes_per_mb
        reserved = torch.cuda.memory_reserved(i) / bytes_per_mb
        print(f"GPU {i} Memory - Allocated: {allocated:.2f} MB, Reserved: {reserved:.2f} MB")

## 4. Define Helper Functions

In [5]:
from accelerate import Accelerator

def setup_model(model_id):
    """Load a 4-bit quantized causal LM and its tokenizer, adapting to the visible GPUs.

    The compute dtype and attention backend are chosen from the hardware that is
    actually present instead of being hard-coded: bfloat16 and FlashAttention-2
    are requested only when every visible GPU reports compute capability 8.0 or
    higher (T4 cards report 7.5), and FlashAttention-2 additionally requires the
    `flash_attn` package to be importable. Otherwise float16 and the library's
    default attention implementation are used, so loading does not fail on T4s.

    Args:
        model_id: Hugging Face Hub id or local path of the model to load.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    import importlib.util

    # Clear memory before loading the model
    clear_gpu_memory()

    gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
    # device_map="auto" may spread layers over all GPUs, so every one must qualify
    ampere_or_newer = gpu_count > 0 and all(
        torch.cuda.get_device_capability(i)[0] >= 8 for i in range(gpu_count)
    )
    compute_dtype = torch.bfloat16 if ampere_or_newer else torch.float16

    # Ensure we use the most memory-efficient settings
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=compute_dtype)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    model_kwargs = {
        "quantization_config": quantization_config,
        "device_map": "auto",         # Automatically use available GPUs
    }
    # Only request FlashAttention-2 when both the hardware and the package allow it;
    # requesting it unconditionally makes from_pretrained raise on unsupported setups.
    if ampere_or_newer and importlib.util.find_spec("flash_attn") is not None:
        model_kwargs["attn_implementation"] = "flash_attention_2"

    # Load model with efficient settings
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

    # NOTE(review): kept to match the inference cell; for single-process inference
    # this call is not expected to change either object — confirm before removing.
    accelerator = Accelerator()
    model, tokenizer = accelerator.prepare(model, tokenizer)

    return model, tokenizer

def load_texts(file_path):
    """
    Load the source texts from a two-column, space-delimited CSV file.

    Each non-blank row must hold exactly two double-quoted fields separated by a
    single space, for example:
    "text with possible spaces" "summary with possible spaces"

    Only the first field (the text) is kept; the summary is checked for presence
    but discarded. Blank lines are skipped instead of being reported as
    malformed rows.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A list with the text field of every row, in file order.

    Raises:
        ValueError: If a non-blank row does not contain exactly two fields.
    """
    texts = []
    # newline='' lets the csv module handle line breaks inside quoted fields itself
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ', quotechar='"')
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            if len(row) != 2:
                raise ValueError(f"Expected 2 columns per row but got {len(row)} columns: {row}")
            texts.append(row[0])
    return texts


def load_checkpoint(checkpoint_path):
    """Return `(completed_indices, results)` from a checkpoint file, or empty ones if it is absent."""
    # No checkpoint on disk: start from scratch
    if not os.path.exists(checkpoint_path):
        return set(), {}

    with open(checkpoint_path, 'r', encoding='utf-8') as handle:
        state = json.load(handle)

    # Indices are stored as a JSON list; callers work with a set
    done = set(state['completed_indices'])
    return done, state['results']

def save_checkpoint(checkpoint_path, completed_indices, results):
    """Atomically write the current progress to `checkpoint_path` as UTF-8 JSON.

    The data is first written to a sibling `<checkpoint_path>.tmp` file and then
    moved over the target with `os.replace`, so an interrupted or failed write
    never leaves a truncated checkpoint behind; the previous checkpoint stays
    intact until the new one is complete.

    Args:
        checkpoint_path: Destination file (str or path-like).
        completed_indices: Iterable of indices already processed.
        results: JSON-serializable mapping of results collected so far.

    Raises:
        TypeError: If `results` or the indices are not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    payload = {
        'completed_indices': list(completed_indices),
        'results': results
    }
    tmp_path = f"{os.fspath(checkpoint_path)}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, checkpoint_path)
    finally:
        # Only still present when the write or the replace failed
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def save_batch_results(output_path, results):
    """Write `results` to `output_path` as indented UTF-8 JSON, overwriting any existing file."""
    # Keep non-ASCII text readable in the file instead of \u-escaping it
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(output_path, mode='w', encoding='utf-8') as out_file:
        json.dump(results, out_file, **dump_options)

## 5. Main Processing Function with Memory Optimization

In [6]:
def _build_summary_messages(text):
    """Return the single-turn chat message list asking for a Darija summary of `text`."""
    instruction = (
        "Summarize the following text in Moroccan Darija in a concise and coherent way "
        "using same style as input, summary should be 10% the size of the input text:"
    )
    return [{"role": "user", "content": f"{instruction}\n\n{text}\n"}]


def _find_checkpoint_path(output_dir, timestamp):
    """Return the newest leftover checkpoint in `output_dir`, or a fresh timestamped path."""
    # Timestamps are zero-padded, so lexicographic order is chronological order
    existing = sorted(Path(output_dir).glob("checkpoint_*.json"))
    if existing:
        return str(existing[-1])
    return os.path.join(output_dir, f"checkpoint_{timestamp}.json")


def _restore_progress(checkpoint_path, texts):
    """Load checkpointed progress, keeping only entries that still match `texts`."""
    completed_indices, results = load_checkpoint(checkpoint_path)
    # Guard against a checkpoint left behind by a run over different input
    matching = {
        idx for idx in completed_indices
        if isinstance(idx, int)
        and 0 <= idx < len(texts)
        and isinstance(results.get(str(idx)), dict)
        and results[str(idx)].get("input") == texts[idx]
    }
    if completed_indices:
        print(f"Resuming from {checkpoint_path}: {len(matching)} of "
              f"{len(completed_indices)} checkpointed texts match the current input")
    return matching, {str(idx): results[str(idx)] for idx in sorted(matching)}


def _generate_summary(model, tokenizer, text, generation_config):
    """Generate the model's reply for one text and return it as plain, stripped text."""
    with torch.inference_mode():
        inputs = tokenizer.apply_chat_template(
            _build_summary_messages(text),
            return_tensors="pt",
            return_dict=True,
            add_generation_prompt=True
        ).to(model.device)

        outputs = model.generate(**inputs, **generation_config)

        # Decode only the tokens generated after the prompt, so the result does not
        # depend on a model-specific turn marker and carries no special tokens.
        prompt_length = inputs["input_ids"].shape[-1]
        summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Clear memory after each text to prevent OOM
    del inputs, outputs
    clear_gpu_memory()
    return summary


def process_texts(model, tokenizer, texts, output_dir, batch_size, max_new_tokens, temperature):
    """Summarize all texts with batching, resumable checkpointing, and memory cleanup.

    Texts are generated one at a time; `batch_size` controls how often progress is
    written to disk. If `output_dir` already contains a `checkpoint_*.json` file
    from an interrupted run, its entries that still match `texts` are reused and
    only the remaining texts are generated. The checkpoint is deleted once every
    text has been processed.

    Args:
        model: Causal LM exposing `generate`, `device` and `config`.
        tokenizer: Tokenizer exposing `apply_chat_template` and `decode`.
        texts: Sequence of input strings.
        output_dir: Directory for the checkpoint and the results file (created if missing).
        batch_size: Number of texts between two checkpoint writes; must be >= 1.
        max_new_tokens: Upper bound on generated tokens per text.
        temperature: Sampling temperature; values <= 0 select greedy decoding.

    Returns:
        Dict mapping the stringified text index to `{"input": ..., "output": ...}`.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        Exception: Any error raised during generation is re-raised after the
            progress made so far has been checkpointed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Reuse a leftover checkpoint when there is one; a path that always embeds the
    # current timestamp could never be found again by a restarted run.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    checkpoint_path = _find_checkpoint_path(output_dir, timestamp)
    final_output_path = os.path.join(output_dir, f"results_{timestamp}.json")

    # Load checkpoint if exists
    completed_indices, results = _restore_progress(checkpoint_path, texts)

    # `temperature` is only meaningful when sampling, so it is left out for greedy decoding
    do_sample = temperature > 0
    generation_config = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "use_cache": True,  # Enable KV caching
    }
    if do_sample:
        generation_config["temperature"] = temperature

    # Report which attention backend the loaded model uses. Transformers keeps the
    # selected implementation in `config._attn_implementation`; the public-looking
    # name is checked as a fallback.
    attn_implementation = (
        getattr(model.config, "_attn_implementation", None)
        or getattr(model.config, "attn_implementation", None)
    )
    if attn_implementation == "flash_attention_2":
        print("Using Flash Attention for faster inference")
    else:
        print("Flash Attention not available, using standard attention")

    try:
        # Process texts in batches
        for i in tqdm(range(0, len(texts), batch_size)):
            # Clear memory at the start of each batch
            clear_gpu_memory()

            batch_indices = range(i, min(i + batch_size, len(texts)))

            # Skip already processed texts
            remaining_indices = [idx for idx in batch_indices if idx not in completed_indices]
            if not remaining_indices:
                continue

            # Record each result as soon as it exists, so a failure in the middle of
            # a batch still checkpoints the texts that were already summarized.
            for idx in remaining_indices:
                results[str(idx)] = {
                    "input": texts[idx],
                    "output": _generate_summary(model, tokenizer, texts[idx], generation_config)
                }
                completed_indices.add(idx)

            # Save batch results and checkpoint
            save_checkpoint(checkpoint_path, completed_indices, results)
            save_batch_results(final_output_path, results)

            # Print memory usage after each batch
            if (i // batch_size) % 5 == 0:  # Every 5 batches
                print_memory_usage()

    except Exception as e:
        # In case of failure, save current progress
        print(f"Error encountered: {e}")
        save_checkpoint(checkpoint_path, completed_indices, results)
        raise

    # Save final results
    save_batch_results(final_output_path, results)
    print(f"Processing complete. Results saved to {final_output_path}")

    # Remove checkpoint file after successful completion
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

    return results


def process_texts_in_batches(texts, batch_size=8, model=None, tokenizer=None, max_new_tokens=64):
    """Summarize `texts` with a short "Summarize: ..." prompt and greedy decoding.

    A lightweight alternative to `process_texts` without checkpointing. Texts are
    generated one at a time; `batch_size` only sets the progress-bar granularity.

    Args:
        texts: Sequence of input strings.
        batch_size: Number of texts per progress-bar step.
        model: Causal LM to use. Defaults to the notebook-level `model`.
        tokenizer: Tokenizer to use. Defaults to the notebook-level `tokenizer`.
        max_new_tokens: Upper bound on generated tokens per text.

    Returns:
        List of generated summaries, in the same order as `texts`.

    Raises:
        NameError: If no model/tokenizer is passed and none is defined at notebook level.
    """
    # Fall back to the notebook-level objects so existing two-argument calls keep working
    if model is None:
        model = globals().get("model")
    if tokenizer is None:
        tokenizer = globals().get("tokenizer")
    if model is None or tokenizer is None:
        raise NameError(
            "process_texts_in_batches needs a model and a tokenizer: pass them "
            "explicitly or define `model` and `tokenizer` at notebook level"
        )

    results = []

    # Process in batches with progress tracking
    for i in tqdm(range(0, len(texts), batch_size)):
        for text in texts[i:i + batch_size]:
            messages = [{"role": "user", "content": f"Summarize: {text}"}]
            inputs = tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                return_dict=True,
                add_generation_prompt=True
            ).to(model.device)

            # Generate right away instead of first collecting every prompt of the
            # batch on the GPU; greedy decoding takes no temperature.
            with torch.no_grad():  # Disable gradient calculation for inference
                output = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                    use_cache=True
                )

            # Keep only the newly generated tokens, without special tokens
            prompt_length = inputs["input_ids"].shape[-1]
            results.append(
                tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
            )
            del inputs, output

    return results

## 6. Create Sample Input Data

## 7. Select Smaller Model for Faster Inference

In [7]:
# Configuration - using smaller model for faster inference
# model_id = "MBZUAI-Paris/Atlas-Chat-27B"  # Original large model
model_id = "MBZUAI-Paris/Atlas-Chat-9B"  # Smaller, faster model
output_dir = "/kaggle/working/results"  # Checkpoint and results files are written here
batch_size = 10                         # Texts per checkpoint write; process_texts still generates them one at a time
max_new_tokens = 256  # Upper bound on generated tokens per summary
temperature = 0.0  # process_texts samples only when temperature > 0, so 0.0 means no sampling

# Check available GPU memory before loading model
print("Initial memory state:")
print_memory_usage()

Initial memory state:
CPU Memory Usage: 543.78 MB
GPU 0 Memory - Allocated: 0.00 MB, Reserved: 0.00 MB
GPU 1 Memory - Allocated: 0.00 MB, Reserved: 0.00 MB


## 8. Run Inference with Optimized Settings

In [8]:
# Start from a clean allocator state before loading the model
clear_gpu_memory()

# To further speed up inference, verify/enable TF32 precision on Ampere or newer GPUs
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    print("TF32 is available. Enabling for faster inference...")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

# Load model and tokenizer with optimized settings
print("Loading model and tokenizer...")
start_time = time.time()

# 1. Set up 4-bit quantization with float16 compute
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

# 2. Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"        # Automatically use available GPUs
)

accelerator = Accelerator()
model, tokenizer = accelerator.prepare(model, tokenizer)



print(f"Model loaded in {time.time() - start_time:.2f} seconds")

# Print memory usage after model loading
print("Memory usage after model loading:")
print_memory_usage()

# Load texts
input_file = '/kaggle/input/marsum/MArSum1_train.txt'
print(f"Loading texts from {input_file}...")
# NOTE(review): the slice starts at 1, so row 0 is skipped — presumably a header
# row; confirm against the dataset.
texts = load_texts(input_file)[1:2500]
print(f"Loaded {len(texts)} texts for processing")

# Process texts with optimized settings
start_time = time.time()
results = process_texts(
    model, 
    tokenizer, 
    texts, 
    output_dir, 
    batch_size, 
    max_new_tokens, 
    temperature
)
elapsed_time = time.time() - start_time
print(f"Processed {len(results)} texts successfully")
print(f"Total inference time: {elapsed_time:.2f} seconds")
# Avoid a ZeroDivisionError when nothing was processed (e.g. an empty input file)
if results:
    print(f"Average time per text: {elapsed_time / len(results):.2f} seconds")

# Clean up to free memory
# The model and tokenizer stay loaded (the `del` below is commented out), so this
# only releases cached memory that is no longer referenced.
#del model, tokenizer
clear_gpu_memory()
print("Final memory state after cleanup:")
print_memory_usage()

Loading model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

Model loaded in 181.54 seconds
Memory usage after model loading:
CPU Memory Usage: 3613.70 MB
GPU 0 Memory - Allocated: 3566.84 MB, Reserved: 3656.00 MB
GPU 1 Memory - Allocated: 2671.04 MB, Reserved: 2844.00 MB
Loading texts from /kaggle/input/marsum/MArSum1_train.txt...
Loaded 2499 texts for processing
Flash Attention not available, using standard attention


The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
  0%|          | 1/250 [02:00<8:19:31, 120.37s/it]

CPU Memory Usage: 3697.45 MB
GPU 0 Memory - Allocated: 3836.21 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3063.34 MB, Reserved: 3364.00 MB


  2%|▏         | 6/250 [10:55<7:20:31, 108.33s/it]

CPU Memory Usage: 3697.45 MB
GPU 0 Memory - Allocated: 3836.21 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3063.34 MB, Reserved: 3364.00 MB


  4%|▍         | 11/250 [21:25<7:46:58, 117.23s/it]

CPU Memory Usage: 3697.45 MB
GPU 0 Memory - Allocated: 3836.21 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3063.34 MB, Reserved: 3364.00 MB


  6%|▋         | 16/250 [30:58<7:25:58, 114.35s/it]

CPU Memory Usage: 3697.45 MB
GPU 0 Memory - Allocated: 3836.21 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3063.34 MB, Reserved: 3364.00 MB


  8%|▊         | 21/250 [40:14<6:36:10, 103.80s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3836.21 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3063.34 MB, Reserved: 3364.00 MB


 10%|█         | 26/250 [50:14<7:33:13, 121.40s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3841.26 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3070.76 MB, Reserved: 3324.00 MB


 12%|█▏        | 31/250 [1:02:16<8:27:30, 139.04s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3852.02 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3086.58 MB, Reserved: 3364.00 MB


 14%|█▍        | 36/250 [1:12:21<7:25:02, 124.78s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3852.02 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3086.58 MB, Reserved: 3364.00 MB


 16%|█▋        | 41/250 [1:20:48<5:56:55, 102.47s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3878.98 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3126.23 MB, Reserved: 3344.00 MB


 18%|█▊        | 46/250 [1:30:01<6:12:37, 109.59s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3878.98 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3126.23 MB, Reserved: 3344.00 MB


 20%|██        | 51/250 [1:40:29<6:53:16, 124.61s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3878.98 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3126.23 MB, Reserved: 3344.00 MB


 22%|██▏       | 56/250 [1:49:34<6:11:22, 114.86s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3878.98 MB, Reserved: 4036.00 MB
GPU 1 Memory - Allocated: 3126.23 MB, Reserved: 3344.00 MB


 24%|██▍       | 61/250 [1:59:15<6:14:39, 118.94s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3885.62 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3136.00 MB, Reserved: 3364.00 MB


 26%|██▋       | 66/250 [2:08:45<5:43:23, 111.97s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 28%|██▊       | 71/250 [2:17:09<5:10:42, 104.15s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 30%|███       | 76/250 [2:26:26<5:24:39, 111.95s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 32%|███▏      | 81/250 [2:35:54<5:35:12, 119.01s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 34%|███▍      | 86/250 [2:45:09<4:58:31, 109.21s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 36%|███▋      | 91/250 [2:54:09<4:44:38, 107.41s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 38%|███▊      | 96/250 [3:04:13<5:27:51, 127.74s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 40%|████      | 101/250 [3:14:17<4:48:02, 115.99s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 42%|████▏     | 106/250 [3:24:47<4:54:01, 122.51s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 44%|████▍     | 111/250 [3:33:23<4:15:47, 110.42s/it]

CPU Memory Usage: 3697.57 MB
GPU 0 Memory - Allocated: 3889.47 MB, Reserved: 4016.00 MB
GPU 1 Memory - Allocated: 3141.66 MB, Reserved: 3344.00 MB


 46%|████▋     | 116/250 [3:44:51<5:17:36, 142.22s/it]

CPU Memory Usage: 3697.70 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 48%|████▊     | 121/250 [3:57:18<5:22:50, 150.16s/it]

CPU Memory Usage: 3697.70 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 50%|█████     | 126/250 [4:06:36<3:59:56, 116.10s/it]

CPU Memory Usage: 3697.70 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 52%|█████▏    | 131/250 [4:17:57<4:34:28, 138.39s/it]

CPU Memory Usage: 3697.70 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 54%|█████▍    | 136/250 [4:28:55<4:23:38, 138.76s/it]

CPU Memory Usage: 3697.70 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 56%|█████▋    | 141/250 [4:40:01<4:04:10, 134.41s/it]

CPU Memory Usage: 3697.82 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 58%|█████▊    | 146/250 [4:51:36<4:07:03, 142.54s/it]

CPU Memory Usage: 3697.82 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 60%|██████    | 151/250 [5:05:02<4:25:51, 161.13s/it]

CPU Memory Usage: 3697.82 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 62%|██████▏   | 156/250 [5:16:09<3:36:35, 138.25s/it]

CPU Memory Usage: 3697.82 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 64%|██████▍   | 161/250 [5:27:54<3:33:17, 143.79s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 66%|██████▋   | 166/250 [5:39:41<3:13:25, 138.16s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 68%|██████▊   | 171/250 [5:51:41<3:06:58, 142.00s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 70%|███████   | 176/250 [6:02:57<2:50:27, 138.20s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 72%|███████▏  | 181/250 [6:12:55<2:28:40, 129.28s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 74%|███████▍  | 186/250 [6:24:12<2:23:07, 134.19s/it]

CPU Memory Usage: 3697.95 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 76%|███████▋  | 191/250 [6:36:41<2:20:00, 142.38s/it]

CPU Memory Usage: 3698.07 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 78%|███████▊  | 196/250 [6:49:00<2:12:23, 147.11s/it]

CPU Memory Usage: 3698.20 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 80%|████████  | 201/250 [7:00:46<2:01:45, 149.09s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 82%|████████▏ | 206/250 [7:12:38<1:40:59, 137.72s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 84%|████████▍ | 211/250 [7:23:18<1:27:28, 134.58s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 86%|████████▋ | 216/250 [7:35:24<1:22:52, 146.25s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 88%|████████▊ | 221/250 [7:47:28<1:09:50, 144.48s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 90%|█████████ | 226/250 [7:57:09<49:49, 124.55s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 92%|█████████▏| 231/250 [8:07:44<41:12, 130.13s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 94%|█████████▍| 236/250 [8:17:51<29:53, 128.12s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 96%|█████████▋| 241/250 [8:30:56<22:01, 146.78s/it]

CPU Memory Usage: 3698.32 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


 98%|█████████▊| 246/250 [8:44:50<10:39, 159.96s/it]

CPU Memory Usage: 3698.45 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


100%|██████████| 250/250 [8:52:33<00:00, 127.81s/it]


Processing complete. Results saved to /kaggle/working/results/results_20250304_120036.json
Processed 2499 texts successfully
Total inference time: 31953.68 seconds
Average time per text: 12.79 seconds
Final memory state after cleanup:
CPU Memory Usage: 3698.45 MB
GPU 0 Memory - Allocated: 4131.59 MB, Reserved: 4252.00 MB
GPU 1 Memory - Allocated: 3499.68 MB, Reserved: 3716.00 MB


## 9. View Sample Results

In [9]:
# Preview up to three results to sanity-check the summaries against their inputs
sample_count = min(3, len(results))
print(f"\nSample of {sample_count} results:")
for i in range(sample_count):
    idx = str(i)  # results are keyed by the stringified text index
    if idx not in results:
        continue
    # Only the first 100 characters of each side are shown
    print(f"\nInput {i+1}:\n{results[idx]['input'][:100]}...")
    print(f"Output {i+1}:\n{results[idx]['output'][:100]}...")


Sample of 3 results:

Input 1:
علمت “كود”، أن الجدارمية التابعين لسرية الدرك الملكي 2 مارس شدو عضو جماعي كيحمل صفة النائب الثالث لر...
Output 1:

عضو جماعي حصل على 2000 درهم رشوة من عند شكايا فكازا. الجدارمية شدوه ونسقو العملية مع النيابة العامة...

Input 2:
ابتداء من اليوم، جماعة الدار البيضاء حددات تعريفة جديدة للوقوف فالباركينكَات العمومية، لتفادي الزياد...
Output 2:

جماعة كازا دارت تعريفة جديدة للباركينكَات العمومية باش تمنع الزيادات غير القانونية. دابا الطوموبيلا...

Input 3:
أعلن الرئيس الروسي، فلاديمير بوتين، اليوم الثلاثاء، عن تسجيل أول لقاح ضد فيروس كورونا في العالم. وقا...
Output 3:

الرئيس الروسي بوتين علن على تسجيل أول لقاح ضد كورونا فالعالم. قال أن هاد اللقاح داز على جميع الاختب...
