In [1]:
import kagglehub

# Download the latest version of the "nirugidla/sample-data" dataset through
# kagglehub; `path` holds whatever location kagglehub returns for the files.
path = kagglehub.dataset_download("nirugidla/sample-data")

# Echo the location so later cells can be pointed at the right directory.
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /kaggle/input/sample-data


In [5]:
import kagglehub

# Download the latest version of the "google/gemma-2/transformers/gemma-2-27b"
# model through kagglehub.
# NOTE(review): this rebinds `path`, which the dataset cell also assigns, so the
# dataset location is no longer reachable through that name after this cell runs.
path = kagglehub.model_download("google/gemma-2/transformers/gemma-2-27b")

# Echo the location of the model files.
print("Path to model files:", path)

Path to model files: /kaggle/input/gemma-2/transformers/gemma-2-27b/2


In [2]:
import numpy as np
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from dataclasses import dataclass
from typing import List, Tuple
import logging
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

@dataclass
class OptimizationConfig:
    """Tunable settings for the word-order optimization.

    The fields are grouped the way the original comments group them: a
    GA-PSO section, an SA section and general parameters.
    NOTE(review): confirm against the consumer of this config which of these
    fields are actually read; the grouping alone does not guarantee use.
    """
    # GA-PSO configuration
    population_size: int = 20  # Reduced for speed
    mutation_rate: float = 0.2  # Increased for more exploration
    max_generations: int = 30   # Reduced for speed
    
    # SA configuration
    initial_temperature: float = 50.0
    cooling_rate: float = 0.85
    max_sa_iterations: int = 50  # Caps the number of iterations
    
    # General parameters
    timeout_seconds: int = 300  # 5 minutes max per text
    perplexity_threshold: float = 10.0  # Acceptance threshold
    batch_size: int = 8  # For the perplexity computation

class OptimizedTextProcessor:
    """Reorders the words of a text to lower its perplexity under a causal LM.

    The search repeatedly swaps two words in each candidate ordering, scores
    the resulting orderings in batches, and keeps the lowest-perplexity
    candidates for the next round. It stops on a generation limit, a
    wall-clock timeout, or as soon as a candidate falls below the configured
    perplexity threshold.
    """

    def __init__(self, model_path: str, config: "OptimizationConfig"):
        """Load the tokenizer and the model from ``model_path``.

        Args:
            model_path: Location handed to ``from_pretrained`` for both the
                tokenizer and the causal language model.
            config: Search settings. This class reads ``population_size``,
                ``max_generations``, ``timeout_seconds``,
                ``perplexity_threshold`` and ``batch_size``. The annotation
                is quoted so the class does not require the config class to
                be defined before it.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
        self.model.eval()

    @torch.no_grad()
    def batch_calculate_perplexity(self, sequences: List[str]) -> List[float]:
        """Return one perplexity per input sequence.

        The loss is computed per sequence from the logits rather than read
        from the model's aggregated ``loss`` output, so the returned list
        always lines up with ``sequences`` and padding tokens do not
        contribute to any score.

        Args:
            sequences: Texts to score; each is truncated to 512 tokens.

        Returns:
            A list with ``len(sequences)`` floats. A sequence with no
            scorable token pair gets ``inf``. If anything raises, the error
            is logged and every entry is ``inf``.
        """
        if not sequences:
            return []

        try:
            # Batch tokenization
            encoded = self.tokenizer(
                sequences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            encoded = {k: v.to(self.model.device) for k, v in encoded.items()}

            logits = self.model(**encoded).logits

            # Position t predicts token t + 1. Upcast to float32 so the
            # log-softmax is not computed in reduced precision.
            shift_logits = logits[:, :-1, :].float()
            targets = encoded["input_ids"][:, 1:].to(shift_logits.device)
            attention = encoded["attention_mask"].to(shift_logits.device)

            # Score a (context, target) pair only when both tokens are real;
            # this is correct for left- and right-padded batches alike.
            pair_mask = (attention[:, 1:] * attention[:, :-1]).to(shift_logits.dtype)

            token_nll = torch.nn.functional.cross_entropy(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                targets.reshape(-1),
                reduction="none",
            ).view(targets.shape)

            pair_counts = pair_mask.sum(dim=1)
            mean_nll = (token_nll * pair_mask).sum(dim=1) / pair_counts.clamp(min=1.0)
            perplexities = torch.exp(mean_nll)

            # A sequence without any scorable pair must never look "perfect".
            perplexities = torch.where(
                pair_counts > 0,
                perplexities,
                torch.full_like(perplexities, float('inf')),
            )
            return perplexities.cpu().tolist()

        except Exception as e:
            logging.error(f"Batch perplexity calculation error: {str(e)}")
            return [float('inf')] * len(sequences)

    @staticmethod
    def _swap_two(sequence: List[str]) -> List[str]:
        """Return a copy of ``sequence`` with two random positions swapped."""
        mutant = sequence[:]
        if len(mutant) < 2:
            return mutant
        first, second = random.sample(range(len(mutant)), 2)
        mutant[first], mutant[second] = mutant[second], mutant[first]
        return mutant

    def _score_candidates(self, candidates: List[List[str]]) -> List[float]:
        """Score word lists in chunks of ``batch_size``, one value per candidate."""
        step = max(1, self.config.batch_size)
        scores: List[float] = []
        for start in range(0, len(candidates), step):
            chunk = [" ".join(candidate) for candidate in candidates[start:start + step]]
            chunk_scores = list(self.batch_calculate_perplexity(chunk))
            if len(chunk_scores) != len(chunk):
                # Keep scores aligned with candidates even if the scorer misbehaves.
                logging.error(
                    f"Expected {len(chunk)} perplexities, got {len(chunk_scores)}"
                )
                chunk_scores = [float('inf')] * len(chunk)
            # NaN compares false with everything; treat it as "worst possible".
            scores.extend(float('inf') if s != s else s for s in chunk_scores)
        return scores

    def optimize_text(self, text: str) -> str:
        """Search for a lower-perplexity ordering of the words in ``text``.

        Args:
            text: Whitespace-separated words to reorder.

        Returns:
            ``text`` unchanged when it has at most one word; otherwise the
            best ordering found, joined with single spaces. The original
            ordering is scored first, so the result never has a higher
            perplexity than the input.
        """
        words = text.split()
        if len(words) <= 1:
            return text

        cfg = self.config
        start_time = time.time()

        def time_exceeded() -> bool:
            return time.time() - start_time > cfg.timeout_seconds

        # Score the input ordering so it competes with every mutant.
        baseline = self._score_candidates([words])[0]
        best_sequence, best_perplexity = words, baseline
        if best_perplexity < cfg.perplexity_threshold:
            return " ".join(best_sequence)

        # Candidates are stored as (perplexity, words). Word lists are never
        # mutated in place, so sharing `words` between entries is safe.
        population = [(baseline, words) for _ in range(cfg.population_size)]

        for generation in range(cfg.max_generations):
            if time_exceeded() or not population:
                break

            # One swap mutation per parent, scored in batches.
            children = [self._swap_two(seq) for _, seq in population]
            child_scores = self._score_candidates(children)

            # Track the best ordering seen so far, with early exit.
            for perp, seq in zip(child_scores, children):
                if perp < best_perplexity:
                    best_sequence, best_perplexity = seq, perp
                    if best_perplexity < cfg.perplexity_threshold:
                        return " ".join(best_sequence)

            # Elitist selection: parents and children compete, sorted on the
            # score only so ties never fall back to comparing word lists.
            ranked = sorted(
                population + list(zip(child_scores, children)),
                key=lambda scored: scored[0],
            )
            population = ranked[:cfg.population_size]

            # Diversity injection: periodically reseed the worst slots with
            # the original ordering (its score is already known).
            if generation % 5 == 0:
                reseed = min(2, len(population))
                if reseed:
                    population[-reseed:] = [(baseline, words)] * reseed

        return " ".join(best_sequence)

def optimize_dataset(data_path: str, model_path: str, output_path: str):
    """Optimize every row of a CSV and write the results to ``output_path``.

    Args:
        data_path: CSV file read with pandas; rows are accessed through
            their ``id`` and ``text`` columns.
        model_path: Location of the model, forwarded to the text processor.
        output_path: CSV file that receives the ``id``/``text`` results. It
            is rewritten whenever the number of collected rows is a multiple
            of ten, and once more at the end.

    A row whose processing raises is logged and emitted with its original
    text.
    """
    processor = OptimizedTextProcessor(model_path, OptimizationConfig())

    # Load the input rows
    frame = pd.read_csv(data_path)
    records = []

    def write_snapshot():
        # Dump everything collected so far, replacing the previous file.
        pd.DataFrame(records).to_csv(output_path, index=False)

    # Walk the rows behind a progress bar
    with tqdm(total=len(frame), desc="Optimizing texts") as progress:
        for idx, entry in frame.iterrows():
            try:
                rewritten = processor.optimize_text(entry['text'])
                records.append({
                    "id": entry['id'],
                    "text": rewritten
                })
            except Exception as e:
                logging.error(f"Error processing row {idx}: {str(e)}")
                records.append({
                    "id": entry['id'],
                    "text": entry['text']
                })
            progress.update(1)

            # Intermediate checkpoint every ten collected rows
            if not len(records) % 10:
                write_snapshot()

    # Final save
    write_snapshot()
    logging.info(f"Optimization completed. Results saved to {output_path}")

if __name__ == "__main__":
    # Input CSV, model directory and output CSV for this run, passed by keyword.
    run_arguments = {
        "data_path": "/kaggle/input/sample-data/sample_submission.csv",
        "model_path": "/kaggle/input/gemma-2/transformers/gemma-2-27b/2/",
        "output_path": '/kaggle/working/optimized_submission.csv',
    }
    optimize_dataset(**run_arguments)

Loading checkpoint shards: 100%|██████████| 24/24 [00:27<00:00,  1.16s/it]
Optimizing texts: 100%|██████████| 6/6 [28:21<00:00, 283.53s/it]
