In [1]:
import kagglehub

# Download the latest version of the "nirugidla/sample-data" Kaggle dataset.
# NOTE(review): the model-download cell further down assigns to the same
# notebook-global name `path`, so `path` only refers to the dataset until
# that cell is executed.
path = kagglehub.dataset_download("nirugidla/sample-data")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /kaggle/input/sample-data


In [5]:
import kagglehub

# Download the latest version of the Gemma-2 27B (transformers) model files.
# NOTE(review): this overwrites the notebook-global `path` that the dataset
# cell assigned earlier; the optimisation entry point below passes its own
# literal paths and does not read this variable.
path = kagglehub.model_download("google/gemma-2/transformers/gemma-2-27b")

print("Path to model files:", path)

Path to model files: /kaggle/input/gemma-2/transformers/gemma-2-27b/2


In [4]:
import numpy as np
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
from dataclasses import dataclass
from typing import List, Tuple
import logging
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

@dataclass
class OptimizationConfig:
    """Tunable settings for the word-order perplexity optimisation.

    Of these fields, the code visible in this notebook reads only
    ``population_size``, ``max_generations``, ``timeout_seconds`` and
    ``batch_size``. The remaining fields are declared but not referenced
    by the visible code.
    """

    # GA-PSO settings
    population_size: int = 20  # number of candidate word orders kept per generation
    mutation_rate: float = 0.2  # NOTE(review): not referenced by the visible code
    max_generations: int = 30   # upper bound on mutation/evaluation rounds per text
    
    # SA settings (presumably simulated annealing - TODO confirm);
    # none of these are referenced by the visible code.
    initial_temperature: float = 50.0
    cooling_rate: float = 0.85
    max_sa_iterations: int = 50  
    
    # General parameters
    timeout_seconds: int = 300  # wall-clock budget per text, checked once per generation
    perplexity_threshold: float = 10.0  # NOTE(review): not referenced by the visible code
    batch_size: int = 8  # number of candidate texts scored per model call

class OptimizedTextProcessor:
    """Search for a word order of a text that lowers its perplexity under a causal LM.

    The processor owns a tokenizer and a causal language model, scores
    candidate texts in batches, and runs a swap-mutation search over the
    words of each input text.
    """

    def __init__(self, model_path: str, config: "OptimizationConfig"):
        """Load the tokenizer and model found at ``model_path``.

        Args:
            model_path: Directory (or identifier) handed to ``from_pretrained``.
            config: Search settings; ``population_size``, ``max_generations``,
                ``timeout_seconds`` and ``batch_size`` are read by
                ``optimize_text``.
        """
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
        self.model.eval()

    @torch.no_grad()
    def batch_calculate_perplexity(self, sequences: List[str]) -> List[float]:
        """Return one perplexity per input text, in input order.

        The model's built-in ``loss`` is a single value averaged over the
        whole batch, so it cannot be used to rank individual candidates.
        Instead the per-token cross-entropy is computed from the logits and
        averaged separately for every sequence, ignoring padding.

        Args:
            sequences: Texts to score. Each is truncated to 512 tokens.

        Returns:
            A list with ``len(sequences)`` floats. A sequence without any
            scorable token gets ``inf``; if scoring raises, every entry is
            ``inf`` and the error is logged.
        """
        if not sequences:
            return []

        try:
            inputs = self.tokenizer(
                sequences,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512
            )
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            outputs = self.model(**inputs)

            # Position t predicts token t + 1: drop the last logit and the
            # first label. Upcast so the softmax is not done in bfloat16.
            logits = outputs.logits[:, :-1, :].float()
            targets = inputs["input_ids"][:, 1:].to(logits.device)

            attention = inputs.get("attention_mask")
            if attention is None:
                attention = torch.ones_like(inputs["input_ids"])
            attention = attention.to(logits.device)
            # A position counts only when both the context token and the
            # target token are real, which is correct for left and right padding.
            valid = (attention[:, 1:] * attention[:, :-1]).to(logits.dtype)

            # cross_entropy expects (batch, vocab, seq) for sequence targets.
            token_losses = torch.nn.functional.cross_entropy(
                logits.transpose(1, 2), targets, reduction="none"
            )

            token_counts = valid.sum(dim=1)
            mean_losses = (token_losses * valid).sum(dim=1) / token_counts.clamp(min=1)
            perplexities = torch.exp(mean_losses)
            # Nothing to score (e.g. a single-token sequence): report inf, not 1.0.
            perplexities = perplexities.masked_fill(token_counts == 0, float("inf"))
            return perplexities.cpu().tolist()

        except Exception as e:
            logging.error(f"Batch perplexity calculation error: {str(e)}")
            return [float('inf')] * len(sequences)

    def optimize_text(self, text: str) -> Tuple[str, float]:
        """Search for a lower-perplexity ordering of the words in ``text``.

        Every generation swaps two random words in each population member,
        scores the candidates in batches and remembers the best one seen.
        The population is then ordered by score, and every fifth generation
        its two worst members are reset to the original word order. The
        search stops after ``max_generations`` or once ``timeout_seconds``
        has elapsed (checked at the start of each generation).

        Args:
            text: Whitespace-separated text to reorder.

        Returns:
            ``(best_text, best_perplexity)``. Texts with at most one word are
            returned unchanged together with their perplexity. If the search
            raises, the space-joined original words and their perplexity are
            returned and the error is logged.
        """
        words = text.split()
        if len(words) <= 1:
            # Nothing to reorder: just score the text as given.
            return text, self.batch_calculate_perplexity([text])[0]

        cfg = self.config
        deadline = time.time() + cfg.timeout_seconds

        def swap_two(sequence):
            """Return a copy of ``sequence`` with two random positions swapped."""
            if len(sequence) < 2:
                return sequence
            first, second = random.sample(range(len(sequence)), 2)
            swapped = sequence.copy()
            swapped[first], swapped[second] = swapped[second], swapped[first]
            return swapped

        # The original order is the baseline every candidate has to beat.
        original_text = " ".join(words)
        initial_perplexity = self.batch_calculate_perplexity([original_text])[0]
        best_sequence = words
        best_perplexity = initial_perplexity

        population = [words[:] for _ in range(cfg.population_size)]

        try:
            for generation in range(cfg.max_generations):
                if time.time() > deadline:
                    break

                # Mutation
                mutated = [swap_two(seq) for seq in population]
                sequences = [" ".join(seq) for seq in mutated]

                # Batched scoring
                perplexities = []
                for start in range(0, len(sequences), cfg.batch_size):
                    batch = sequences[start:start + cfg.batch_size]
                    perplexities.extend(self.batch_calculate_perplexity(batch))

                # zip() below would silently drop candidates on a mismatch.
                if len(perplexities) != len(mutated):
                    raise ValueError(
                        f"Expected {len(mutated)} perplexities, got {len(perplexities)}"
                    )

                # Track the best candidate seen so far
                for seq, perp in zip(mutated, perplexities):
                    if perp < best_perplexity and not np.isinf(perp):
                        best_sequence = seq[:]
                        best_perplexity = perp
                        logging.info(f"New best perplexity: {best_perplexity}")

                # Order by score so the worst candidates sit at the end
                ranked = sorted(zip(perplexities, mutated), key=lambda pair: pair[0])
                population = [seq for _, seq in ranked[:cfg.population_size]]

                if generation % 5 == 0:
                    # Re-seed the two worst slots with the original order.
                    population[-2:] = [words[:] for _ in range(2)]

        except Exception as e:
            logging.error(f"Error during optimization: {str(e)}")
            return original_text, initial_perplexity

        # Always return a (str, float) tuple
        return " ".join(best_sequence), best_perplexity


def optimize_dataset(data_path: str, model_path: str, output_path: str):
    """Optimise every text of a CSV file and write the results to ``output_path``.

    The input CSV must provide the columns ``id`` and ``text``. The output
    CSV has the columns ``id``, ``text`` (optimised), ``original_text`` and
    ``perplexity_score``. A row whose optimisation raises is kept with its
    original text and a score of ``inf``. The output file is rewritten after
    every 10 processed rows and once more at the end, and summary statistics
    over the finite scores are logged.

    Args:
        data_path: Path of the input CSV.
        model_path: Model location handed to ``OptimizedTextProcessor``.
        output_path: Path of the CSV to write.

    Raises:
        KeyError: If the input has rows but lacks the ``id`` or ``text`` column.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    config = OptimizationConfig()
    processor = OptimizedTextProcessor(model_path, config)

    data = pd.read_csv(data_path)
    # Fixed column list so an empty input still yields a CSV with a header
    # and a 'perplexity_score' column for the statistics below.
    output_columns = ["id", "text", "original_text", "perplexity_score"]
    results = []

    with tqdm(total=len(data), desc="Optimizing texts") as pbar:
        for idx, row in data.iterrows():
            # Read the required columns outside the try block: a missing
            # column is a schema problem, not a per-row optimisation failure.
            row_id = row['id']
            text = row['text']

            try:
                optimized_text, perplexity_score = processor.optimize_text(text)

                results.append({
                    "id": row_id,
                    "text": optimized_text,
                    "original_text": text,
                    "perplexity_score": float(perplexity_score)  # explicit float conversion
                })

                logging.info(f"Text {idx} processed - Score: {perplexity_score}")

            except Exception as e:
                # E.g. a non-string cell, for which text.split() fails.
                logging.error(f"Error processing row {idx}: {str(e)}")
                results.append({
                    "id": row_id,
                    "text": text,
                    "original_text": text,
                    "perplexity_score": float('inf')
                })

            pbar.update(1)

            # Intermediate save, keyed on rows processed rather than on the
            # index label, which need not be a 0-based integer.
            if len(results) % 10 == 0:
                pd.DataFrame(results, columns=output_columns).to_csv(output_path, index=False)

    # Final save
    final_df = pd.DataFrame(results, columns=output_columns)
    final_df.to_csv(output_path, index=False)

    # Final statistics over finite scores only (drops inf and NaN)
    scores = final_df['perplexity_score'].astype(float)
    valid_scores = scores[np.isfinite(scores)]
    if len(valid_scores) > 0:
        logging.info(f"""
        Optimization completed:
        - Mean perplexity: {valid_scores.mean():.2f}
        - Median perplexity: {valid_scores.median():.2f}
        - Min perplexity: {valid_scores.min():.2f}
        - Max perplexity: {valid_scores.max():.2f}
        """)
    else:
        logging.error("No valid perplexity scores calculated!")
        
# Entry point: optimise the Kaggle sample submission with the Gemma-2 27B
# checkpoint mounted under /kaggle/input and write the result to /kaggle/working.
if __name__ == "__main__":
    optimize_dataset(
        data_path="/kaggle/input/sample-data/sample_submission.csv",
        model_path="/kaggle/input/gemma-2/transformers/gemma-2-27b/2/",
        output_path='/kaggle/working/optimized_submission.csv'
    )

Loading checkpoint shards: 100%|██████████| 24/24 [00:21<00:00,  1.12it/s]
Optimizing texts: 100%|██████████| 6/6 [28:50<00:00, 288.46s/it]
