In [1]:
import random
import math
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load GPT-2 Model and Tokenizer
def load_gpt2_model():
    """
    Build the GPT-2 language model and tokenizer used for perplexity scoring.

    Both objects are created from the "gpt2" checkpoint via ``from_pretrained``;
    the tokenizer is created first, then the model, which is switched to
    evaluation mode before being returned.

    Returns:
        model, tokenizer: The GPT2LMHeadModel (in eval mode) and the
        GPT2Tokenizer, in that order.
    """
    checkpoint = "gpt2"
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    # eval() returns the module itself, so the call can be chained.
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint).eval()
    return gpt2_model, gpt2_tokenizer

def calculate_gpt2_perplexity(text, model, tokenizer):
    """
    Score a text with GPT-2 and return its perplexity.

    The text is tokenized (truncated to at most 512 tokens), passed through the
    model with the token ids also used as labels, and the exponential of the
    loss reported by the model is returned.

    Parameters:
        text (str): Input text to evaluate.
        model: Pretrained GPT-2 model; called with the tokenizer output plus
            a ``labels`` keyword and expected to expose a ``loss`` attribute.
        tokenizer: Tokenizer for GPT-2; called with ``return_tensors="pt"``.

    Returns:
        float: Perplexity score (``exp(loss)``).
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    token_ids = encoded["input_ids"]
    # Scoring only: no gradients are needed for the forward pass.
    with torch.no_grad():
        lm_loss = model(**encoded, labels=token_ids).loss
    perplexity = torch.exp(lm_loss)
    return perplexity.item()

# Step 1: Data Analysis
def load_and_analyze_data(file_path):
    """
    Read a CSV file into a DataFrame and print a preview of its first rows.

    Parameters:
        file_path: Location of the CSV file, passed as-is to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The full contents of the CSV file.
    """
    frame = pd.read_csv(file_path)
    preview = frame.head()
    print("Data Preview:\n", preview)
    return frame

# Step 2: Genetic Algorithm Implementation
def _crossover_preserving_words(parent1, parent2, cut):
    """
    Combine two orderings of the same words into one child ordering.

    The child starts with ``parent1[:cut]`` and is completed with the words of
    ``parent2``, in ``parent2`` order, that the prefix has not already supplied.
    Occurrences are counted, so a word that appears several times in the
    parents appears the same number of times in the child.
    """
    from collections import Counter

    head = parent1[:cut]
    already_placed = Counter(head)
    tail = []
    for word in parent2:
        if already_placed[word] > 0:
            # This occurrence is already covered by the prefix taken from parent1.
            already_placed[word] -= 1
        else:
            tail.append(word)
    return head + tail


def genetic_algorithm(text, model, tokenizer, population_size=20, generations=50, mutation_rate=0.1):
    """
    Search for a low-perplexity ordering of the words of ``text`` with a
    genetic algorithm.

    Each individual is a reordering of the whitespace-separated words of
    ``text``. Fitness is the negated GPT-2 perplexity of the joined words, so a
    lower perplexity means a fitter individual. Every generation keeps the
    better half of the population as parents, breeds children with a
    prefix/remainder crossover and applies a random two-position swap as
    mutation.

    Parameters:
        text (str): Text whose words are reordered.
        model: Pretrained GPT-2 model, forwarded to calculate_gpt2_perplexity.
        tokenizer: Tokenizer for GPT-2, forwarded to calculate_gpt2_perplexity.
        population_size (int): Number of individuals per generation (>= 2).
        generations (int): Number of generations to run.
        mutation_rate (float): Probability that a child gets one random swap.

    Returns:
        str: The best ordering scored at any point of the run, joined with
        single spaces. Texts with fewer than two words are returned unchanged.

    Raises:
        ValueError: If ``population_size`` is smaller than 2 (crossover needs
            two parents).
    """
    words = text.split()
    if len(words) < 2:  # Nothing to reorder with fewer than two words
        print("Text too short for optimization:", text)
        return text
    if population_size < 2:
        raise ValueError("population_size must be at least 2")

    # Initial population: random shuffles of the input words
    population = [random.sample(words, len(words)) for _ in range(population_size)]
    print("Initial population:", population[:3], "...")  # Show the first 3 individuals

    # Scoring runs the language model, so each distinct ordering is scored once.
    perplexity_cache = {}

    def fitness(solution):
        key = tuple(solution)
        if key not in perplexity_cache:
            perplexity_cache[key] = calculate_gpt2_perplexity(" ".join(solution), model, tokenizer)
        return -perplexity_cache[key]  # Lower perplexity is better

    # Crossover samples two distinct parents, so keep at least two survivors
    # even when population_size // 2 would be 1.
    survivor_count = max(2, population_size // 2)

    # Best individual seen in any generation; children are not guaranteed to
    # be as good as their parents, so the final population alone may not
    # contain it.
    best_overall = None
    best_overall_fitness = None

    for generation in range(generations):
        # Score and rank the current population
        fitness_scores = [(fitness(individual), individual) for individual in population]
        fitness_scores.sort(reverse=True, key=lambda x: x[0])

        # Report the best solution of this generation
        best_fitness, best_individual = fitness_scores[0]
        print(f"Generation {generation+1}/{generations}, Best fitness: {-best_fitness:.2f}, Best solution: {' '.join(best_individual)}")

        if best_overall is None or best_fitness > best_overall_fitness:
            best_overall, best_overall_fitness = best_individual, best_fitness

        # Selection: keep the fittest individuals as parents
        parents = [individual for _, individual in fitness_scores[:survivor_count]]

        # Crossover
        new_population = []
        while len(new_population) < population_size:
            parent1, parent2 = random.sample(parents, 2)
            cut = random.randint(1, len(words) - 1)
            new_population.append(_crossover_preserving_words(parent1, parent2, cut))

        # Mutation: swap two random positions (children are fresh lists, so
        # parents and the tracked best individual are never modified)
        for individual in new_population:
            if random.random() < mutation_rate:
                i, j = random.sample(range(len(individual)), 2)
                individual[i], individual[j] = individual[j], individual[i]

        population = new_population

    # The last batch of children has not been scored yet; compare it with the
    # best individual found during the run.
    final_candidate = max(population, key=fitness)
    if best_overall is None or fitness(final_candidate) > best_overall_fitness:
        best_overall = final_candidate

    best_text = " ".join(best_overall)
    print("Final best solution:", best_text)
    return best_text


# Step 3: Simulated Annealing
def simulated_annealing(text, model, tokenizer, max_iterations=1000, initial_temp=100, cooling_rate=0.99):
    """
    Refine the word order of ``text`` with simulated annealing on GPT-2 perplexity.

    Starting from the given word order, each iteration swaps two random
    positions. The neighbor is accepted when its perplexity is lower, or
    otherwise with probability ``exp((current - neighbor) / temperature)``.
    The temperature is multiplied by ``cooling_rate`` after every iteration and
    the search stops early once it falls below 1e-3.

    Parameters:
        text (str): Text whose words are reordered.
        model: Pretrained GPT-2 model, forwarded to calculate_gpt2_perplexity.
        tokenizer: Tokenizer for GPT-2, forwarded to calculate_gpt2_perplexity.
        max_iterations (int): Upper bound on the number of iterations.
        initial_temp (float): Starting temperature.
        cooling_rate (float): Multiplicative temperature decay per iteration.

    Returns:
        str: The lowest-perplexity ordering encountered, joined with single
        spaces. Texts with fewer than two words are returned (whitespace
        normalized) without running the search.
    """
    words = text.split()
    if len(words) < 2:
        # There is no pair of positions to swap; sampling two indices below
        # would raise ValueError.
        return " ".join(words)

    current_solution = words[:]
    best_solution = words[:]
    current_perplexity = calculate_gpt2_perplexity(" ".join(current_solution), model, tokenizer)
    best_perplexity = current_perplexity
    temperature = initial_temp

    print("Initial solution:", " ".join(current_solution), f"(Perplexity: {current_perplexity:.2f})")

    for iteration in range(max_iterations):
        # Neighbor: copy of the current order with two random positions swapped
        neighbor_solution = current_solution[:]
        i, j = random.sample(range(len(words)), 2)
        neighbor_solution[i], neighbor_solution[j] = neighbor_solution[j], neighbor_solution[i]

        neighbor_perplexity = calculate_gpt2_perplexity(" ".join(neighbor_solution), model, tokenizer)

        # Accept improvements always; accept worse neighbors with a
        # probability that shrinks as the temperature drops.
        if neighbor_perplexity < current_perplexity or random.random() < math.exp((current_perplexity - neighbor_perplexity) / temperature):
            current_solution = neighbor_solution
            current_perplexity = neighbor_perplexity

            if current_perplexity < best_perplexity:
                # Safe to alias: current_solution is only ever rebound, never
                # modified in place.
                best_solution = current_solution
                best_perplexity = current_perplexity

        if iteration % 100 == 0:
            print(f"Iteration {iteration}/{max_iterations}, Best perplexity: {best_perplexity:.2f}")

        temperature *= cooling_rate
        if temperature < 1e-3:
            break

    print("Final solution (Simulated Annealing):", " ".join(best_solution))
    return " ".join(best_solution)

# Step 4: Local Search
def local_search(solution, model, tokenizer):
    """
    Improve a word order with a single sweep of pairwise swaps.

    Every pair of positions ``(i, j)`` with ``i < j`` is visited once. The swap
    is applied to the best order found so far and kept only when it strictly
    lowers the GPT-2 perplexity.

    Parameters:
        solution (str): Text whose words are reordered.
        model: Pretrained GPT-2 model, forwarded to calculate_gpt2_perplexity.
        tokenizer: Tokenizer for GPT-2, forwarded to calculate_gpt2_perplexity.

    Returns:
        str: The best ordering found, joined with single spaces.
    """
    words = solution.split()
    if len(words) < 2:
        # No pair to swap, so the result cannot change; skip the model call.
        return " ".join(words)

    best_solution = words[:]
    best_perplexity = calculate_gpt2_perplexity(" ".join(best_solution), model, tokenizer)

    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            # Try the swap on a copy so a rejected move leaves the best order intact.
            neighbor_solution = best_solution[:]
            neighbor_solution[i], neighbor_solution[j] = neighbor_solution[j], neighbor_solution[i]

            neighbor_perplexity = calculate_gpt2_perplexity(" ".join(neighbor_solution), model, tokenizer)
            if neighbor_perplexity < best_perplexity:
                best_solution = neighbor_solution
                best_perplexity = neighbor_perplexity

    return " ".join(best_solution)

# Hybrid Approach
def hybrid_approach(text, model, tokenizer):
    """
    Run the three optimizers in sequence, each refining the previous result.

    The text goes through genetic_algorithm, then simulated_annealing, then
    local_search; the same model and tokenizer are handed to every stage.

    Parameters:
        text (str): Text whose words are reordered.
        model: Pretrained GPT-2 model passed to each stage.
        tokenizer: Tokenizer for GPT-2 passed to each stage.

    Returns:
        str: The output of the final local_search stage.
    """
    stages = (genetic_algorithm, simulated_annealing, local_search)
    candidate = text
    for optimize in stages:
        candidate = optimize(candidate, model, tokenizer)
    return candidate

# Process All Data Entries
def process_all_entries(file_path, output_path):
    """
    Optimize every row of an input CSV and write the results to another CSV.

    The input is read with load_and_analyze_data and must provide ``id`` and
    ``text`` columns. Each text is passed through hybrid_approach using one
    shared GPT-2 model/tokenizer pair, and the output file receives one row per
    entry with the columns ``id`` and ``optimized_text``.

    Parameters:
        file_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write (no index column).
    """
    data = load_and_analyze_data(file_path)
    model, tokenizer = load_gpt2_model()
    total_entries = len(data)
    optimized_rows = []

    print("Processing data entries...")
    for index, row in data.iterrows():
        original_text = row['text']
        # index + 1 gives a 1-based counter for the progress message.
        print(f"Processing entry {index + 1}/{total_entries}: {original_text}")
        optimized_text = hybrid_approach(original_text, model, tokenizer)
        optimized_rows.append({'id': row['id'], 'optimized_text': optimized_text})

    pd.DataFrame(optimized_rows).to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")

# Example Usage
if __name__ == "__main__":
    # Kaggle locations: competition input file and writable working directory.
    input_file_path, output_file_path = (
        "/kaggle/input/santa2024-dataset/sample_submission.csv",
        "/kaggle/working/optimized_results.csv",
    )
    process_all_entries(input_file_path, output_file_path)

    # Read the written file back and display it.
    optimized_results = pd.read_csv(output_file_path)
    print(optimized_results)

  from .autonotebook import tqdm as notebook_tqdm
E0000 00:00:1734034195.853709      13 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: === 
learning/45eac/tfrc/runtime/common_lib.cc:479
E1212 20:09:55.887548459     232 oauth2_credentials.cc:238]            oauth_fetch: UNKNOWN:C-ares status is not ARES_SUCCESS qtype=A name=metadata.google.internal. is_balancer=0: Domain name not found {created_time:"2024-12-12T20:09:55.887525622+00:00", grpc_status:2}


Data Preview:
    id                                               text
0   0  advent chimney elf family fireplace gingerbrea...
1   1  advent chimney elf family fireplace gingerbrea...
2   2  yuletide decorations gifts cheer holiday carol...
3   3  yuletide decorations gifts cheer holiday carol...
4   4  hohoho candle poinsettia snowglobe peppermint ...
Processing data entries...
Processing entry 1/6: advent chimney elf family fireplace gingerbread mistletoe ornament reindeer scrooge
Initial population: [['chimney', 'ornament', 'fireplace', 'elf', 'gingerbread', 'reindeer', 'family', 'scrooge', 'mistletoe', 'advent'], ['chimney', 'ornament', 'scrooge', 'gingerbread', 'reindeer', 'advent', 'fireplace', 'family', 'elf', 'mistletoe'], ['scrooge', 'reindeer', 'ornament', 'family', 'elf', 'chimney', 'fireplace', 'mistletoe', 'advent', 'gingerbread']] ...
Generation 1/50, Best fitness: 444.29, Best solution: elf chimney scrooge gingerbread advent family reindeer fireplace ornament mistletoe