In [2]:
import numpy as np
import pandas as pd
import random
from itertools import permutations
import math
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load pre-trained language model for perplexity calculation
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model.eval()

# Load the dataset (adjust path for Kaggle environment)
data_path = '/kaggle/input/my-data/sample_submission.csv'
data = pd.read_csv(data_path)

# Define Perplexity Calculation Function
def calculate_perplexity(sequence):
    """
    Calculate perplexity using GPT-2 model.
    """
    inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    perplexity = math.exp(loss)
    return perplexity

# Helper function to modify a sequence
def modify_sequence(sequence, reference, velocity):
    """
    Modify the sequence slightly based on a reference sequence and a velocity factor.
    """
    sequence = sequence.copy()
    for _ in range(int(velocity)):
        i, j = random.sample(range(len(sequence)), 2)
        if sequence[i] != reference[j]:
            sequence[i], sequence[j] = sequence[j], sequence[i]
    return sequence

# Genetic Algorithm Implementation
def genetic_algorithm(data, population_size=20, generations=50, mutation_rate=0.1):
    """
    Genetic Algorithm to find optimal word permutations for minimal perplexity.
    """
    results = []
    for _, row in data.iterrows():
        text_id = row['id']
        base_sequence = row['text'].split()

        # Initialize population with random permutations
        population = [random.sample(base_sequence, len(base_sequence)) for _ in range(population_size)]

        best_sequence = None
        best_perplexity = float('inf')

        for generation in range(generations):
            # Evaluate population
            perplexities = [calculate_perplexity(' '.join(seq)) for seq in population]

            # Select top individuals (elitism)
            sorted_indices = np.argsort(perplexities)
            population = [population[i] for i in sorted_indices[:population_size // 2]]

            # Update best sequence
            if perplexities[sorted_indices[0]] < best_perplexity:
                best_perplexity = perplexities[sorted_indices[0]]
                best_sequence = population[0]

            # Crossover (mate top individuals)
            offspring = []
            for i in range(len(population) - 1):
                parent1, parent2 = population[i], population[i + 1]
                split = random.randint(1, len(base_sequence) - 1)
                child = parent1[:split] + [word for word in parent2 if word not in parent1[:split]]
                offspring.append(child)

            population.extend(offspring)

            # Mutation
            for individual in population:
                if random.random() < mutation_rate:
                    i, j = random.sample(range(len(base_sequence)), 2)
                    individual[i], individual[j] = individual[j], individual[i]

        results.append({'id': text_id, 'text': ' '.join(best_sequence), 'perplexity': best_perplexity})
    
    return pd.DataFrame(results)

# Hybrid PSO-GA-SA Implementation
def hybrid_pso_ga_sa(data, swarm_size=20, generations=50, mutation_rate=0.1, initial_temperature=100, cooling_rate=0.95):
    """
    Hybrid method combining PSO, GA, and SA to minimize perplexity.
    """
    results = []
    for _, row in data.iterrows():
        text_id = row['id']
        base_sequence = row['text'].split()

        # Initialize swarm
        swarm = [{'sequence': random.sample(base_sequence, len(base_sequence)), 'velocity': 1} for _ in range(swarm_size)]

        best_global_sequence = None
        best_global_perplexity = float('inf')

        temperature = initial_temperature

        for generation in range(generations):
            for particle in swarm:
                sequence = particle['sequence']
                perplexity = calculate_perplexity(' '.join(sequence))

                # Update personal best
                if 'best_perplexity' not in particle or perplexity < particle['best_perplexity']:
                    particle['best_perplexity'] = perplexity
                    particle['best_sequence'] = sequence

                # Update global best
                if perplexity < best_global_perplexity:
                    best_global_perplexity = perplexity
                    best_global_sequence = sequence

                # Update velocity and position (PSO logic)
                inertia = particle['velocity']
                cognitive = random.random() * 2  # Random cognitive factor
                social = random.random() * 2    # Random social factor

                particle['velocity'] = inertia + cognitive + social
                particle['sequence'] = modify_sequence(sequence, best_global_sequence, particle['velocity'])

            # Simulated Annealing
            for particle in swarm:
                sequence = particle['sequence']
                perplexity = calculate_perplexity(' '.join(sequence))

                delta = perplexity - best_global_perplexity
                if delta < 0 or random.random() < math.exp(-delta / temperature):
                    best_global_sequence = sequence
                    best_global_perplexity = perplexity

            # Cool down temperature
            temperature *= cooling_rate

        results.append({'id': text_id, 'text': ' '.join(best_global_sequence), 'perplexity': best_global_perplexity})

    return pd.DataFrame(results)

# Run Hybrid Method
start_time = time.time()
hybrid_results = hybrid_pso_ga_sa(data)
end_time = time.time()

hybrid_results['time'] = end_time - start_time

# Save results
hybrid_results.to_csv('hybrid_method_submission.csv', index=False)
print("Hybrid PSO-GA-SA Completed. Results saved to hybrid_method_submission.csv.")


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Hybrid PSO-GA-SA Completed. Results saved to hybrid_method_submission.csv.


In [3]:
hybrid_results

Unnamed: 0,id,text,perplexity,time
0,0,family scrooge chimney advent reindeer firepla...,291.528983,1425.739339
1,1,laugh fireplace ornament scrooge drive family ...,655.659109,1425.739339
2,2,nice cheer naughty beard holly yuletide chimne...,635.266642,1425.739339
3,3,decorations holiday naughty visit sleigh nice ...,897.993038,1425.739339
4,4,greeting have milk wonder joy star peppermint ...,839.852115,1425.739339
5,5,hope the give advent hohoho dream and mistleto...,735.393823,1425.739339


In [4]:
import numpy as np
import pandas as pd
import random
from itertools import permutations
import math
import time
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors

# Load pre-trained language model for perplexity calculation
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model.eval()

# Load word embeddings (e.g., GloVe, Word2Vec)
#embedding_path = '/kaggle/input/word-embeddings/word2vec.bin'  # Update with the correct path
#embeddings = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
embeddings = None
# Load the dataset (adjust path for Kaggle environment)
data_path = '/kaggle/input/my-data/sample_submission.csv'
data = pd.read_csv(data_path)

# Define Perplexity Calculation Function
def calculate_perplexity(sequence):
    """
    Calculate perplexity using GPT-2 model.
    """
    inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=512)
    outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss.item()
    perplexity = math.exp(loss)
    return perplexity

# Helper function to calculate semantic coherence
def calculate_coherence(sequence):
    """
    Calculate semantic coherence based on word embeddings.
    """
    vectors = [embeddings[word] for word in sequence if word in embeddings]
    if len(vectors) < 2:
        return 0
    similarities = [cosine_similarity([vectors[i]], [vectors[i + 1]])[0][0] for i in range(len(vectors) - 1)]
    return np.mean(similarities)

# Helper function to modify a sequence
def modify_sequence(sequence, reference, velocity):
    """
    Modify the sequence slightly based on a reference sequence and a velocity factor.
    """
    sequence = sequence.copy()
    for _ in range(int(velocity)):
        i, j = random.sample(range(len(sequence)), 2)
        sequence[i], sequence[j] = sequence[j], sequence[i]
    return sequence

# Genetic Algorithm Implementation
def genetic_algorithm(data, population_size=50, generations=100, mutation_rate=0.2):
    """
    Genetic Algorithm to find optimal word permutations for minimal perplexity.
    """
    results = []
    for _, row in data.iterrows():
        text_id = row['id']
        base_sequence = row['text'].split()

        # Initialize population with random permutations
        population = [random.sample(base_sequence, len(base_sequence)) for _ in range(population_size)]

        best_sequence = None
        best_perplexity = float('inf')

        for generation in range(generations):
            # Evaluate population
            perplexities = [calculate_perplexity(' '.join(seq)) for seq in population]
            coherences = [calculate_coherence(seq) for seq in population]

            # Combine perplexity and coherence scores
            scores = [perplexities[i] - 0.1 * coherences[i] for i in range(len(population))]

            # Select top individuals (elitism)
            sorted_indices = np.argsort(scores)
            population = [population[i] for i in sorted_indices[:population_size // 2]]

            # Update best sequence
            if scores[sorted_indices[0]] < best_perplexity:
                best_perplexity = scores[sorted_indices[0]]
                best_sequence = population[0]

            # Crossover (mate top individuals)
            offspring = []
            for i in range(len(population) - 1):
                parent1, parent2 = population[i], population[i + 1]
                split = random.randint(1, len(base_sequence) - 1)
                child = parent1[:split] + [word for word in parent2 if word not in parent1[:split]]
                offspring.append(child)

            population.extend(offspring)

            # Mutation
            for individual in population:
                if random.random() < mutation_rate:
                    i, j = random.sample(range(len(base_sequence)), 2)
                    individual[i], individual[j] = individual[j], individual[i]

            # Introduce random new individuals periodically
            if generation % 10 == 0:
                new_individuals = [random.sample(base_sequence, len(base_sequence)) for _ in range(population_size // 10)]
                population.extend(new_individuals)

        results.append({'id': text_id, 'text': ' '.join(best_sequence), 'perplexity': best_perplexity})
    
    return pd.DataFrame(results)

# Hybrid PSO-GA-SA Implementation
def hybrid_pso_ga_sa(data, swarm_size=50, generations=100, mutation_rate=0.2, initial_temperature=100, cooling_rate=0.95):
    """
    Hybrid method combining PSO, GA, and SA to minimize perplexity.
    """
    results = []
    for _, row in data.iterrows():
        text_id = row['id']
        base_sequence = row['text'].split()

        # Initialize swarm
        swarm = [{'sequence': random.sample(base_sequence, len(base_sequence)), 'velocity': 1} for _ in range(swarm_size)]

        best_global_sequence = None
        best_global_perplexity = float('inf')

        temperature = initial_temperature

        for generation in range(generations):
            for particle in swarm:
                sequence = particle['sequence']
                perplexity = calculate_perplexity(' '.join(sequence))
                coherence = calculate_coherence(sequence)

                # Update personal best
                if 'best_perplexity' not in particle or perplexity - 0.1 * coherence < particle['best_perplexity']:
                    particle['best_perplexity'] = perplexity - 0.1 * coherence
                    particle['best_sequence'] = sequence

                # Update global best
                if perplexity - 0.1 * coherence < best_global_perplexity:
                    best_global_perplexity = perplexity - 0.1 * coherence
                    best_global_sequence = sequence

                # Update velocity and position (PSO logic)
                inertia = particle['velocity']
                cognitive = random.random() * 2  # Random cognitive factor
                social = random.random() * 2    # Random social factor

                particle['velocity'] = inertia + cognitive + social
                particle['sequence'] = modify_sequence(sequence, best_global_sequence, particle['velocity'])

            # Simulated Annealing
            for particle in swarm:
                sequence = particle['sequence']
                perplexity = calculate_perplexity(' '.join(sequence))
                coherence = calculate_coherence(sequence)

                delta = (perplexity - 0.1 * coherence) - best_global_perplexity
                if delta < 0 or random.random() < math.exp(-delta / temperature):
                    best_global_sequence = sequence
                    best_global_perplexity = perplexity - 0.1 * coherence

            # Cool down temperature
            temperature *= cooling_rate

            # Add random diversity
            if generation % 10 == 0:
                random_particles = [{'sequence': random.sample(base_sequence, len(base_sequence)), 'velocity': 1} for _ in range(swarm_size // 10)]
                swarm.extend(random_particles)

        results.append({'id': text_id, 'text': ' '.join(best_global_sequence), 'perplexity': best_global_perplexity})

    return pd.DataFrame(results)

# Run Hybrid Method
start_time = time.time()
hybrid_results = hybrid_pso_ga_sa(data)
end_time = time.time()

hybrid_results['time'] = end_time - start_time

# Save results
hybrid_results.to_csv('hybrid_method_submission.csv', index=False)
print("Hybrid PSO-GA-SA Completed. Results saved to hybrid_method_submission.csv.")



TypeError: argument of type 'NoneType' is not iterable