In [None]:
import re
import string
from tqdm.auto import tqdm
from datasets import load_dataset # Assuming datasets are loadable this way
from kaggle_secrets import UserSecretsClient
from google import genai
from sentence_transformers import SentenceTransformer, util

In [None]:
# Sentence-embedding model shared by get_semantic_similarity(); it embeds the
# English reference and the back-translated candidate for cosine comparison.
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

# Gemini client used by GAME.evaluate() for the translation step. The API key
# is fetched from the Kaggle secret named "GEMINI_API_KEY".

user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("GEMINI_API_KEY")
client = genai.Client(api_key=api_key)

def get_prompt(processed_cand_cm_romanized):
    """Build the translation prompt sent to the LLM.

    The given romanized Hindi/English text is embedded between two dashed
    separator lines, followed by the five-step translation instructions and
    the response template whose markers extract_translated_sentence() parses.

    Args:
        processed_cand_cm_romanized: Text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    # The template contains a single replacement field; the inserted text is
    # not re-scanned, so braces inside the input are kept literally.
    template = """I will give you a text in roman-hindi and english below:-
------------------------
{text}
------------------------
you have to follow these steps below to translate the text into english.
1. First translate the sentence exactly into non roman hindi representation of characters.
2. Now translate the sentence into its english translation and remember it as SE.
3. Now for the sentences that are english words and are common in both the SE in step 2 and the original text, POS tag them.
4. Now remembering the POS tags of words, translate the original sentence into hindi remembering the meaning of POS words and cross-language homonyms.
5. Now translate the final hindi sentence in step 4 to its english translation.

Only give me the translated sentence in step 5 as your response in this template below so that i can extract the sentence using regular expression
*****Final Translated Sentence*****
[put the final sentence from step 5.]
*****End*****
"""
    return template.format(text=processed_cand_cm_romanized)

# Gemini model id passed as `model=` to client.models.generate_content() in GAME.evaluate().
model_name = "gemini-2.0-flash-lite"

In [None]:

def preprocess(text):
    """Normalize text for comparison.

    Fenced code blocks (triple-backtick delimited) are dropped, the rest is
    lowercased, ASCII punctuation is deleted, and runs of whitespace are
    collapsed to single spaces.

    Args:
        text: Value to normalize; anything that is not a ``str`` yields ``""``.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # Code fences go first, while the backticks are still present.
    lowered = re.sub(r'```.*?```', '', text, flags=re.DOTALL).lower()

    # Punctuation is deleted outright (not replaced by a space).
    punctuation = set(string.punctuation)
    letters_only = "".join(ch for ch in lowered if ch not in punctuation)

    return re.sub(r'\s+', ' ', letters_only).strip()


def extract_translated_sentence(text):
    """Pull the final translation out of the model's templated reply.

    Looks for the text between the ``*****Final Translated Sentence*****``
    and ``*****End*****`` markers (the sentence may span several lines).

    Args:
        text: Raw reply text.

    Returns:
        The stripped sentence, or ``None`` when the markers are not found.
    """
    found = re.search(
        r"\*{5}Final Translated Sentence\*{5}\s*(.*?)\s*\*{5}End\*{5}",
        text,
        re.DOTALL,
    )
    return found.group(1).strip() if found else None

def get_semantic_similarity(text1, text2):
    """Return the cosine similarity of two texts' sentence embeddings.

    Both texts are encoded with the module-level ``similarity_model`` using
    normalized embeddings. The result is clamped to [-1.0, 1.0].

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The clamped similarity, or 0.0 when either text is empty/falsy or
        when encoding/comparison raises (the error is printed).
    """
    # Nothing to compare if either side is missing.
    if not (text1 and text2):
        return 0.0

    encode_options = {"convert_to_tensor": True, "normalize_embeddings": True}
    try:
        vec_a = similarity_model.encode(text1, **encode_options)
        vec_b = similarity_model.encode(text2, **encode_options)
        raw_score = util.pytorch_cos_sim(vec_a, vec_b).item()
        # Guard against float drift pushing the value just outside [-1, 1].
        return max(-1.0, min(1.0, raw_score))
    except Exception as e:
        print(f"Similarity Calculation Error: {e}")
        return 0.0

# --- GAME Implementation (Modified) ---

class GAME:
    """Back-translation scorer for romanized code-mixed text.

    ``evaluate`` asks the Gemini model to translate the code-mixed candidate
    into English and scores how close that reconstruction is to the English
    reference, on a 0-100 scale.
    """

    def __init__(self, matrix_lang_code='hi'):
        """Store the matrix-language code and announce it on stdout.

        Args:
            matrix_lang_code: Language code kept on the instance; ``evaluate``
                does not read it.
        """
        self.matrix_lang_code = matrix_lang_code
        print(f"GAME initialized with matrix language: {self.matrix_lang_code}")

    def evaluate(self, reference_en, candidate_cm_romanized):
        """Score one candidate against its English reference.

        Args:
            reference_en: English reference text.
            candidate_cm_romanized: Romanized code-mixed candidate text.

        Returns:
            A float in [0, 100]. 0.0 is returned when either input is not a
            non-empty string, when preprocessing leaves nothing, or when no
            translated sentence can be extracted from the model reply.

        Raises:
            Whatever ``client.models.generate_content`` raises; the API call
            is intentionally not guarded so callers can count such failures.
        """
        if not isinstance(reference_en, str) or not isinstance(candidate_cm_romanized, str) or not reference_en or not candidate_cm_romanized:
            return 0.0

        # 1. Preprocess both sentences
        processed_ref_en = preprocess(reference_en)
        processed_cand_cm_romanized = preprocess(candidate_cm_romanized)

        if not processed_ref_en or not processed_cand_cm_romanized:
            return 0.0

        # 2. Ask the model for the English reconstruction of the candidate
        response = client.models.generate_content(
            model=model_name, contents=get_prompt(processed_cand_cm_romanized)
        )

        # 3. Extract the templated sentence. The name is bound up front so a
        #    failure here falls through to the "reconstruction failed" branch
        #    instead of raising UnboundLocalError on the check below.
        reconstructed_en_sentence = None
        try:
            reconstructed_en_sentence = extract_translated_sentence(response.text)
        except Exception as e:
            print(f"Error extracting translated sentence: {e}")

        if not reconstructed_en_sentence:
            return 0.0  # Cannot compare if reconstruction failed

        # 4. Semantic similarity between the reference and the reconstruction
        similarity_score = get_semantic_similarity(processed_ref_en, reconstructed_en_sentence)

        # 5. Scale to 0-100; negative similarities are floored at 0.0 so the
        #    result is always a float.
        return max(0.0, similarity_score) * 100

# --- Main Execution Logic (Mostly unchanged, uses the modified GAME class) ---

def compare_datasets(dataset_en_path, dataset_cm_path, matrix_lang='hi', split='train', max_entries=None):
    """Score a code-mixed dataset against its English source, row by row.

    Loads both datasets, pairs rows by position, scores each pair's
    ``instruct_prompt`` with ``GAME``, prints a summary and, when matplotlib
    is installed, shows a histogram of the scores.

    Args:
        dataset_en_path: Identifier passed to ``load_dataset`` for the English data.
        dataset_cm_path: Identifier passed to ``load_dataset`` for the code-mixed data.
        matrix_lang: Matrix-language code handed to ``GAME`` and echoed in the report.
        split: Split of the code-mixed dataset to load.
        max_entries: Optional cap on the number of row pairs to score (handy
            for a cheap smoke test). ``None`` scores every pair.

    Returns:
        ``(game_scores, average_game_score, failed_evaluations)``. Rows with a
        missing prompt or an evaluation error contribute a score of 0.0 and
        are counted in ``failed_evaluations``.
    """
    print(f"Loading English dataset from: {dataset_en_path}")
    # NOTE(review): the English split is pinned to "v0.1.4"; `split` only
    # applies to the code-mixed dataset — confirm this is intended.
    ds_en = load_dataset(dataset_en_path, split="v0.1.4")

    print(f"Loading Code-Mixed dataset from: {dataset_cm_path}")
    ds_cm = load_dataset(dataset_cm_path, split=split)

    if len(ds_en) != len(ds_cm):
        print(f"Warning: Datasets have different lengths ({len(ds_en)} vs {len(ds_cm)}). Comparing row-by-row might be incorrect.")
        min_len = min(len(ds_en), len(ds_cm))
        print(f"Comparing the first {min_len} entries.")
        ds_en = ds_en.select(range(min_len))
        ds_cm = ds_cm.select(range(min_len))

    if max_entries is not None:
        # Explicit, opt-in limit (replaces the stray `break` that used to stop
        # the loop after the first row unconditionally).
        limit = max(0, min(max_entries, len(ds_cm)))
        ds_en = ds_en.select(range(limit))
        ds_cm = ds_cm.select(range(limit))

    game_evaluator = GAME(matrix_lang_code=matrix_lang)
    game_scores = []
    failed_evaluations = 0

    print(f"Calculating GAME scores for {len(ds_cm)} entries...")
    # Rows are paired purely by position.
    for entry_en, entry_cm in tqdm(zip(ds_en, ds_cm), total=len(ds_cm)):
        # Both datasets are expected to carry the text under 'instruct_prompt'.
        # **** ADJUST THIS KEY IF YOUR DATASETS USE A DIFFERENT COLUMN NAME ****
        reference_prompt = entry_en.get('instruct_prompt')
        candidate_prompt = entry_cm.get('instruct_prompt')

        if reference_prompt and candidate_prompt:
            try:
                score = game_evaluator.evaluate(reference_prompt, candidate_prompt)
                game_scores.append(score)
            except Exception as eval_e:
                print(f"\nError during GAME evaluation for an entry: {eval_e}")
                print(f"Reference: {reference_prompt[:100]}...")
                print(f"Candidate: {candidate_prompt[:100]}...")
                game_scores.append(0.0)  # Assign 0 score on evaluation error
                failed_evaluations += 1
        else:
            game_scores.append(0.0)  # Assign 0 score if data is missing
            failed_evaluations += 1

    # Defensive filter: scores are expected to be floats already.
    valid_scores = [s for s in game_scores if s is not None]
    average_game_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0

    print("\n--- Evaluation Summary ---")
    print(f"Matrix Language: {matrix_lang}")
    print(f"Number of entries processed: {len(game_scores)}")
    print(f"Number of failed/skipped evaluations: {failed_evaluations}")
    print(f"Average GAME Score (over all processed): {average_game_score:.2f}")

    if valid_scores:
        try:
            # Plotting is optional; the imports are local so a missing
            # matplotlib only skips the histogram.
            import matplotlib.pyplot as plt
            import numpy as np
            plt.figure(figsize=(10, 5))
            plt.hist(np.array(valid_scores), bins=20, range=(0, 100))  # Fixed 0-100 axis
            plt.title(f"Distribution of GAME Scores (Matrix Lang: {matrix_lang})")
            plt.xlabel("GAME Score (0-100)")
            plt.ylabel("Frequency")
            plt.grid(axis='y', alpha=0.75)
            plt.show()
        except ImportError:
            print("\nInstall matplotlib to see the score distribution plot: pip install matplotlib numpy")
    else:
        print("\nNo valid scores generated to plot.")

    return game_scores, average_game_score, failed_evaluations

In [None]:
import os
import re
import time
import json
import string
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import load_dataset, Dataset
from sentence_transformers import util

# --- Configuration for Checkpoint and Rate Limiting ---
ONE_MINUTE = 60  # seconds; each batch is padded with sleep until this much time has elapsed
BATCH_SIZE = 15  # entries submitted per batch; also used as the thread-pool size
CHECKPOINT_FILE = "game_scores_checkpoint.parquet"  # (index, score) rows, rewritten after every result so a run can resume
ERROR_LOG_FILE = "game_error_log.txt"  # only referenced by the commented-out file writer in log_error()

# Prerequisite: similarity_model, model_name, client, and get_prompt() must already be defined (see the setup cell above) before running the code below.

# --- Utility Functions ---
def preprocess(text):
    """Normalize text: drop fenced code blocks, lowercase, strip ASCII
    punctuation, and collapse whitespace.

    Args:
        text: Value to normalize; non-``str`` input yields ``""``.

    Returns:
        The normalized string (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # Remove ```...``` blocks before the backticks themselves are stripped.
    without_code = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Mapping every punctuation character to None deletes it during translate().
    delete_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    without_punct = without_code.lower().translate(delete_punctuation)

    collapsed = re.sub(r'\s+', ' ', without_punct)
    return collapsed.strip()

def extract_translated_sentence(text):
    """Return the sentence enclosed by the reply template's markers.

    Args:
        text: Raw reply text expected to contain
            ``*****Final Translated Sentence*****`` ... ``*****End*****``.

    Returns:
        The stripped text between the markers (may span lines), or ``None``
        when the markers are absent.
    """
    marker_re = re.compile(
        r"\*{5}Final Translated Sentence\*{5}\s*(.*?)\s*\*{5}End\*{5}",
        re.DOTALL,
    )
    hit = marker_re.search(text)
    if hit is None:
        return None
    return hit.group(1).strip()

def get_semantic_similarity(text1, text2):
    """Cosine similarity between the sentence embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity clamped to [-1.0, 1.0]; 0.0 when either text is
        empty/falsy or when embedding/comparison raises (the error is printed).
    """
    if not (text1 and text2):
        return 0.0
    try:
        # Encode the two texts one after the other with identical options.
        left, right = [
            similarity_model.encode(t, convert_to_tensor=True, normalize_embeddings=True)
            for t in (text1, text2)
        ]
        cosine = util.pytorch_cos_sim(left, right).item()
        return max(-1.0, min(1.0, cosine))
    except Exception as e:
        print(f"Similarity Calculation Error: {e}")
        return 0.0

class GAME:
    """Back-translation scorer for romanized code-mixed text (quiet variant).

    ``evaluate`` never raises: any failure along the way is reported on
    stdout and scored as 0.0, which keeps worker threads from propagating
    exceptions to the batch loop.
    """

    def __init__(self, matrix_lang_code='hi'):
        """Store the matrix-language code; ``evaluate`` does not read it."""
        self.matrix_lang_code = matrix_lang_code

    def evaluate(self, reference_en, candidate_cm_romanized):
        """Score one candidate against its English reference.

        Args:
            reference_en: English reference text.
            candidate_cm_romanized: Romanized code-mixed candidate text.

        Returns:
            A float in [0, 100]. 0.0 is returned when either text is empty
            after preprocessing, when no translated sentence can be extracted
            from the model reply, or when any step raises.
        """
        try:
            processed_ref = preprocess(reference_en)
            processed_cand = preprocess(candidate_cm_romanized)
            if not processed_ref or not processed_cand:
                return 0.0

            response = client.models.generate_content(
                model=model_name, contents=get_prompt(processed_cand)
            )
            reconstructed = extract_translated_sentence(response.text)
            if not reconstructed:
                return 0.0

            sim_score = get_semantic_similarity(processed_ref, reconstructed)
            # Negative similarities are floored at 0 before scaling to 0-100.
            return max(0.0, sim_score) * 100

        except Exception as e:
            # Still best-effort (score 0.0), but no longer silent: without this
            # line API/quota errors are indistinguishable from genuine zeros.
            print(f"GAME evaluation error: {type(e).__name__}: {e}")
            return 0.0

def log_error(entry, msg):
    """Print an error record for a failed entry.

    Args:
        entry: Identifying data for the failed item (e.g. ``{'index': j}``).
        msg: Human-readable error description.
    """
    # Writing to ERROR_LOG_FILE is disabled; the record goes to stdout only.
    record = {"error": msg, "entry": entry}
    print(record)

# --- Main Execution with Checkpoint and Rate Limiting ---
def compare_datasets_with_checkpointing(dataset_en_path, dataset_cm_path, matrix_lang='hi', split='train'):
    """Score all row pairs with GAME in rate-limited batches, resuming from a checkpoint.

    Rows are paired by position. Each batch of up to ``BATCH_SIZE`` rows is
    evaluated concurrently; every result is appended to ``CHECKPOINT_FILE``
    immediately, and indices already present there are skipped on a rerun.
    Between batches the function sleeps so that a batch starts at most once
    per ``ONE_MINUTE`` seconds.

    Args:
        dataset_en_path: Identifier passed to ``load_dataset`` for the English data.
        dataset_cm_path: Currently unused (see the note in the body).
        matrix_lang: Matrix-language code handed to ``GAME`` and echoed in the report.
        split: Currently unused (see the note in the body).

    Returns:
        ``(all_scores, avg_score, failed)`` where ``all_scores`` holds the
        scores restored from the checkpoint followed by the scores computed
        in this run, ``avg_score`` is their mean (0.0 if empty), and
        ``failed`` counts results that raised while being collected or saved.
    """
    ds_en = load_dataset(dataset_en_path, split="v0.1.4")
    # NOTE(review): the code-mixed data is read from a fixed Kaggle parquet
    # file; `dataset_cm_path` and `split` are not used — confirm this is intended.
    ds_cm = pd.read_parquet("/kaggle/input/0-9cmd/MBigCodeBench-hini-end-cmd0.9.parquet")

    # Only indices that exist in BOTH datasets can be paired; without this
    # bound a longer parquet file makes ds_en[j] raise IndexError mid-run.
    n_rows = min(len(ds_en), len(ds_cm))
    if len(ds_en) != len(ds_cm):
        print(f"Warning: Datasets have different lengths ({len(ds_en)} vs {len(ds_cm)}). Comparing the first {n_rows} entries.")

    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_df = pd.read_parquet(CHECKPOINT_FILE)
        processed_indices = set(checkpoint_df['index'])
    else:
        checkpoint_df = pd.DataFrame(columns=['index', 'score'])
        processed_indices = set()

    # Start from the checkpointed scores so that a resumed run reports over
    # the whole dataset rather than only the rows scored in this session.
    all_scores = checkpoint_df['score'].tolist()
    failed = 0
    game = GAME(matrix_lang)

    indices = [i for i in range(n_rows) if i not in processed_indices]
    progress_bar = tqdm(total=len(indices), desc="Processing Entries")

    for start in range(0, len(indices), BATCH_SIZE):
        batch_indices = indices[start:start + BATCH_SIZE]
        start_time = time.time()

        with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            futures = {
                executor.submit(
                    game.evaluate,
                    ds_en[j]['instruct_prompt'],
                    ds_cm.iloc[j]['instruct_prompt']
                ): j for j in batch_indices
            }

            # Results are collected in the main thread, so the checkpoint
            # frame is only ever mutated from one thread.
            for future in as_completed(futures):
                j = futures[future]
                try:
                    score = future.result()
                    all_scores.append(score)
                    checkpoint_df.loc[len(checkpoint_df)] = {'index': j, 'score': score}
                    checkpoint_df.to_parquet(CHECKPOINT_FILE, index=False)
                except Exception as e:
                    log_error({'index': j}, f"Evaluation Error: {e}")
                    failed += 1
                finally:
                    # Advance even on failure so the bar reaches its total.
                    progress_bar.update(1)

        # Enforce rate limiting — but there is nothing to throttle after the
        # final batch, so skip the wait there.
        if start + BATCH_SIZE >= len(indices):
            break
        elapsed = time.time() - start_time
        sleep_time = max(0, ONE_MINUTE - elapsed)
        if sleep_time > 0:
            print(f"Waiting {sleep_time:.2f} seconds before next batch...")
            time.sleep(sleep_time)

    progress_bar.close()
    avg_score = sum(all_scores) / len(all_scores) if all_scores else 0.0

    print("\n--- Evaluation Complete ---")
    print(f"Matrix Language: {matrix_lang}")
    print(f"Entries processed: {len(all_scores)}")
    print(f"Failed evaluations: {failed}")
    print(f"Average GAME Score: {avg_score:.2f}")

    return all_scores, avg_score, failed

In [None]:
# --- Configuration ---
# Values handed to compare_datasets_with_checkpointing() below.
ENGLISH_DATASET_PATH = "bigcode/bigcodebench"
# NOTE(review): compare_datasets_with_checkpointing() reads the code-mixed data
# from a fixed parquet file and does not use this value — confirm.
CODE_MIXED_DATASET_PATH = "cmd0.9"
MATRIX_LANGUAGE = "hi" # Hindi - change as needed ('bn', 'es', 'fr')
# NOTE(review): forwarded as split=, which compare_datasets_with_checkpointing()
# currently ignores — confirm which split is intended.
DATASET_SPLIT = 'train'


# Run the evaluation. game_scores, avg_score and failed_score are only bound
# when the call succeeds; the summary cell below relies on them.
try:
     game_scores, avg_score, failed_score = compare_datasets_with_checkpointing(
         ENGLISH_DATASET_PATH,
         CODE_MIXED_DATASET_PATH,
         matrix_lang=MATRIX_LANGUAGE,
         split=DATASET_SPLIT,
     )
     # print("\nIndividual Scores:", game_scores) # Uncomment to see all scores
except FileNotFoundError:
     print("\nError: One or both datasets could not be loaded from Hugging Face Hub.")
     print("Please ensure the dataset identifiers and your internet connection are correct.")
except ValueError as ve:
     print(f"\nValueError during dataset loading or processing: {ve}")
     print("This might indicate an issue with the dataset structure, split name, or configuration name.")
except Exception as e:
     print(f"\nAn unexpected error occurred: {e}")
     # Print the full stack trace for anything not handled above.
     import traceback
     traceback.print_exc()

In [None]:
# Report the results of the run above. The language shown comes from
# MATRIX_LANGUAGE (the value the evaluation was actually run with) instead of
# a hard-coded "hi", so the summary and plot title cannot go stale.
print("\n--- Evaluation Summary ---")
print(f"Matrix Language: {MATRIX_LANGUAGE}")
print(f"Number of entries processed: {len(game_scores)}")
print(f"Number of failed/skipped evaluations: {failed_score}")
print(f"Average GAME Score (over all processed): {avg_score:.2f}")


# Histogram of the scores on a fixed 0-100 axis. The imports stay at cell
# level: the later checkpoint-summary cell may rely on `plt` being bound here.
try:
    import matplotlib.pyplot as plt
    import numpy as np
    plt.figure(figsize=(10, 5))
    plt.hist(np.array(game_scores), bins=20, range=(0, 100))
    plt.title(f"Distribution of GAME Scores (Matrix Lang: {MATRIX_LANGUAGE})")
    plt.xlabel("GAME Score (0-100)")
    plt.ylabel("Frequency")
    plt.grid(axis='y', alpha=0.75)
    plt.show()
except ImportError:
    print("\nInstall matplotlib to see the score distribution plot: pip install matplotlib numpy")

In [None]:
# Reload the saved scores from disk and show the first row as a sanity check.
# Presumably this is the file written as CHECKPOINT_FILE; the absolute path
# assumes the working directory is /kaggle/working — TODO confirm.
df = pd.read_parquet("/kaggle/working/game_scores_checkpoint.parquet")
print(df.head(1))

In [None]:
# Summarize the scores loaded into `df`. A score of exactly 0 is treated as a
# failed/skipped evaluation and is left out of the average and the histogram.

# Imported here so this cell works on its own (e.g. after a kernel restart)
# instead of relying on an earlier plotting cell having bound `plt`.
import matplotlib.pyplot as plt

# Calculate values
total_entries = len(df)
failed_score = (df["score"] == 0).sum()
processed_scores = df[df["score"] > 0]["score"]
# The mean of an empty selection is NaN; report 0.0 in that case.
avg_score = processed_scores.mean() if not processed_scores.empty else 0.0

# Print summary
print("\n--- Evaluation Summary ---")
print("Matrix Language: hi")
print(f"Number of entries processed: {total_entries}")
print(f"Number of failed/skipped evaluations: {failed_score}")
# The label states what is actually averaged: zero scores are excluded above.
print(f"Average GAME Score (over non-zero scores): {avg_score:.2f}")

# Plot distribution
plt.figure(figsize=(10, 5))
plt.hist(processed_scores, bins=20, range=(0, 100))
plt.title("Distribution of GAME Scores (Matrix Lang: hi)")
plt.xlabel("GAME Score (0-100)")
plt.ylabel("Frequency")
plt.grid(axis='y', alpha=0.75)
plt.show()
