@inproceedings{patwa2020sentimix,
 title={SemEval-2020 Task 9: Overview of Sentiment Analysis of Code-Mixed Tweets},
 author={Patwa, Parth and
 Aguilar, Gustavo and
 Kar, Sudipta and
 Pandey, Suraj and
 PYKL, Srinivas and
 Gamb{\"a}ck, Bj{\"o}rn and
 Chakraborty, Tanmoy and
 Solorio, Thamar and
 Das, Amitava},
 booktitle = {Proceedings of the 14th International Workshop on Semantic Evaluation ({S}em{E}val-2020)},
 year = {2020},
 month = {December},
 address = {Barcelona, Spain},
 publisher = {Association for Computational Linguistics},
}

  
@inproceedings{nayak-joshi-2022-l3cube,
    title = "{L}3{C}ube-{H}ing{C}orpus and {H}ing{BERT}: A Code Mixed {H}indi-{E}nglish Dataset and {BERT} Language Models",
    author = "Nayak, Ravindra  and
      Joshi, Raviraj",
    editor = "Jha, Girish Nath  and
      L., Sobha  and
      Bali, Kalika  and
      Ojha, Atul Kr.",
    booktitle = "Proceedings of the WILDRE-6 Workshop within the 13th Language Resources and Evaluation Conference",
    month = jun,
    year = "2022",
    address = "Marseille, France",
    publisher = "European Language Resources Association",
    url = "https://aclanthology.org/2022.wildre-1.2/",
    pages = "7--12",
    abstract = "Code-switching occurs when more than one language is mixed in a given sentence or a conversation. This phenomenon is more prominent on social media platforms and its adoption is increasing over time. Therefore code-mixed NLP has been extensively studied in the literature. As pre-trained transformer-based architectures are gaining popularity, we observe that real code-mixing data are scarce to pre-train large language models. We present L3Cube-HingCorpus, the first large-scale real Hindi-English code mixed data in a Roman script. It consists of 52.93M sentences and 1.04B tokens, scraped from Twitter. We further present HingBERT, HingMBERT, HingRoBERTa, and HingGPT. The BERT models have been pre-trained on codemixed HingCorpus using masked language modelling objectives. We show the effectiveness of these BERT models on the subsequent downstream tasks like code-mixed sentiment analysis, POS tagging, NER, and LID from the GLUECoS benchmark. The HingGPT is a GPT2 based generative transformer model capable of generating full tweets. Our models show significant improvements over currently available models pre-trained on multiple languages and synthetic code-mixed datasets. We also release L3Cube-HingLID Corpus, the largest code-mixed Hindi-English language identification(LID) dataset and HingBERT-LID, a production-quality LID model to facilitate capturing of more code-mixed data using the process outlined in this work. The dataset and models are available at \url{https://github.com/l3cube-pune/code-mixed-nlp}."
}

@inproceedings{lovenia2022ascend,
  title={ASCEND: A Spontaneous Chinese-English Dataset for Code-switching in Multi-turn Conversation},
  author={Lovenia, Holy and Cahyawijaya, Samuel and Winata, Genta Indra and Xu, Peng and Yan, Xu and Liu, Zihan and Frieske, Rita and Yu, Tiezheng and Dai, Wenliang and Barezi, Elham J and others},
  booktitle={Proceedings of the 13th Language Resources and Evaluation Conference (LREC)},
  year={2022}

In [1]:
# !pip install ratelimit
# !pip install tqdm
# !pip install spacy
# !pip install unidecode
# !pip install datasets
# !pip install -q -U google-genai
# !python -m spacy download en_core_web_sm

In [2]:
import spacy
import json
import re
import textwrap
import time
import os
import gc
import psutil
import tracemalloc
import unidecode
import pandas as pd
import pyarrow.parquet as pq
from datasets import load_dataset, Dataset
from IPython.display import Markdown
from google import genai
from ratelimit import limits, sleep_and_retry 
from tqdm import tqdm 
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("bigcode/bigcodebench")

README.md:   0%|          | 0.00/8.83k [00:00<?, ?B/s]

v0.1.0_hf-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

v0.1.1-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

v0.1.2-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

v0.1.3-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

v0.1.4-00000-of-00001.parquet:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

Generating v0.1.0_hf split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.1 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.2 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.3 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Generating v0.1.4 split:   0%|          | 0/1140 [00:00<?, ? examples/s]

In [4]:
data = ds['v0.1.4']
new_df = data.to_pandas()
print(new_df.columns)
print("Unique entry points: ",new_df.entry_point.unique())
print("Total libs: ", new_df.canonical_solution.nunique())
print("Total complete_prompt", new_df.complete_prompt.nunique())

Index(['task_id', 'complete_prompt', 'instruct_prompt', 'canonical_solution',
       'code_prompt', 'test', 'entry_point', 'doc_struct', 'libs'],
      dtype='object')
Unique entry points:  ['task_func']
Total libs:  1137
Total complete_prompt 1139


In [5]:
# languages = ['hindi']
languages = ['chinese']

In [6]:
api_key = "api_key"
hugging_key = "api_key"
client = genai.Client(api_key=api_key)

In [7]:
nlp = spacy.load("en_core_web_sm")  # Load the spaCy English language model

def remove_code_blocks(text):
    return re.sub(r"```[\s\S]*?```", "", text)

def create_imp_eng_dict(doc, p_type):
    imp_eng = {}
    if p_type == 'doc_struct':
        exclude_keys = ['description', 'Args', 'notes', 'params', 'returns', 'reqs', 'raises', 'examples']
    else:
        exclude_keys = set()
    for token in doc:
        if token.pos_ in ["NOUN", "ADJ", "ADV", "CCONJ", "INTJ"] and token.text.lower() not in exclude_keys:  # Or use token.tag_ for finer control
            imp_eng[token.text.lower()] = []  # Initialize as a list (for potential multiple translations later)
    return imp_eng

def create_lan_eng_dict(imp_eng, translated_sentence):
    lan_eng_dict = {}
    translated_tokens = nltk.word_tokenize(translated_sentence)
    for english_word in imp_eng:
        lan_translations = []
        for token in translated_tokens:
            try:
                back_translated = translator.translate(token, src='hi', dest='en').text.lower()
                if english_word == back_translated:
                    lan_translations.append(token)
            except:
                pass  # Handle translation errors appropriately
        lan_eng_dict[english_word] = lan_translations
    return lan_eng_dict

def enforce_original_keys(original_dict, translated_dict):
    corrected_dict = {}

    for key in original_dict.keys():
        # Ensure the key exists in translated_dict, otherwise keep original key with empty value
        corrected_dict[key] = translated_dict.get(key, [])  

    return corrected_dict
    
def string_to_dict(input_string):
    try:
        return json.loads(input_string)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

def restore_blocks(original_text, translated_text):

    # Extract code blocks from the original text
    original_code_blocks = re.findall(r"```[\s\S]*?```", original_text)
    
    # Find all code blocks in the translated text
    translated_code_blocks = re.findall(r"```[\s\S]*?```", translated_text['translated_instruct_prompt'])

    # Replace translated code blocks with the original ones
    for translated_code, original_code in zip(translated_code_blocks, original_code_blocks):
        translated_text['translated_instruct_prompt'] = translated_text['translated_instruct_prompt'].replace(translated_code, original_code, 1)

    # doc_struct = string_to_dict(original_text)
    # tr_doc_struct = string_to_dict(translated_text.strip().removeprefix("```json").removesuffix("```").strip())
    # translated_text = enforce_original_keys(doc_struct, tr_doc_struct)
    # translated_text = json.dumps(translated_text, ensure_ascii=False)
    # translated_text['translated_instruct_prompt'] = translated_text.strip().removeprefix("```json").removesuffix("```").strip()

    keys = ["roman_instruct_dictionary", "translated_doc_struct_prompt", "roman_doc_struct_dictionary"]
    for key in keys:
        translated_text[key] = translated_text[key].strip().removeprefix("```json").removesuffix("```").strip()
        translated_text[key] = string_to_dict(translated_text[key])
        
    return translated_text

In [8]:
def translate_prompt(prompt, lan, i_imp_eng, d_imp_eng):
    m_prompt = f"""
Translate this "instruct" sentence in {lan}
-------------------------------------
{prompt['instruct_prompt']}
-------------------------------------
do not translate the code and programming terms(args, list etc) in the prompt. Make it more like human written.
and here is some of the important english pos tags: 
{i_imp_eng}
Look for the corresponding meaning of these pos in {lan} and look for the {lan} word in the translated sentence.
Now translate each {lan} word in the dictionary in Roman {lan} in three ways or spellings, all must be strictly different in spelling. Remember that the translation must be only in Anglo-Saxon script.
Create a json dictionary which contains the eng pos word as eng_word and the corresponding {lan} word in the translated sentence as {lan}_word if exists in {lan} and the roman Anglo-Saxon script translations of the {lan}_word.
Format above as RFC8259 compliant json dictionary, in the format [“eng”: <eng_word>, “{lan}”: <{lan}_word>,“roman_{lan}”: <transliterations>]

Translate this "doc_struct" prompt in {lan}\n-------------------------------------\n{prompt['doc_struct']}\n-------------------------------------\nThese are the requirements:-\nDo not translate the keys and do not translate the word 'Args'.\nDo not translate the code.\nDo not translate programming terms like args, list etc in the prompt. \nDo not translate any abbreviation.\nDo not translate 'reqs','raises','examples'.\nReturn the whole prompt. \nDo not miss to include any docstring element. \nOnly output the docstring dictionary with no other text at all.\n"
and here is some of the important english pos tags for this sentence: 
{d_imp_eng}
Look for the corresponding meaning of these pos in {lan} and look for the {lan} word in the translated sentence.
Now transliterate each {lan} word in the dictionary in Roman {lan} in three ways or spellings, all must be strictly different in spelling. Remember that the translation must be only in Anglo-Saxon script.
Create a json dictionary which contains the eng pos word as eng_word and the corresponding {lan} word in the translated sentence as {lan}_word if exists in {lan} and the roman Anglo-Saxon script translations of the {lan}_word.
Format above as RFC8259 compliant json dictionary, in the format [“eng”: <eng_word>, “{lan}”: <{lan}_word>,“roman_{lan}”: <transliterations>]


Only return the translated prompt and the roman dictionary in this format structure and nothing else:-
*****translated_instruct_prompt
[translated_prompt]
*****
*****roman_instruct_dictionary
[roman_dictionary]
*****
*****translated_doc_struct_prompt
[translated_prompt]
*****
*****roman_doc_struct_dictionary
[roman_dictionary]
*****
"""
    
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite", contents=m_prompt
    )

    # Define marker patterns
    markers = [
        "translated_instruct_prompt",
        "roman_instruct_dictionary",
        "translated_doc_struct_prompt",
        "roman_doc_struct_dictionary"
    ]
    
    # Regular expression pattern to capture each section
    pattern = r"\*\*\*\*\*({})\n(.*?)(?=\n\*\*\*\*\*|$)".format("|".join(markers))
    
    # Extract matches
    matches = re.findall(pattern, response.text, re.DOTALL)
    
    # Convert to dictionary
    extracted_text = {key: value.strip() for key, value in matches}

    return restore_blocks(prompt['instruct_prompt'], extracted_text)
    # return response.text

In [9]:
def romanize(prompt, lan):
    m_prompt = f"""
I want you to romanize the following from {lan} to its roman translation while keeping the words from english as it is. 
Remember that the translation must be only in roman Anglo-Saxon script. The translated prompt should not contain any {lan} words and all the spellings for english words must be correct.
-------------------------------------
{prompt}
-------------------------------------

Only return the roman translation in the exact format and structure as the original prompt.
"""
    
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite", contents=m_prompt
    )

    return unidecode.unidecode(response.text)

In [10]:
def pipeline(prompt, lan):
    cleaned_i_text = remove_code_blocks(prompt["instruct_prompt"])
    i_doc = nlp(cleaned_i_text)
    d_doc = nlp(prompt['doc_struct'])
    imp_eng = create_imp_eng_dict(i_doc, "instruct_prompt")
    dmp_eng = create_imp_eng_dict(d_doc, "doc_struct")
    translated_prompt = translate_prompt(prompt, lan, imp_eng, dmp_eng)
    
    return translated_prompt

In [11]:
# Function to load word frequencies from a Parquet file
def load_from_parquet(input_file):
    # Read the Parquet file into a PyArrow Table
    table = pq.read_table(input_file)
    # Convert the table to a pandas DataFrame
    df = table.to_pandas()
    # Convert the DataFrame to a dictionary for efficient lookup
    return pd.Series(df['Count'].values, index=df['Word'].str.lower()).to_dict()

In [12]:
hin_input_file = "../dataset/hindi-eng/word_frequency_hieng.parquet"
spa_input_file = "../dataset/spa-eng/word_frequency_speng.parquet"
chi_input_file = "../dataset/chi-eng/word_frequencies_ching.parquet"

# Load the word frequencies from the Parquet file
word_freq_dict = load_from_parquet(chi_input_file)

# Function to get the frequency of a word
def get_word_frequency(word):
    return word_freq_dict.get(word.lower(), 0)  # Case-insensitive lookup

In [13]:
def calculate_score(english_word, translated_word, roman_translations):  # Modified parameters
    """Calculates the score for a word based on frequency, using provided roman_translations."""
    english_freq = get_word_frequency(english_word)
    translated_freq = sum(get_word_frequency(t) for t in roman_translations)  # Use the provided roman translations

    score = float('inf') if translated_freq == 0 else english_freq / translated_freq
    return score


def replace_words(text, replacements, lan):
    """Replaces words in the text based on replacements dictionary."""
    
    text = str(text)

    for en_word, translated_word in replacements.items():
        text = text.replace(translated_word, en_word)

    return text


def algorithm1(translated_prompt, roman_dictionary, cmd, lan):  # Modified parameters
    """Implements Algorithm 1 (modified to use existing roman translations)."""

    replacements = {}

    for entry in roman_dictionary:
        english_word = entry['eng']
        translated_word = entry[lan]
        roman_translations = entry[f"roman_{lan}"]  # Get the pre-calculated romanizations from the dictionary!
        score = calculate_score(english_word, translated_word, roman_translations)  # Use the new calculate_score function
        replacements[english_word] = {'word': translated_word, 'score': score} # adding english_word, translated_word, score in dict


    sorted_replacements = dict(sorted(replacements.items(), key=lambda item: item[1]['score'], reverse=True))  # Sort by score (highest first)
    num_replacements = int(cmd * len(sorted_replacements))

    
    final_replacements = {}
    for i, (en_word, data) in enumerate(sorted_replacements.items()):  # Iterate through sorted replacements
        if i < num_replacements:
             final_replacements[en_word] = data['word'] # add words to be replaced


    code_mixed_text = replace_words(translated_prompt, final_replacements, lan) # replace words
    return code_mixed_text





# for lan in languages:
#   for i,prompt in new_df.iterrows():
#      generated_output = pipeline(prompt, lan)

#      code_mixed_instruct_prompt_09 = algorithm1(
#          generated_output['translated_instruct_prompt'],
#          generated_output['roman_instruct_dictionary'],
#          cmd=0.9,  # Example CMD value
#          lan=lan
#      )

#      code_mixed_doc_prompt_09 = algorithm1(
#          generated_output['translated_doc_struct_prompt'],
#          generated_output['roman_doc_struct_dictionary'],
#          cmd=0.9,  # Example CMD value
#          lan=lan
#      )
     
#      code_mixed_instruct_prompt_06 = algorithm1(
#          generated_output['translated_instruct_prompt'],
#          generated_output['roman_instruct_dictionary'],
#          cmd=0.6,  # Example CMD value
#          lan=lan
#      )

#      code_mixed_doc_prompt_06 = algorithm1(
#          generated_output['translated_doc_struct_prompt'],
#          generated_output['roman_doc_struct_dictionary'],
#          cmd=0.6,  # Example CMD value
#          lan=lan
#      )

#      print("CMD 0.9")
#      print(code_mixed_instruct_prompt_09)
#      print(code_mixed_doc_prompt_09)
#      print("\n CMD 0.6")
#      print(code_mixed_instruct_prompt_06)
#      print(code_mixed_doc_prompt_06)
     
#      if(lan == "hindi"):
#         code_mixed_instruct_prompt_09 = romanize(code_mixed_instruct_prompt_09, lan)
#         code_mixed_doc_prompt_09 = romanize(code_mixed_doc_prompt_09, lan).strip("`\n ") 
#         code_mixed_instruct_prompt_06 = romanize(code_mixed_instruct_prompt_06, lan)
#         code_mixed_doc_prompt_06 = romanize(code_mixed_doc_prompt_06, lan).strip("`\n ") 

     
#     #  prompt['complete_prompt'] = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt))
#     #  prompt['instruct_prompt'] = code_mixed_instruct_prompt
#     #  prompt['doc_struct'] = code_mixed_doc_prompt
     
#      # final = pd.concat([final, pd.DataFrame([prompt])], ignore_index=True)
#      break # remove this break if you want to implement on all the data.
#   break # remove this break if you want to implement on all the languages.

In [14]:
# rows_09 = []
# rows_06 = []

# for lan in languages:
#   for i,prompt in new_df.iterrows():
#      generated_output = pipeline(prompt, lan)

#      code_mixed_instruct_prompt_09 = algorithm1(
#          generated_output['translated_instruct_prompt'],
#          generated_output['roman_instruct_dictionary'],
#          cmd=0.9,  # Example CMD value
#          lan=lan
#      )

#      code_mixed_doc_prompt_09 = algorithm1(
#          generated_output['translated_doc_struct_prompt'],
#          generated_output['roman_doc_struct_dictionary'],
#          cmd=0.9,  # Example CMD value
#          lan=lan
#      )
     
#      code_mixed_instruct_prompt_06 = algorithm1(
#          generated_output['translated_instruct_prompt'],
#          generated_output['roman_instruct_dictionary'],
#          cmd=0.6,  # Example CMD value
#          lan=lan
#      )

#      code_mixed_doc_prompt_06 = algorithm1(
#          generated_output['translated_doc_struct_prompt'],
#          generated_output['roman_doc_struct_dictionary'],
#          cmd=0.6,  # Example CMD value
#          lan=lan
#      )

#      print("CMD 0.9")
#      print(code_mixed_instruct_prompt_09)
#      print(code_mixed_doc_prompt_09)
#      print("\n CMD 0.6")
#      print(code_mixed_instruct_prompt_06)
#      print(code_mixed_doc_prompt_06)
     
#      if(lan == "hindi"):
#         code_mixed_instruct_prompt_09 = romanize(code_mixed_instruct_prompt_09, lan)
#         code_mixed_doc_prompt_09 = romanize(code_mixed_doc_prompt_09, lan).strip("`\n ") 
#         code_mixed_instruct_prompt_06 = romanize(code_mixed_instruct_prompt_06, lan)
#         code_mixed_doc_prompt_06 = romanize(code_mixed_doc_prompt_06, lan).strip("`\n ") 

#      new_com_06 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_06))
#      new_row = prompt.to_dict()
#      new_row.update({
#          "lan": lan,
#          "cmd": 0.6,
#          "complete_prompt": new_com_06,
#          "instruct_prompt": code_mixed_instruct_prompt_06,
#          "doc_struct": code_mixed_doc_prompt_06
#      })
#      rows_06.append(new_row)

#      new_com_09 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_09))
#      new_row = prompt.to_dict()
#      new_row.update({
#          "lan": lan,
#          "cmd": 0.9,
#          "complete_prompt": new_com_09,
#          "instruct_prompt": code_mixed_instruct_prompt_09,
#          "doc_struct": code_mixed_doc_prompt_09
#      })

#      rows_09.append(new_row)
#     #  prompt['complete_prompt'] = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt))
#     #  prompt['instruct_prompt'] = code_mixed_instruct_prompt
#     #  prompt['doc_struct'] = code_mixed_doc_prompt
     
#      # final = pd.concat([final, pd.DataFrame([prompt])], ignore_index=True)
#      break # remove this break if you want to implement on all the data.
#   break # remove this break if you want to implement on all the languages.

In [15]:
def replace_docstring(original_text, new_docstring_dict):
    """Replaces the docstring in the original text with a formatted version of the new docstring dictionary."""
    indent = '    '
    lines = ['"""']

    # Format each section of the docstring
    if new_docstring_dict.get('description'):
        lines.extend(f"{indent}{line}" for line in new_docstring_dict['description'])
        lines.append("")

    if new_docstring_dict.get('notes'):
        lines.append(f"{indent}Notes:")
        lines.extend(f"{indent}    {line}" for line in new_docstring_dict['notes'])
        lines.append("")

    if new_docstring_dict.get('params'):
        lines.append(f"{indent}Parameters:")
        lines.extend(f"{indent}- {line}" for line in new_docstring_dict['params'])
        lines.append("")

    if new_docstring_dict.get('returns'):
        lines.append(f"{indent}Returns:")
        lines.extend(f"{indent}    {line}" for line in new_docstring_dict['returns'])
        lines.append("")

    if new_docstring_dict.get('reqs'):
        lines.append(f"{indent}Requirements:")
        lines.extend(f"{indent}- {line}" for line in new_docstring_dict['reqs'])
        lines.append("")

    if new_docstring_dict.get('raises'):
        lines.append(f"{indent}Raises:")
        lines.extend(f"{indent}- {line}" for line in new_docstring_dict['raises'])
        lines.append("")

    if new_docstring_dict.get('examples'):
        lines.append(f"{indent}Example:")
        for line in new_docstring_dict['examples']:
            # Safely escape all backslashes (e.g., for Windows paths or regex)
            safe_line = line.replace('\\', '\\\\')
            lines.append(f"{indent}{safe_line}")
        lines.append("")

    lines.append(indent + '"""')
    formatted_docstring = '\n'.join(lines)

    # Escape all backslashes in the final replacement string to prevent regex errors
    safe_formatted_docstring = formatted_docstring.replace('\\', r'\\')

    # Replace the first docstring in the original text
    updated_text = re.sub(r'("""[\s\S]*?""")', safe_formatted_docstring, original_text, count=1)

    return updated_text


In [41]:
ONE_MINUTE = 60
BATCH_SIZE = 12  # Process 12 requests at a time
CHECKPOINT_FILE = "checkpoints/MBigCodeBench_checkpoint.parquet"
ERROR_LOG_FILE = "error_log.txt"

rows_09 = []
rows_06 = []

def api_call_function(prompt, lan="Chinese"):
    """Makes an API call and processes the example."""
    try:
        start_time = time.time()  # Track start time

        generated_output = pipeline(prompt, lan)

        code_mixed_instruct_prompt_09 = algorithm1(
            generated_output['translated_instruct_prompt'],
            generated_output['roman_instruct_dictionary'],
            cmd=0.9,  # Example CMD value
            lan=lan
        )

        code_mixed_doc_prompt_09 = algorithm1(
            generated_output['translated_doc_struct_prompt'],
            generated_output['roman_doc_struct_dictionary'],
            cmd=0.9,  # Example CMD value
            lan=lan
        )
        
        code_mixed_instruct_prompt_06 = algorithm1(
            generated_output['translated_instruct_prompt'],
            generated_output['roman_instruct_dictionary'],
            cmd=0.6,  # Example CMD value
            lan=lan
        )

        code_mixed_doc_prompt_06 = algorithm1(
            generated_output['translated_doc_struct_prompt'],
            generated_output['roman_doc_struct_dictionary'],
            cmd=0.6,  # Example CMD value
            lan=lan
        )

        if(lan == "hindi"):
            code_mixed_instruct_prompt_09 = romanize(code_mixed_instruct_prompt_09, lan)
            code_mixed_doc_prompt_09 = romanize(code_mixed_doc_prompt_09, lan).strip("`\n ") 
            code_mixed_instruct_prompt_06 = romanize(code_mixed_instruct_prompt_06, lan)
            code_mixed_doc_prompt_06 = romanize(code_mixed_doc_prompt_06, lan).strip("`\n ") 

        new_com_06 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_06))
        new_row_06 = prompt.copy()
        new_row_06.update({
            "lan": lan,
            "cmd": 0.6,
            "complete_prompt": new_com_06,
            "instruct_prompt": code_mixed_instruct_prompt_06,
            "doc_struct": code_mixed_doc_prompt_06
        })
        # rows_06.append(new_row)

        new_com_09 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_09))
        new_row_09 = prompt.copy()
        new_row_09.update({
            "lan": lan,
            "cmd": 0.9,
            "complete_prompt": new_com_09,
            "instruct_prompt": code_mixed_instruct_prompt_09,
            "doc_struct": code_mixed_doc_prompt_09
        })

        # rows_09.append(new_row)

        elapsed_time = time.time() - start_time  # Compute API call time
        return (new_row_06, new_row_09), elapsed_time  # Return elapsed time to estimate ETA

    except json.JSONDecodeError as json_err:
        log_error(prompt, f"JSONDecodeError: {json_err}")
    except TypeError as type_err:
        log_error(prompt, f"TypeError: {type_err}")
    except Exception as e:  # Catch any unexpected errors
        log_error(prompt, f"API Error: {e}")

    return None, None  # Return None if an error occurs

# def api_call_function(prompt, lan="Hindi"):
#     """Makes an API call and processes the example."""
    
#     start_time = time.time()  # Track start time

#     generated_output = pipeline(prompt, lan)

#     code_mixed_instruct_prompt_09 = algorithm1(
#         generated_output['translated_instruct_prompt'],
#         generated_output['roman_instruct_dictionary'],
#         cmd=0.9,  # Example CMD value
#         lan=lan
#     )

#     code_mixed_doc_prompt_09 = algorithm1(
#         generated_output['translated_doc_struct_prompt'],
#         generated_output['roman_doc_struct_dictionary'],
#         cmd=0.9,  # Example CMD value
#         lan=lan
#     )
    
#     code_mixed_instruct_prompt_06 = algorithm1(
#         generated_output['translated_instruct_prompt'],
#         generated_output['roman_instruct_dictionary'],
#         cmd=0.6,  # Example CMD value
#         lan=lan
#     )

#     code_mixed_doc_prompt_06 = algorithm1(
#         generated_output['translated_doc_struct_prompt'],
#         generated_output['roman_doc_struct_dictionary'],
#         cmd=0.6,  # Example CMD value
#         lan=lan
#     )

#     if(lan == "hindi"):
#         code_mixed_instruct_prompt_09 = romanize(code_mixed_instruct_prompt_09, lan)
#         code_mixed_doc_prompt_09 = romanize(code_mixed_doc_prompt_09, lan).strip("`\n ") 
#         code_mixed_instruct_prompt_06 = romanize(code_mixed_instruct_prompt_06, lan)
#         code_mixed_doc_prompt_06 = romanize(code_mixed_doc_prompt_06, lan).strip("`\n ") 

#     new_com_06 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_06))
#     new_row_06 = prompt
#     new_row_06.update({
#         "lan": lan,
#         "cmd": 0.6,
#         "complete_prompt": new_com_06,
#         "instruct_prompt": code_mixed_instruct_prompt_06,
#         "doc_struct": code_mixed_doc_prompt_06
#     })
#     # rows_06.append(new_row)

#     new_com_09 = replace_docstring(prompt['complete_prompt'], eval(code_mixed_doc_prompt_09))
#     new_row_09 = prompt
#     new_row_09.update({
#         "lan": lan,
#         "cmd": 0.9,
#         "complete_prompt": new_com_09,
#         "instruct_prompt": code_mixed_instruct_prompt_09,
#         "doc_struct": code_mixed_doc_prompt_09
#     })

#     # rows_09.append(new_row)

#     elapsed_time = time.time() - start_time  # Compute API call time
#     return (new_row_06, new_row_09), elapsed_time  # Return elapsed time to estimate ETA


# Function to log errors
def log_error(prompt, error_message):
    # with open(ERROR_LOG_FILE, "a", encoding="utf-8") as log_file:
    #     log_file.write(json.dumps({"error": error_message, "prompt": prompt}, ensure_ascii=False) + "\n")
    print(f"Logged error: {error_message}")


# Load the dataset
dataset = ds['v0.1.4']
# dataset = df_missing_found

EXTRA_COLS = ["lan","cmd"]
base_cols  = dataset.column_names

# Load checkpoint if available
if os.path.exists(CHECKPOINT_FILE):
    print("Loading checkpoint...")
    checkpoint_df = pd.read_parquet(CHECKPOINT_FILE)
    processed_indices = set(checkpoint_df['task_id'])
else:
    checkpoint_df = pd.DataFrame(columns=base_cols+EXTRA_COLS)
    processed_indices = set()

# Convert dataset to pandas for efficient resuming
dataset_df = dataset.to_pandas()
# dataset_df = dataset
total_rows = len(dataset_df)
remaining_rows = total_rows - len(processed_indices)

# Initialize tqdm progress bar
progress_bar = tqdm(total=remaining_rows, desc="Processing Dataset", unit="row")

elapsed_times = []  # Store elapsed times to compute ETA

# Process dataset in batches of 15
unprocessed_rows = dataset_df[~dataset_df['task_id'].isin(processed_indices)]
indices_to_process = unprocessed_rows.index.tolist()

# indices_to_process = [idx for idx in dataset_df.index if idx not in processed_indices]
# indices_to_process = indices_to_process[760:1140]

for i in range(0, len(indices_to_process), BATCH_SIZE):
    batch_indices = indices_to_process[i : i + BATCH_SIZE]
    batch_rows = [dataset_df.loc[idx].to_dict() for idx in batch_indices]

    # Execute 15 API calls in parallel
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        future_to_idx = {executor.submit(api_call_function, row): idx for idx, row in zip(batch_indices, batch_rows)}

        batch_start_time = time.time()

        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                new_rows, elapsed_time = future.result()
                if new_rows:
                    for nr in new_rows:
                        if nr["cmd"] == 0.6:
                            rows_06.append(nr)
                        else:
                            rows_09.append(nr)

                        checkpoint_df = pd.concat(
                            [checkpoint_df, pd.DataFrame([nr])],
                            ignore_index=True
                        )
                    
                    checkpoint_df.to_parquet(CHECKPOINT_FILE, index=False)  # Save checkpoint
                    processed_indices.add(idx)

                    # Track time for ETA calculations
                    if elapsed_time:
                        elapsed_times.append(elapsed_time)
                        avg_time_per_row = sum(elapsed_times) / len(elapsed_times)
                        eta_seconds = avg_time_per_row * (remaining_rows - len(processed_indices))
                        progress_bar.set_postfix(ETA=f"{int(eta_seconds // 60)}m {int(eta_seconds % 60)}s")

                    progress_bar.update(1)

            except Exception as e:
                log_error(dataset_df.loc[idx].to_dict(), f"Unhandled Exception: {e}")

    # Ensure the full batch of 15 requests is completed before starting the next batch
    batch_elapsed_time = time.time() - batch_start_time
    remaining_wait_time = max(0, ONE_MINUTE - batch_elapsed_time)

    if remaining_wait_time > 0:
        print(f"Waiting {remaining_wait_time:.2f} seconds before next batch...")
        time.sleep(remaining_wait_time)

# Close progress bar
progress_bar.close()

# Convert checkpoint back to Hugging Face dataset
new_dataset = Dataset.from_pandas(checkpoint_df)

# Save the final dataset
new_dataset.to_parquet("final/MBigCodeBench_final.parquet")
print("Processing complete. Final dataset saved.")

Loading checkpoint...


Processing Dataset:   0%|          | 0/1 [00:00<?, ?row/s]

Processing Dataset: 100%|██████████| 1/1 [00:10<00:00, 10.71s/row, ETA=-200m 33s]

Waiting 49.29 seconds before next batch...


Processing Dataset: 100%|██████████| 1/1 [01:00<00:00, 60.00s/row, ETA=-200m 33s]


Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing complete. Final dataset saved.


In [42]:
final_df = pd.read_parquet("final/MBigCodeBench_final.parquet")
len(final_df)
final_df.head(1)

Unnamed: 0,task_id,complete_prompt,instruct_prompt,canonical_solution,code_prompt,test,entry_point,doc_struct,libs,lan,cmd
0,BigCodeBench/4,from collections import Counter\nimport iterto...,计算inputdictionary中每个values的integeroccurrence的次...,count_dict = Counter(itertools.chain.from_...,from collections import Counter\nimport iterto...,import unittest\nclass TestCases(unittest.Test...,task_func,{'description': ['计算inputdictionary中每个values的i...,"['collections', 'itertools']",Chinese,0.6


In [43]:
# Rows with cmd == 0.6
df_cmd06 = final_df[final_df["cmd"] == 0.6].copy().reset_index(drop=True)

# Rows with cmd == 0.9
df_cmd09 = final_df[final_df["cmd"] == 0.9].copy().reset_index(drop=True)

In [44]:
df_cmd09 = df_cmd09.drop(columns=["lan","cmd"])
df_cmd06 = df_cmd06.drop(columns=["lan","cmd"])

In [47]:
# Define a helper to sort in-place by the numeric suffix of task_id
def sort_by_task_num(df):
    # Extract the integer after the slash
    df['__task_num'] = df['task_id'].str.extract(r'/(\d+)$')[0].astype(int)
    # Sort by that integer
    df.sort_values('__task_num', inplace=True)
    # Drop the helper column
    df.drop(columns='__task_num', inplace=True)
    # df.drop(columns='id_num', inplace=True)
    # Reset index if you like
    df.reset_index(drop=True, inplace=True)

# Sort both subsets
sort_by_task_num(df_cmd06)
sort_by_task_num(df_cmd09)

In [51]:
def find_missing_task_ids(df, prefix="BigCodeBench", max_id=1139):
    """
    Returns a DataFrame of all the `prefix/<n>` task_ids in [0..max_id]
    that are not present in `df.task_id`.
    The original `df` is not modified.
    """
    # 1) work on a copy so we never touch the original
    tmp = df.copy()

    # 2) extract the numeric suffix into a helper column
    tmp['__id_num'] = (
        tmp['task_id']
           .str.extract(r'/(\d+)$')[0]
           .astype(int)
    )

    # 3) build the full expected set and find what's present
    expected = set(range(max_id + 1))
    present  = set(tmp['__id_num'])

    # 4) compute missing integers
    missing_nums = sorted(expected - present)

    # 5) re-compose full task_id strings
    missing_task_ids = [f"{prefix}/{n}" for n in missing_nums]

    # 6) return as a DataFrame
    return pd.DataFrame(missing_task_ids, columns=['missing_task_id'])
    
print(find_missing_task_ids(df_cmd06))
print(find_missing_task_ids(df_cmd09))

Empty DataFrame
Columns: [missing_task_id]
Index: []
Empty DataFrame
Columns: [missing_task_id]
Index: []


In [50]:
df_cmd06.to_parquet("../dataset/chi-eng/MBigCodeBench-chi-eng-cmd0.6.parquet")
df_cmd09.to_parquet("../dataset/chi-eng/MBigCodeBench-chi-eng-cmd0.9.parquet")