In [None]:
import re

# Sample Python function whose docstring (including its doctests) is to be stripped
input_code = '''def strlen(string: str) -> int:
    """ Return length of given string
    >>> strlen('\\')
    0
    >>> strlen('abc')
    3
    """
    return len(string)'''

# Two passes, applied inside-out:
#   1. inner sub  - delete the triple-double-quoted docstring (DOTALL lets '.' cross newlines)
#   2. outer sub  - collapse the whitespace-only line left behind into a single newline
# The final strip() trims leading/trailing whitespace from the result.
output_code = re.sub(
    r'\n\s*\n',
    '\n',
    re.sub(r'\"\"\".*?\"\"\"', '', input_code, flags=re.DOTALL),
).strip()

print(output_code)

In [None]:
import re


def remove_comments_before_function(code):
    """Strip the run of ``//`` comment lines that directly precedes a ``public static`` declaration.

    A run is one or more consecutive lines that consist only of optional
    whitespace followed by a ``//`` comment. It is removed only when the next
    non-whitespace text is ``public static``; comments elsewhere (for example
    inside a method body) are left alone.

    Args:
        code: Java source text.

    Returns:
        The source with those comment runs removed and with leading/trailing
        whitespace stripped.
    """
    # '^' with MULTILINE anchors at the start of the text as well as after each
    # newline, so a comment block on the very first line is removed too.
    code_cleaned = re.sub(
        r'^(?:\s*//.*?\n)+(?=\s*public static)', '', code, flags=re.MULTILINE)

    return code_cleaned.strip()


# Sample Java input: a class whose method is preceded by a block of '//' doc
# comments and which also contains a '//' comment inside the method body.
java_code = '''import java.util.*;
import java.lang.reflect.*;
import org.javatuples.*;
import java.security.*;
import java.math.*;
import java.io.*;
import java.util.stream.*;
class Problem {
    // Return length of given string
    // >>> stringLength((\"\"))
    // (0l)
    // >>> stringLength((\"abc\"))
    // (3l)
    public static long strlen(String string) {
        // do something
        
        
        return call();
    }
}'''

# Strip the comment lines that sit directly above the 'public static' method;
# the comment inside the method body is not followed by 'public static' and stays.
cleaned_java_code = remove_comments_before_function(java_code)

print(cleaned_java_code)

In [1]:
import requests
import json
import sseclient

# NOTE(security): this token is a credential stored in clear text in the notebook.
# Prefer loading it from the environment or a secrets store, and rotate this value.
code_token = '7b4b2edb-04c7-42a8-b8cc-c26cc47bbf9d'

# Define the URL
url = 'https://codegeex.cn/prod/code/chatCodeSseV3/chat'

# Define the headers (make sure to include all the headers from the cURL command)
headers = {
    'accept': 'text/event-stream',
    'accept-language': 'en-US',
    'code-token': code_token,
    'content-type': 'application/json',
}

# Define the body of the request
body = {
    "lang": "Java",
    "machineId": "0b5607b5252bf2b518f0a541db6c2b8ef6f1699fdda8fe1c3ce378854cf2add3",
    "history": [],
    "command": "translation",
    "prompt": "",
    "locale": "en",
    "model": "codegeex-chat-pro",
    "code": """```Java
code translation
Python:
def strlen(string: str) -> int:
    return len(string)
Java:
import java.util.*;
import java.lang.reflect.*;
import org.javatuples.*;
import java.security.*;
import java.math.*;
import java.io.*;
import java.util.stream.*;
public class Problem {
    public static int strlen(String string) {
        return string.length();
    }
}
```"""
}

translated_code = ""

# stream=True lets the SSE client consume events as they arrive instead of
# requests buffering the whole body first; the 'with' block releases the
# connection even when we stop reading early on "finish".
with requests.post(url, json=body, headers=headers, stream=True) as response:
    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        print("Response body:", response.text)
    else:
        client = sseclient.SSEClient(response)

        for event in client.events():
            if event.event == "add":
                # Decode the payload only for "add" events, the only ones whose
                # data is read here.
                translated_code += json.loads(event.data).get("text", "")
            elif event.event == "finish":
                break

        # Printed after the loop so the accumulated text is shown even if the
        # stream ends without an explicit "finish" event.
        print("Translated Java code:")
        print(translated_code)

Translated Java code:
```Java
public class Problem {
    public static int strlen(String string) {
        return string.length();
    }
}
```


In [2]:
import requests
import json
import sseclient

def codegeex_translate(source_code, target_language="Java"):
    """Send ``source_code`` to the CodeGeeX chat endpoint and return the streamed reply text.

    The request uses the "translation" command and the "codegeex-chat-pro"
    model. The reply is read as a server-sent-event stream: the "text" field of
    every "add" event is appended to the result until a "finish" event arrives
    or the stream ends.

    Args:
        source_code: Sent verbatim as the "code" field of the request body.
        target_language: Sent as the "lang" field. Defaults to "Java".

    Returns:
        The concatenated text of all "add" events, or ``None`` when the HTTP
        status is not 200 (the status and body are printed in that case).
    """
    # NOTE(security): this token is a credential stored in clear text in the notebook.
    # Prefer loading it from the environment or a secrets store, and rotate this value.
    code_token = '7b4b2edb-04c7-42a8-b8cc-c26cc47bbf9d'

    # Define the URL
    url = 'https://codegeex.cn/prod/code/chatCodeSseV3/chat'

    # Define the headers
    headers = {
        'accept': 'text/event-stream',
        'accept-language': 'en-US',
        'code-token': code_token,
        'content-type': 'application/json',
    }

    # Define the body of the request
    body = {
        "lang": target_language,
        "machineId": "0b5607b5252bf2b518f0a541db6c2b8ef6f1699fdda8fe1c3ce378854cf2add3",
        "history": [],
        "command": "translation",
        "prompt": "",
        "locale": "en",
        "model": "codegeex-chat-pro",
        "code": source_code
    }

    # The 'with' block closes the streamed response on every exit path. Without
    # it, breaking out of the event loop early leaves the connection open, which
    # adds up quickly when this function is called from many threads.
    with requests.post(url, json=body, headers=headers, stream=True) as response:
        # Check if the response is successful
        if response.status_code != 200:
            print(f"Request failed with status code: {response.status_code}")
            print("Response body:", response.text)
            return None

        # Create an SSE client for processing the streaming response
        client = sseclient.SSEClient(response)

        translated_code = ""
        for event in client.events():
            # Check the event type before decoding, so a "finish" event always
            # ends the loop regardless of what its data contains.
            if event.event == "finish":
                break
            if event.event != "add":
                continue
            try:
                event_data = json.loads(event.data)
            except json.JSONDecodeError:
                print("Error decoding JSON:", event.data)
                continue
            translated_code += event_data.get("text", "")

    return translated_code

In [4]:
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


def translate_and_postprocess(translated_prompts_path, save_translated_codes_path, limit_start=None, limit=None, parallel=False):
    """Translate a JSON list of prompts with ``codegeex_translate`` and save the cleaned results.

    Each translation is post-processed by plain string replacement: markdown
    code fences and the ``java``/``Java`` fence tag are removed, the trailing
    ``"\\n    }\\n}\\n"`` is removed, ``public class`` becomes ``class``, and a
    fixed block of Java imports is prepended.

    Args:
        translated_prompts_path: Path to a JSON file containing a list of prompts.
        save_translated_codes_path: Path of the JSON file to write.
        limit_start: Index of the first prompt to process; ``None`` means 0.
        limit: Maximum number of prompts to process; ``None`` means "to the end".
        parallel: When true, translate with a ``ThreadPoolExecutor``; otherwise
            translate one prompt at a time with a tqdm progress bar.

    Returns:
        None. The output file holds one single-element list per selected
        prompt, in the same order as the prompts.
    """
    with open(translated_prompts_path, "r") as prompts_file:
        prompts = json.load(prompts_file)

    # Either bound may be given on its own.
    start = 0 if limit_start is None else limit_start
    stop = None if limit is None else start + limit
    selected_prompts = prompts[start:stop]

    java_imports = 'import java.util.*;\nimport java.lang.reflect.*;\nimport org.javatuples.*;\nimport java.security.*;\nimport java.math.*;\nimport java.io.*;\nimport java.util.stream.*;'

    replacements = [
        ('```', ''),
        ('java\n', ''),
        # The fence tag may be capitalised ("```Java"); str.replace is case-sensitive.
        ('Java\n', ''),
        ("\n    }\n}\n", ''),
        ("public class", "class")
    ]

    # Worker shared by the serial and the parallel path
    def process_prompt(prompt):
        translated_code = codegeex_translate(prompt)
        if translated_code is None:
            # Keep one entry per prompt so results stay aligned with the input,
            # instead of crashing on None.replace and losing the whole run.
            print("Translation failed; storing an empty translation for this prompt")
            translated_code = ""

        for text, new_text in replacements:
            translated_code = translated_code.replace(text, new_text)

        return [java_imports + '\n' + translated_code]

    if parallel:
        print("Using parallel processing")
        # executor.map yields results in the order of selected_prompts
        with ThreadPoolExecutor() as executor:
            translated_codes = list(executor.map(process_prompt, selected_prompts))

        print(f'Translated {len(translated_codes)} prompts in parallel')
    else:
        translated_codes = [
            process_prompt(prompt)
            for prompt in tqdm(selected_prompts, desc="Translating Prompts", unit="prompt")
        ]

        print(f'Translated {len(translated_codes)} prompts in serial')

    with open(save_translated_codes_path, 'w') as f:
        json.dump(translated_codes, f, indent=2)

In [None]:
# translated_prompts_path = "/Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/benchmark/codegeex2-6b/code2code/py-java/do_sampleFalse/codegeex2-6b-do_sampleFalse-bf16-n1-seed0-batch1-maxlen1024-java-generations-0-158_code2code-multiple-java_translated-prompts.json"

# translated_prompts_path = '/Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/translated-prompts.json'

# save_translated_codes_path = '/Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/codegeex-chat-pro-0-158_code2code-multiple-java-translated-codes.json'

# /Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/benchmark/datasets/MultiPL-E/humaneval-py-reworded.json

# Prompts to translate (JSON list); the path is specific to the author's machine.
translated_prompts_path = '/Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/benchmark/datasets/humaneval-x/humaneval_python_java_prompts.json'

limit_start = 0
# Process every prompt: count them once, closing the file as soon as it is read.
with open(translated_prompts_path, "r") as prompts_file:
    limit = len(json.load(prompts_file))
model = 'codegeex-chat-pro'

# The output file name records the model and the processed range.
save_translated_codes_path = f'/Users/phuonglvh/projects/2170558-thesis-automatic-code-generation-using-machine-learning/bigcode-evaluation-harness/benchmark/codegeex-chat-pro/humaneval-x/{model}-humaneval_python_java_prompts-translations-{limit_start}-{limit}.json'

translate_and_postprocess(translated_prompts_path, save_translated_codes_path, limit_start, limit, parallel=True)