In [None]:
import requests
from tenacity import (
    retry, 
    stop_after_attempt, 
    wait_fixed, 
    retry_if_exception_type, 
    wait_random_exponential,
    )

# Source dataset: the cleaned Alpaca dataset, read from the repository's `main` branch
# (so each run picks up whatever version is current there).
url = 'https://raw.githubusercontent.com/gururise/AlpacaDataCleaned/main/alpaca_data_cleaned.json'
# Alternative source: the GPT-4-LLM Alpaca dataset (uncomment the line below to use it instead).
# url = 'https://raw.githubusercontent.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/main/data/alpaca_gpt4_data.json'

@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1),
      retry=retry_if_exception_type(requests.HTTPError))
def get(url, timeout=30):
    """
    Download ``url`` and return the ``requests.Response``.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the cell
            on a stalled connection.

    Returns:
        The response object of a request that did not report a bad status.

    Raises:
        A bad HTTP status (``requests.HTTPError``) is logged and retried by the
        decorator (3 attempts, 0.1 s apart); once the attempts are used up
        tenacity raises its own ``RetryError``. Any other ``requests``
        exception (e.g. a timeout) is not retried and propagates immediately.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()  # raise an error on a bad status
        return r
    except requests.HTTPError:
        # `r` is always bound here: HTTPError is only raised by raise_for_status()
        print(r.status_code, r.reason)
        raise

response = get(url)

print(f"Request returned {response.status_code} : '{response.reason}'")

# get() already raises for 4xx/5xx answers. Any other non-200 answer has no
# dataset in its body, so stop here with a clear message instead of failing
# later with a NameError on `payload`.
if response.status_code != 200:
    raise RuntimeError(
        f"Unexpected status {response.status_code} while downloading {url}"
    )

payload = response.json()

# Preview only the first records; displaying the whole dataset floods the cell output.
payload[:3]

In [None]:
!pip install openai

In [None]:
import os
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import re

# Prefer the OPENAI_API_KEY environment variable so no real key ends up in the
# notebook; the placeholder is only a fallback to be replaced for local runs.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY_GOES_HERE')
# Defining the variable is not enough: the client only uses the key it is given.
openai.api_key = OPENAI_API_KEY

# ISO 639-1 code of the target language (used in the output file name).
# See https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes

lang = "it"
# The target language name as written into the OpenAI prompt
LANGUAGE = "Italian"

# Set to True to translate and save the dataset in batches of CHUNK_SIZE items
# instead of all at once. Reduces peak memory footprint. DEFAULT: False
SPLIT_DATA = False

In [None]:
def matches_regex(regex, text):
    """Return True when ``regex`` matches anywhere inside ``text``, else False."""
    pattern = re.compile(regex)
    found = pattern.search(text)
    return found is not None


def contains_code(text):
    """
    Heuristically decide whether ``text`` looks like source code or markup.

    Returns True when the text contains one of the blacklisted substrings or
    matches one of the code-like patterns below, and False otherwise.
    """
    # filter based on keywords that indicate code
    code_blacklist = ['&&', '||', '<html>', ';\n', 'SELECT', "{", "["]

    code_patterns = [
        r'\w+\(\w*\) \{',  # e.g. myFunc() {
        r'def \w+\(',  # e.g. def parse_list(
        # e.g. this.language
        # The previous pattern began with an escaped '\[' and therefore only
        # matched the literal text "[A-z]"; it never caught attribute access.
        # Two or more letters are required on each side of the dot so that
        # abbreviations such as "e.g." or "U.S." are not mistaken for code.
        r'[A-Za-z]{2,}\.[A-Za-z]{2,}',
        r': [\w\.#]{1,12};',  # e.g. font-size: 1.3em;
        r'<\/\w+>',  # e.g. </html>
    ]

    # `or` / any() short-circuit, so the regexes are skipped once a hit is found
    return (
        any(code_keyword in text for code_keyword in code_blacklist)
        or any(re.search(pattern, text) is not None for pattern in code_patterns)
    )


def contains_words(text):
    """Return True if ``text`` contains a run of at least three ASCII letters."""
    # [A-Za-z], not [A-z]: the latter range also covers the characters
    # [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII, so placeholders such
    # as "___" were wrongly counted as words.
    return re.search(r'[A-Za-z]{3,}', text) is not None


def is_translatable(text):
    """
    Tell whether ``text`` should be sent for translation.

    Empty strings and anything flagged by ``contains_code`` are rejected;
    whatever remains is translatable only if ``contains_words`` finds a word.
    """
    if text == "":
        return False
    if contains_code(text):
        return False
    return contains_words(text)

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3))
def translate_OpenAI(value):
    """
    Ask the OpenAI gpt-3.5-turbo chat model to translate ``value`` from
    English into ``LANGUAGE``.

    Failed calls are retried by the decorator (at most three attempts, with a
    randomized exponential wait between them). Returns the content of the
    first choice with leading and trailing whitespace removed.
    """
    prompt = f"Translate the following text from English to {LANGUAGE}: '{value}'"
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=1024,
        temperature=0,
    )
    reply = completion.choices[0]["message"]["content"]
    return reply.strip()

In [None]:
def merge_json_files(folder_path):
    """
    Concatenate the contents of every ``.json`` file found in ``folder_path``.

    Each file's parsed content is appended with ``list.extend``, so every file
    is expected to hold a JSON array. Files are read in sorted file-name order
    because ``os.listdir`` returns entries in arbitrary order, which would make
    the merged result differ from run to run.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A single list with the items of all files, or ``[]`` if there are none.
    """
    json_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )
    json_data = []
    for file_name in json_files:
        with open(os.path.join(folder_path, file_name)) as f:
            json_data.extend(json.load(f))
    return json_data

In [None]:
# Create the folder for the per-chunk files. Unlike `!mkdir`, this does not
# complain when the cell is re-run and does not depend on the shell.
os.makedirs('./data', exist_ok=True)

In [None]:
def translate_text(value):
    """Translate ``value`` when it is translatable; otherwise return it untouched."""
    return translate_OpenAI(value) if is_translatable(value) else value

def translate_item(item):
    """
    Build a translated copy of one dataset record.

    Every key of ``item`` is kept. Truthy values go through ``translate_text``;
    falsy values are replaced by an empty string.
    """
    return {
        key: translate_text(value) if value else ''
        for key, value in item.items()
    }

# Maximum number of parallel requests
if SPLIT_DATA is False:
    MAX_PARALLEL_REQUESTS = 20
else:
    MAX_PARALLEL_REQUESTS = 50


def _translate_batch(items):
    """Translate ``items`` concurrently and return the results in input order."""
    results = [None] * len(items)
    with ThreadPoolExecutor(max_workers=MAX_PARALLEL_REQUESTS) as executor:
        # Remember each item's position: as_completed() yields futures in
        # completion order, which would otherwise shuffle the dataset.
        futures = {
            executor.submit(translate_item, item): index
            for index, item in enumerate(items)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Translating"):
            results[futures[future]] = future.result()
    return results


data = payload

CHUNK_SIZE = 1000
start = 0
end = len(data)
print(len(data))

if SPLIT_DATA is False:
    translated_data = _translate_batch(data[start:end])

    # Save the translated data to a new JSON file named 'stambecco_data_{lang}.json'
    with open(f'stambecco_data_{lang}.json', 'w') as f:
        json.dump(translated_data, f, ensure_ascii=False, indent=4)
else:
    # Translate the data in chunks
    for chunk_start in range(start, end, CHUNK_SIZE):
        # Clamp the last chunk so it never runs past the configured `end`
        chunk_end = min(chunk_start + CHUNK_SIZE, end)
        translated_data = _translate_batch(data[chunk_start:chunk_end])

        # Save this chunk; the same path is reported below so the message
        # always names the file that was actually written.
        chunk_path = f'./data/translated_data_up_to_{chunk_start}_to_{chunk_end}.json'
        with open(chunk_path, 'w') as f:
            json.dump(translated_data, f, ensure_ascii=False, indent=4)

        print(f"Translation complete. The translated data is saved in '{chunk_path}'")

    # NOTE: this merges every .json file present in ./data, so chunk files left
    # over from an earlier run are included as well.
    folder_path = './data'
    merged_data = merge_json_files(folder_path)
    with open(f'stambecco_data_{lang}.json', 'w') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)

print(f"Translation complete. The translated data is saved in 'stambecco_data_{lang}.json'")