diff --git a/neurons/validator.py b/neurons/validator.py index 6f8370d2d..a865cfc6a 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -21,6 +21,7 @@ from prompting.llms import HuggingFacePipeline, vLLMPipeline from prompting.base.validator import BaseValidatorNeuron from prompting.rewards import RewardPipeline +from prompting.tasks import TranslationPipeline class Validator(BaseValidatorNeuron): @@ -38,7 +39,8 @@ def __init__(self, config=None): model_id=self.config.neuron.model_id, device=self.device, mock=self.config.mock, - ) + ) + self.translation_pipeline = TranslationPipeline() if abs(1-sum(self.config.neuron.task_p)) > 0.001: raise ValueError("Task probabilities do not sum to 1.") diff --git a/prompting/__init__.py b/prompting/__init__.py index bac567719..8c91d882c 100644 --- a/prompting/__init__.py +++ b/prompting/__init__.py @@ -36,6 +36,7 @@ from . import agent from . import conversation from . import dendrite +from . import shared from .llms import hf diff --git a/prompting/cleaners/all_cleaners.py b/prompting/cleaners/all_cleaners.py index 019e3e063..4a7c46dbd 100644 --- a/prompting/cleaners/all_cleaners.py +++ b/prompting/cleaners/all_cleaners.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Union import bittensor as bt import re from typing import Union diff --git a/prompting/conversation.py b/prompting/conversation.py index e17f5cd3d..3e13a9a4a 100644 --- a/prompting/conversation.py +++ b/prompting/conversation.py @@ -1,12 +1,13 @@ import random from transformers import Pipeline -from prompting.tasks import Task, TASKS +from prompting.tasks import Task, TASKS, TranslationPipeline, TranslationTask from prompting.tools import Selector, DATASETS from prompting.task_registry import TASK_REGISTRY def create_task( llm_pipeline: Pipeline, + translation_pipeline: TranslationPipeline, task_name: str, create_reference: bool = True, selector: Selector = random.choice, @@ -42,6 +43,12 @@ def create_task( raise 
ValueError(f"Dataset {dataset_name} not found") else: dataset = dataset() + + if task_name == TranslationTask.name: + return task( + translation_pipeline=translation_pipeline, + context=dataset.next() + ) return task( llm_pipeline=llm_pipeline, diff --git a/prompting/forward.py b/prompting/forward.py index 073903178..170f85254 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -290,6 +290,7 @@ async def forward(self): try: task = create_task( llm_pipeline=self.llm_pipeline, + translation_pipeline=self.translation_pipeline, task_name=task_name, create_reference=False, ) diff --git a/prompting/shared/__init__.py b/prompting/shared/__init__.py new file mode 100644 index 000000000..79503d91a --- /dev/null +++ b/prompting/shared/__init__.py @@ -0,0 +1 @@ +from .context import Context \ No newline at end of file diff --git a/prompting/tools/datasets/context.py b/prompting/shared/context.py similarity index 100% rename from prompting/tools/datasets/context.py rename to prompting/shared/context.py diff --git a/prompting/task_registry.py b/prompting/task_registry.py index de12c105e..f111a29d9 100644 --- a/prompting/task_registry.py +++ b/prompting/task_registry.py @@ -1,4 +1,4 @@ -from .tasks import Task, MockTask, SummarizationTask, QuestionAnsweringTask, DebuggingTask, MathTask, DateQuestionAnsweringTask, GenericInstructionTask, SentimentAnalysisTask +from .tasks import Task, MockTask, SummarizationTask, QuestionAnsweringTask, DebuggingTask, MathTask, DateQuestionAnsweringTask, GenericInstructionTask, SentimentAnalysisTask, TranslationTask from .tools import MockDataset, WikiDataset, HFCodingDataset, StackOverflowDataset, MathDataset, WikiDateDataset, GenericInstructionDataset, ReviewDataset # TODO: Expand this to include extra information beyond just the task and dataset names @@ -8,6 +8,7 @@ math_task, math_dataset = MathTask.name, [MathDataset.name] date_qa_task, date_qa_dataset = DateQuestionAnsweringTask.name, [WikiDateDataset.name] 
generic_instruction_task, generic_instruction_dataset = GenericInstructionTask.name, [GenericInstructionDataset.name] +translation_task, translation_dataset = TranslationTask.name, [WikiDataset.name] sentiment_analysis_task, sentiment_analysis_dataset = SentimentAnalysisTask.name, [ReviewDataset.name] TASK_REGISTRY = { @@ -17,5 +18,6 @@ math_task: math_dataset, date_qa_task: date_qa_dataset, generic_instruction_task: generic_instruction_dataset, + translation_task: translation_dataset, sentiment_analysis_task: sentiment_analysis_dataset, } \ No newline at end of file diff --git a/prompting/tasks/__init__.py b/prompting/tasks/__init__.py index 04d0282b3..6794c97ca 100644 --- a/prompting/tasks/__init__.py +++ b/prompting/tasks/__init__.py @@ -5,10 +5,10 @@ from .date_qa import DateQuestionAnsweringTask from .generic_instruction import GenericInstructionTask from .math import MathTask +from .translate import TranslationTask, TranslationPipeline from .mock import MockTask from .sentiment import SentimentAnalysisTask - TASKS = { QuestionAnsweringTask.name: QuestionAnsweringTask, DateQuestionAnsweringTask.name: DateQuestionAnsweringTask, @@ -16,5 +16,6 @@ #DebuggingTask.name: DebuggingTask, GenericInstructionTask.name: GenericInstructionTask, MathTask.name: MathTask, + TranslationTask.name: TranslationTask, SentimentAnalysisTask.name: SentimentAnalysisTask, } diff --git a/prompting/tasks/translate.py b/prompting/tasks/translate.py new file mode 100644 index 000000000..1913db6ed --- /dev/null +++ b/prompting/tasks/translate.py @@ -0,0 +1,130 @@ +import tqdm +import bittensor as bt +import argostranslate.package +import argostranslate.translate +import random +from typing import List, Tuple +from prompting.tasks import Task +from dataclasses import dataclass +from argostranslate.package import AvailablePackage +from prompting.shared import Context + +SUPPORTED_LANGUAGES = [ + "en", "es", "fr", "pt", "uk" +] + +class TranslationPipeline: + def __init__(self): + 
self.supported_language_pairs = self.load_translation_packages() + + def load_translation_packages(self) -> List[AvailablePackage]: + # Update package index and get available and installed packages + argostranslate.package.update_package_index() + available_packages = argostranslate.package.get_available_packages() + installed_packages = argostranslate.package.get_installed_packages() + + # Helper function to check if a package is installed + def is_package_installed(from_code, to_code, packages): + return any(pkg for pkg in packages if pkg.from_code == from_code and pkg.to_code == to_code) + + # Filter available packages for supported language pairs + supported_language_pairs = [ + pkg for pkg in available_packages + if pkg.from_code in SUPPORTED_LANGUAGES and pkg.to_code in SUPPORTED_LANGUAGES + ] + + bt.logging.info(f"Supported language pairs: {supported_language_pairs}") + + # Check for installed packages + pbar = tqdm.tqdm(supported_language_pairs, desc="Checking installed packages") + for package in pbar: + if not is_package_installed(package.from_code, package.to_code, installed_packages): + bt.logging.info(f"Installing package from {package.from_code} to {package.to_code}") + package_path = str(package.download()) + argostranslate.package.install_from_path(package_path) + bt.logging.success(f'Package successfully installed at {package_path}') + else: + bt.logging.info(f"Package from {package.from_code} to {package.to_code} is already installed, skipping...") + + return supported_language_pairs + + def random_translation(self, content:str) -> str: + # TODO: NOT RANDOM + from_code = SUPPORTED_LANGUAGES[0] + to_code = SUPPORTED_LANGUAGES[1] + return argostranslate.translate.translate(content, from_code, to_code) + + def translate(self, content: str, from_code: str, to_code: str): + self.reference = argostranslate.translate.translate(content, from_code, to_code) + + def translate_to_random_language(self, content: str, from_code:str='en') -> 
Tuple[AvailablePackage, str]: + english_supported_languages = list(filter(lambda x: x.from_code == from_code, self.supported_language_pairs)) + available_translations = list(map(lambda x: x, english_supported_languages)) + + random_translation_obj = random.choice(available_translations) + translation_code = random_translation_obj.to_code + + translated_content = argostranslate.translate.translate(content, from_code, to_code=translation_code) + + return random_translation_obj, translated_content + + +@dataclass +class TranslationTask(Task): + name = "translation" + desc = "get translation help" + goal = "to get the translation for the given piece of text" + + templates = [ + "Could you assist me with translating the following text into {another_language}? \n{text}", + "I need some help translating this text into {another_language}. Can you do it? \n{text}", + "Is it possible for you to translate this text for me into {another_language}? Here it is: \n{text}", + "Would you mind helping me convert this text into {another_language}? \n{text}", + "Could you please convert this into {another_language} for me? \n{text}", + "I was wondering if you could help translate this into {another_language}? \n{text}", + "Can you provide a translation for this text into {another_language}? \n{text}", + "Hey, can you turn this text into {another_language} for me? \n{text}", + "Could I get some assistance in translating this into {another_language}? \n{text}", + "Are you able to help me render this text in {another_language}? \n{text}", + "I'd appreciate your help translating this text into {another_language}. Here's the text: \n{text}", + "Please could you translate the following text into {another_language}? \n{text}", + "Might you help me by translating this text to {another_language}? \n{text}", + "I'm looking for help to translate this text into {another_language}. Any chance you can assist? \n{text}", + "How about translating this text into {another_language} for me? 
\n{text}", + "Would it be possible for you to help translate this text into {another_language}? \n{text}", + "I need your expertise to translate this text into {another_language}, can you help? \n{text}", + "Can you work your magic and translate this text into {another_language}? \n{text}", + "I require assistance translating the following into {another_language}. Can you help? \n{text}", + "Hey, could you take a moment to translate this text into {another_language} for me? \n{text}", + ] + + # TODO: TEST BLEU SCORE + reward_definition = [ + dict(name="rouge", ngram="rouge-1", metric="f", weight=1), + ] + penalty_definition = [ + dict(name="rouge", ngram="rouge-1", metric="f", weight=1), + ] + + cleaning_pipeline = [] + + def __init__(self, translation_pipeline: TranslationPipeline, context: Context): + # Set task internal variables + self.context = context + self.topic = context.title + self.subtopic = context.topic + self.tags = context.tags + + # Translates english text to a random language + content_translation_obj, translated_content = translation_pipeline.translate_to_random_language(context.content) + + # Translates the translation to another random language + reference_translation_obj, reference_translation_content = translation_pipeline.translate_to_random_language(content=translated_content, from_code=content_translation_obj.to_code) + self.reference = reference_translation_content + + # Composes the query + # TODO: Implement template translation + template = random.choice(self.templates) + self.query = template.format(another_language=reference_translation_obj.to_name, text=translated_content) + + \ No newline at end of file diff --git a/prompting/tools/__init__.py b/prompting/tools/__init__.py index 1722a9d84..82e3713d9 100644 --- a/prompting/tools/__init__.py +++ b/prompting/tools/__init__.py @@ -1,5 +1,4 @@ from .datasets import ( - Context, Dataset, MockDataset, HFCodingDataset, diff --git a/prompting/tools/datasets/__init__.py 
b/prompting/tools/datasets/__init__.py index b1187ee60..3bdda191b 100644 --- a/prompting/tools/datasets/__init__.py +++ b/prompting/tools/datasets/__init__.py @@ -1,4 +1,3 @@ -from .context import Context from .base import Dataset from .code import HFCodingDataset, StackOverflowDataset from .math import MathDataset diff --git a/prompting/tools/datasets/base.py b/prompting/tools/datasets/base.py index a2130e929..ee4535601 100644 --- a/prompting/tools/datasets/base.py +++ b/prompting/tools/datasets/base.py @@ -24,7 +24,7 @@ import bittensor as bt from ..selector import Selector -from .context import Context +from prompting.shared.context import Context from prompting.utils.exceptions import MaxRetryError diff --git a/requirements.txt b/requirements.txt index 4d77ae6c5..9a91e8df0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ langchainhub==0.1.14 python-dotenv wikipedia_sections vllm -loguru \ No newline at end of file +loguru +argostranslate \ No newline at end of file diff --git a/tests/fixtures/task.py b/tests/fixtures/task.py index 3eca818e9..af6e9d055 100644 --- a/tests/fixtures/task.py +++ b/tests/fixtures/task.py @@ -6,8 +6,9 @@ DebuggingTask, MathTask, DateQuestionAnsweringTask, + TranslationTask ) -from prompting.tools import Context +from prompting.shared import Context from .dataset import WIKI_CONTEXT, CODING_CONTEXT, MATH_CONTEXT, DATEQA_CONTEXT, MOCK_CONTEXT TASKS = [ @@ -17,6 +18,8 @@ DebuggingTask, MathTask, DateQuestionAnsweringTask, + #TODO: Add proper separation for translation task tests + #TranslationTask ] CONTEXTS = { @@ -26,6 +29,7 @@ DebuggingTask: CODING_CONTEXT, MathTask: MATH_CONTEXT, DateQuestionAnsweringTask: DATEQA_CONTEXT, + #TranslationTask: WIKI_CONTEXT } TASK_FIELDS = { diff --git a/tests/test_dataset.py b/tests/test_dataset.py index ae352debe..9d208bd1a 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -2,7 +2,7 @@ from .fixtures.dataset import DATASETS, CONTEXTS, CONTEXT_FIELDS from 
prompting.tools.datasets import Dataset -from prompting.tools import Context +from prompting.shared import Context @pytest.mark.parametrize("dataset", DATASETS)