First, we need to install `toloka-kit`

In [None]:
!pip3 install toloka-kit

In [None]:
import pandas
import numpy as np

import toloka.client as toloka
import toloka.client.project.template_builder as tb
from toloka.client.collectors import AssignmentSubmitTime
from toloka.client.actions import RestrictionV2
from toloka.client.conditions import FastSubmittedCount

import json
import datetime
from time import sleep
import threading

from task_processor import TaskProcessor

Create `TolokaClient` object

In [None]:
# Paste your OAuth token here; it is shown at https://toloka.yandex.com/requester/profile
toloka_token = '<YOUR TOKEN>'

# The second argument selects the environment: 'PRODUCTION' or 'SANDBOX'.
toloka_client = toloka.TolokaClient(toloka_token, 'PRODUCTION')

# Fetch the requester profile with the new client and show it.
requester_info = toloka_client.get_requester()
print(requester_info)

Specify the experiment prefix

In [None]:
# Prefix prepended to the names of the skills created for this experiment.
experiment = 'CrowdSpeech'

Let's create the project. Use `interface.json` to specify the task interface and `instructions.html` to specify the instructions shown to performers.

In [None]:
# Load the Template Builder interface definition. The context manager closes
# the file handle deterministically instead of leaving it to the garbage collector.
with open('interface.json', encoding='utf-8') as interface_file:
    tb_dict = json.load(interface_file)
tb_conf = toloka.structure(tb_dict, tb.TemplateBuilder)
project_interface = toloka.project.view_spec.TemplateBuilderViewSpec(config=tb_conf)

# Load the HTML instructions, dropping surrounding whitespace.
with open('instructions.html', encoding='utf-8') as instructions_file:
    prepared_instruction = instructions_file.read().strip()

annotation_project = toloka.project.Project(
    assignments_issuing_type=toloka.project.Project.AssignmentsIssuingType.AUTOMATED,
    public_name='Audio Transcription',
    public_description='Listen to the audio and write what you heard',
    public_instructions=prepared_instruction,
    # Set up the task: view, input, and output parameters
    task_spec=toloka.project.task_spec.TaskSpec(
        # Each task receives one audio URL...
        input_spec={'audio': toloka.project.field_spec.UrlSpec()},
        # ...and produces two string fields: 'playing' and 'transcription'.
        output_spec={
            'playing': toloka.project.field_spec.StringSpec(),
            'transcription': toloka.project.field_spec.StringSpec(),
        },
        view_spec=project_interface,
    ),
)

# Register the project; the returned object carries the server-assigned id.
annotation_project = toloka_client.create_project(annotation_project)
print(f'Created transcription project with id {annotation_project.id}')
print(f'To view the project, go to https://toloka.yandex.com/requester/project/{annotation_project.id}')

Now we need to create skills for quality control

In [None]:
def create_skill(name, description):
    """Return the skill called `name`, creating it when the lookup finds none.

    Args:
        name: Skill name passed to `toloka_client.get_skills` for the lookup
            and used as the name of a newly created skill.
        description: Text stored as the 'EN' public requester description
            when a new skill has to be created.

    Returns:
        The first skill yielded by the lookup, or the newly created
        (hidden) skill.
    """
    existing = next(toloka_client.get_skills(name=name), None)
    if existing:
        print('Skill already exists')
        return existing

    print('Create new skill')
    return toloka_client.create_skill(
        name=name,
        hidden=True,
        public_requester_description={'EN': description},
    )

# Skill used by the annotation pool filter to admit performers who passed the exam.
exam_skill = create_skill(name=f'{experiment} Audio Transcription (Exam)', description='')
# Skill used by the annotation pool filter to track quality on the real tasks.
transcription_skill = create_skill(name=f'{experiment} Audio Transcription', description='')

Let's create the exam pool. Its tasks have infinite overlap, and the skill earned in this pool is used to select the performers who will annotate the real tasks.

In [None]:
# The pool closes automatically one year from now.
exam_expiration = datetime.datetime.utcnow() + datetime.timedelta(days=365)
exam_defaults = toloka.pool.Pool.Defaults(
    default_overlap_for_new_task_suites=99,
    default_overlap_for_new_tasks=None,
)

exam_pool = toloka.pool.Pool(
    project_id=annotation_project.id,
    private_name='Exam other',  # Only you can see this information.
    type='EXAM',
    may_contain_adult_content=True,
    will_expire=exam_expiration,
    # Reward for one completed task page.
    reward_per_assignment=0.01,
    # Responses are not accepted on submission; the notebook reviews them
    # later (see the post-acceptance loop in `annotate`).
    auto_accept_solutions=False,
    # Number of days available for that review.
    auto_accept_period_day=1,
    # Performers get at most 20 minutes per task page.
    assignment_max_duration_seconds=20 * 60,
    defaults=exam_defaults,
)

# Every task page consists of 4 golden tasks, shuffled within the page.
exam_pool.set_mixer_config(
    real_tasks_count=0,
    golden_tasks_count=4,
    training_tasks_count=0,
    mix_tasks_in_creation_order=False,
    shuffle_tasks_in_task_suite=True,
)

# Audience: 'EN' among the performer's languages, and either the browser
# or the Toloka app as the client.
exam_language_filter = toloka.filter.FilterOr([toloka.filter.Languages.in_('EN')])
exam_client_filter = toloka.filter.FilterOr([
    toloka.filter.ClientType == 'BROWSER',
    toloka.filter.ClientType == 'TOLOKA_APP',
])
exam_pool.filter = exam_language_filter & exam_client_filter
print(exam_pool.private_name)

# Register the pool; the returned object carries the server-assigned id.
exam_pool = toloka_client.create_pool(exam_pool)
print(f'To view this pool, go to https://toloka.yandex.com/requester/project/{annotation_project.id}/pool/{exam_pool.id}')

Determine our WER thresholds

In [None]:
# WER thresholds. They are passed to `annotate` and also turned into minimum
# skill values (`100 - threshold`) in the annotation pool filter.
# NOTE(review): presumably percentages on a 0-100 scale — confirm in TaskProcessor.
EXAM_THRESHOLD = 40  # target WER to pass the exam
QUALITY_THRESHOLD = 35  # target transcription WER

Create the annotation pool

In [None]:
# The pool closes automatically one year from now.
annotation_expiration = datetime.datetime.utcnow() + datetime.timedelta(days=365)
annotation_defaults = toloka.pool.Pool.Defaults(
    default_overlap_for_new_task_suites=7,  # Overlap
    default_overlap_for_new_tasks=7,
)

annotation_pool = toloka.pool.Pool(
    project_id=annotation_project.id,
    private_name='test-clean',  # Only you can see this information.
    type='REGULAR',
    may_contain_adult_content=True,
    will_expire=annotation_expiration,
    # Reward for one completed task page.
    reward_per_assignment=0.01,
    # Responses are not accepted on submission; the notebook reviews them
    # later (see the post-acceptance loop in `annotate`).
    auto_accept_solutions=False,
    # Number of days available for that review.
    auto_accept_period_day=1,
    # Performers get at most 20 minutes per task page.
    assignment_max_duration_seconds=20 * 60,
    defaults=annotation_defaults,
)

# One real task per page, no golden or training tasks.
annotation_pool.set_mixer_config(real_tasks_count=1, golden_tasks_count=0, training_tasks_count=0)

# Audience, combined with AND below.
# Performer's profile language
speaks_english = toloka.filter.FilterOr([toloka.filter.Languages.in_('EN')])
supported_client = toloka.filter.FilterOr([
    toloka.filter.ClientType == 'BROWSER',
    toloka.filter.ClientType == 'TOLOKA_APP',
])
# Exam skill must be at least `100 - EXAM_THRESHOLD`.
passed_exam = toloka.filter.FilterOr([
    toloka.filter.Skill(exam_skill.id) >= (100 - EXAM_THRESHOLD),
])
# Transcription skill is either high enough or not assigned yet.
# `== None` is intentional: the comparison builds a filter condition,
# so it must not be rewritten as `is None`.
good_or_new_transcriber = toloka.filter.FilterOr([
    toloka.filter.Skill(transcription_skill.id) >= (100 - QUALITY_THRESHOLD),
    toloka.filter.Skill(transcription_skill.id) == None,
])
annotation_pool.filter = speaks_english & supported_client & passed_exam & good_or_new_transcriber

# Rule 1: a single response submitted in under 10 seconds permanently
# restricts the performer at project scope.
fast_response_ban = RestrictionV2(
    scope='PROJECT',
    duration_unit='PERMANENT',
    private_comment='Fast responses',
)
annotation_pool.quality_control.add_action(
    collector=AssignmentSubmitTime(fast_submit_threshold_seconds=10),  # Fast responses threshold
    conditions=[FastSubmittedCount > 0],
    action=fast_response_ban,
)

# Rule 2: each rejected assignment raises the overlap by one and reopens
# the pool.
rejected_assignment = toloka.conditions.AssessmentEvent == toloka.conditions.AssessmentEvent.REJECT
annotation_pool.quality_control.add_action(
    collector=toloka.collectors.AssignmentsAssessment(),
    conditions=[rejected_assignment],
    action=toloka.actions.ChangeOverlap(delta=1, open_pool=True),
)

print(annotation_pool.private_name)

# Register the pool; the returned object carries the server-assigned id.
annotation_pool = toloka_client.create_pool(annotation_pool)
print(f'To view this pool, go to https://toloka.yandex.com/requester/project/{annotation_project.id}/pool/{annotation_pool.id}')

Add tasks to the exam

In [None]:
# Ground-truth file: one exam item per line, tab-separated, with the audio
# URL in the first column and the reference transcription in the second.
with open('exam_gt.txt') as f:
    exam_gt_lines = f.readlines()

# `readlines()` keeps the line terminator, which would otherwise end up inside
# the golden transcription. Strip it before splitting, split every line only
# once, and skip blank lines (e.g. a trailing empty line at the end of file).
exam_gt_rows = [
    line.rstrip('\r\n').split('\t')
    for line in exam_gt_lines
    if line.strip()
]
exam_tasks_inputs = [row[0] for row in exam_gt_rows]
exam_tasks_golden = [row[1] for row in exam_gt_rows]

# One exam task per ground-truth row, carrying the reference transcription as
# its known solution; infinite overlap lets every performer take it.
exam_tasks = [
    toloka.task.Task(
        input_values={'audio': task_input},
        known_solutions=[
            toloka.task.BaseTask.KnownSolution(output_values={'transcription': golden})
        ],
        pool_id=exam_pool.id,
        infinite_overlap=True,
    )
    for task_input, golden in zip(exam_tasks_inputs, exam_tasks_golden)
]

created_exam_tasks = toloka_client.create_tasks(exam_tasks)
print(f'{len(created_exam_tasks.items)} tasks added to the pool {exam_pool.id}')

Add tasks to the annotation pool

In [None]:
# Ground-truth file for the real tasks; only its first tab-separated column
# (the audio URL) is used here. The same path is later handed to `annotate`.
tasks_file = 'crowdspeech-test-clean-gt.txt'

with open(tasks_file) as f:
    gt_lines = f.readlines()

# Strip the line terminator and skip blank lines so that a trailing empty line
# (or a single-column file) cannot produce a task with an empty or
# newline-terminated audio URL.
tasks_inputs = [
    line.rstrip('\r\n').split('\t')[0]
    for line in gt_lines
    if line.strip()
]

# One task per audio URL, each with an overlap of 7.
real_tasks = [
    toloka.task.Task(
        input_values={'audio': task_input},
        pool_id=annotation_pool.id,
        overlap=7,
    )
    for task_input in tasks_inputs
]

created_real_tasks = toloka_client.create_tasks(real_tasks)
print(f'{len(created_real_tasks.items)} tasks added to the pool {annotation_pool.id}')

This function will be used for the automatic post-acceptance

In [None]:
def annotate(pool_id, skill_id, gt_path, threshold, window):
    """Run a `TaskProcessor` over a pool repeatedly until the pool is closed.

    The pool is opened first if it is closed when the function starts. The
    pool state is then re-read before every pass; while the pool is not
    closed, one `process()` call is made followed by a 10-second pause.

    Args:
        pool_id: Id of the pool to open and monitor.
        skill_id: Forwarded to `TaskProcessor`.
        gt_path: Forwarded to `TaskProcessor`.
        threshold: Forwarded to `TaskProcessor`.
        window: Forwarded to `TaskProcessor`.
    """
    pool = toloka_client.get_pool(pool_id)
    if pool.is_closed():
        toloka_client.open_pool(pool_id)

    processor = TaskProcessor(toloka_client, pool_id, skill_id, gt_path, threshold, window)
    while True:
        # Refresh the pool state; the first refresh also picks up the
        # result of the `open_pool` call above.
        pool = toloka_client.get_pool(pool.id)
        if pool.is_closed():
            break
        processor.process()
        sleep(10)
    print(f'{pool_id} closed')

Start pools

In [None]:
# Each pool gets its own thread running `annotate`, so the exam and the
# annotation pool are processed concurrently.
exam_thread = threading.Thread(
    target=annotate,
    args=(exam_pool.id, exam_skill.id, 'exam_gt.txt', EXAM_THRESHOLD, 10),
)
annotation_thread = threading.Thread(
    target=annotate,
    args=(annotation_pool.id, transcription_skill.id, tasks_file, QUALITY_THRESHOLD, 5),
)

# Start the exam thread first, then the annotation thread.
for worker in (exam_thread, annotation_thread):
    worker.start()