# The purpose of this python script is to scrape task and task, user and log related information off of the freelancing platform Toloka and to further store these features in a DataFrame

### Importing Dependencies

In [3]:
import requests
from lxml import html
import pandas as pd

## Fetching the Tasks content from Toloka

In [4]:
myToken = "y0_AgAAAABkcQnJAACtpQAAAADO2AZOBndtdy51TQKGGGQ8qCdk0Z57kCs"

In [5]:
myUrl="https://toloka.yandex.com/api/task-suite-pool-groups?userLangs=en"
head = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres = requests.get(myUrl, headers=head)

In [6]:
# storing the fetched content into variable 'toloka_content'
toloka_content = poolsres.json()

In [7]:
# having a look at the content
print(toloka_content)

[{'refUuid': '00d4695c-1c41-4d18-95ca-85b4ed658328', 'groupUuid': 'd51813a1-3d7f-4af0-af41-46d42673b44b', 'projectId': 105273, 'mayContainAdultContent': False, 'lang': 'EN', 'title': '[Toloka] English Grammar Test', 'description': "Take the test and get more paid tasks.\nAfter you pass the test, you'll get access to more difficult tasks that require advanced language skills.", 'hasInstructions': True, 'snapshotMajorVersion': 1, 'snapshotMinorVersion': 1, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': '71daea4cffae4488067aebfb6583914a', 'name': {'EN': 'Toloka'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': False, 'iframeSubdomain': '71daea4cffae4488067aebfb6583914a', 'trainingDetails': {'training': False}, 'projectStats': {'grade': {'total_grade': 4.8}, 'averageSubmitTimeSec': 316, 'moneyAvgHourly': 0.0}, 'pools': [{'id': 36185755, 'startedAt': '2022-11-03T19:07:12.911', 'reward': '0.000', 'assignmentMaxDuration

In [11]:
# creating a separate dictionary for separate tasks

def create_tasks_dictionary(content):
    dicts = []
    for i in range(len(content)):
        individual_dicts = dict(content[i])
        dicts.append(individual_dicts)
    return dicts

In [12]:
tasks_dictionary = create_tasks_dictionary(toloka_content)

print('The total number of tasks: ' + str(len(tasks_dictionary)))

The total number of tasks: 4


In [13]:
print(tasks_dictionary)

[{'refUuid': 'b362e5fc-dd12-4ab2-85db-eaff40fc3fd6', 'groupUuid': '227f7cd2-45cf-422c-a399-d4f956a49535', 'projectId': 85377, 'mayContainAdultContent': False, 'title': 'Object recognition & detection', 'description': 'Outline a road signs in images', 'hasInstructions': True, 'snapshotMajorVersion': 1, 'snapshotMinorVersion': 1, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': 'a841946b60a3fc816968533af5f8b371', 'name': {'EN': 'michaelyuyangtong'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': True, 'iframeSubdomain': 'a841946b60a3fc816968533af5f8b371', 'trainingDetails': {'training': False}, 'projectStats': {'grade': {'total_grade': 1.84}}, 'pools': [{'id': 32474078, 'startedAt': '2022-03-20T11:29:40.108', 'reward': '0.010', 'assignmentMaxDurationSeconds': 300, 'acceptancePeriodDays': 7}], 'projectMetaInfo': {'projectId': 85377, 'experimentMeta': {'dj_project_class__snippet__segmentation': '1'}}}, {'refUuid': 'b36

### Fetching the lists of Tasks from the Toloka Content

In [14]:
# creating a function to fetch the list of tasks from the tasks dictionary we created above
def fetch_tasks(task_dict):
    tasks = []
    for i in range(len(task_dict)):
        title = task_dict[i].get('title')
        tasks.append(title)
    return tasks

In [15]:
list_of_tasks = fetch_tasks(tasks_dictionary)

In [16]:
# printing the lists of tasks available on Toloka
print("The list of tasks on Toloka is provided below:")
print(' ')
for i in range(len(list_of_tasks)):
    print(str(i+1)+ ' '+ str(list_of_tasks[i]))

The list of tasks on Toloka is provided below:
 
1 Object recognition & detection
2 [Toloka] English Grammar Test
3 Check the similarity between two products
4 Сделайте  и загрузите фото собаки/Take and upload photos of a dog


### Fetching other task related information from the Toloka Content

In [100]:
# creating a function to fetch other important task related information from the tasks dictionary we created above
def fetch_other_info(task_dict):
    project_id = []
    adult_content = []
    description = []
    instructions_flag = []
    assignement_issuing_type = []
    requester_id = []
    requester_name = []
    requester_trusted_flag = []
    availability = []
    post_accept_flag = []
    training = []
    project_stat = []
    avg_money_hourly = []
    pool_id = []
    pool_started_at = []
    reward = []
    assignment_duration = []
    acceptance_period = []
    regular_subtype = []
    
    for i in range(len(task_dict)):
        project_ids = task_dict[i].get('projectId')
        adult_content_flag = task_dict[i].get('mayContainAdultContent')
        descriptions = task_dict[i].get('description')
        instructions_flags = task_dict[i].get('hasInstructions')
        assignement_issuing_types = task_dict[i].get('assignmentIssuingType')
        requester_ids = task_dict[i].get('requesterInfo', {}).get('id')
        requester_names = task_dict[i].get('requesterInfo', {}).get('name', {}).get('EN')
        requester_trusted_flags = task_dict[i].get('requesterInfo', {}).get('trusted')
        availabilitys = task_dict[i].get('availability', {}).get('available')
        post_accept_flags = task_dict[i].get('postAccept')
        trainings = task_dict[i].get('trainingDetails', {}).get('training')
        project_stats = task_dict[i].get('projectStats', {}).get('grade', {}).get('total_grade')
        avg_money_hour =  task_dict[i].get('projectStats', {}).get('moneyAvgHourly')
        pool_ids = task_dict[i].get('pools')[0].get('id')
        pool_started_ats = task_dict[i].get('pools')[0].get('startedAt')
        rewards = task_dict[i].get('pools')[0].get('reward')
        assignment_durations = task_dict[i].get('pools')[0].get('assignmentMaxDurationSeconds')
        acceptance_periods = task_dict[i].get('pools')[0].get('acceptancePeriodDays')
        regular_subtypes = task_dict[i].get('pools')[0].get('regularSubtype')
        
        project_id.append(project_ids)
        adult_content.append(adult_content_flag)
        description.append(descriptions)
        instructions_flag.append(instructions_flags)
        assignement_issuing_type.append(assignement_issuing_types)
        requester_id.append(requester_ids)
        requester_name.append(requester_names)
        requester_trusted_flag.append(requester_trusted_flags)
        availability.append(availabilitys) 
        post_accept_flag.append(post_accept_flags)
        training.append(trainings)
        project_stat.append(project_stats)
        avg_money_hourly.append(avg_money_hour)
        pool_id.append(pool_ids)
        pool_started_at.append(pool_started_ats)
        reward.append(rewards)
        assignment_duration.append(assignment_durations)
        acceptance_period.append(acceptance_periods)
        regular_subtype.append(regular_subtypes)
    return project_id, adult_content, description, instructions_flag, assignement_issuing_type, requester_id, requester_name, requester_trusted_flag, availability, post_accept_flag, training, project_stat, avg_money_hourly, pool_id, pool_started_at, reward, assignment_duration, acceptance_period, regular_subtype

In [101]:
project_id, adult_content, description, instructions_flag, assignement_issuing_type,requester_id, requester_name, requester_trusted_flag, availability, post_accept_flag, training, project_stat, avg_money_hourly, pool_id, pool_started_at, reward, assignment_duration, acceptance_period, regular_subtype = fetch_other_info(tasks_dictionary)

In [102]:
project_id

[85377, 105273, 78414, 114652]

In [103]:
adult_content

[False, False, False, False]

In [104]:
description

['Outline a road signs in images',
 "Take the test and get more paid tasks.\nAfter you pass the test, you'll get access to more difficult tasks that require advanced language skills.",
 'In this task, you will see two product listings. The PRODUCT is a product that a user has bookmarked for \nwhich we would like to show them. The RECOMMENDED PRODUCT offers a product we \nmight suggest to the user. The goal of this task is to classify these recommendations into one of the \nfollowing categories based on how closely the RECOMMENDED PRODUCT matches the PRODUCT.',
 'Сделайте фото собаки согласно инструкции и загрузите. Перед выполнением задания внимательно ознакомьтесь с инструкцией и просмотрите примеры. Все задания проходят нашу тщательную проверку!\nTake photos of the dog according to the instructions and upload it. Before completing the task, carefully read the instructions and view the examples. All tasks are carefully checked by us!\n']

In [105]:
instructions_flag

[True, True, True, True]

In [106]:
assignement_issuing_type

['AUTOMATIC', 'AUTOMATIC', 'AUTOMATIC', 'AUTOMATIC']

In [107]:
requester_id

['a841946b60a3fc816968533af5f8b371',
 '71daea4cffae4488067aebfb6583914a',
 'dbaa749ed83b3ef412a3167f6c53c363',
 '631b832d247a0b9add115fb0a8346e6f']

In [108]:
requester_name

['michaelyuyangtong', 'Toloka', 'ADE4PA', 'NtechLab']

In [109]:
requester_trusted_flag

[False, False, False, False]

In [110]:
availability

[True, True, True, True]

In [111]:
post_accept_flag

[True, False, False, True]

In [112]:
training

[False, False, True, False]

In [113]:
project_stat

[1.84, 4.62, 4.34, None]

In [114]:
pool_id

[32474078, 36185755, 35838995, 36088797]

In [115]:
pool_started_at

['2022-03-20T11:29:40.108',
 '2022-11-03T19:07:12.911',
 '2022-10-11T23:36:03.957',
 '2022-11-10T12:00:54.397']

In [116]:
reward

['0.010', '0.000', '0.000', '0.500']

In [117]:
assignment_duration

[300, 900, 120, 600000]

In [118]:
acceptance_period

[7, None, None, 7]

In [119]:
regular_subtype

[None, 'EXAM', None, None]

In [None]:
avg_money_hourly

### Storing the above tasks related information in a DataFrame

In [126]:
task_data = pd.DataFrame({"title": list_of_tasks, "project_id": project_id, "adult_content": adult_content, "description": description, "instructions_flag": instructions_flag, "assignment_issuing_type": assignement_issuing_type, "requester_id": requester_id, "requester_name": requester_name, "requester_trusted_flag": requester_trusted_flag, "availability": availability, "post_accept_flag": post_accept_flag, "training": training, "project_stat": project_stat, "avg_money_hourly": avg_money_hourly, "pool_id": pool_id, "pool_started_at": pool_started_at, "reward": reward, "assignment_duration": assignment_duration, "acceptance_period": acceptance_period, "regular_subtype": regular_subtype})

In [127]:
task_data

Unnamed: 0,title,project_id,adult_content,description,instructions_flag,assignment_issuing_type,requester_id,requester_name,requester_trusted_flag,availability,post_accept_flag,training,project_stat,pool_id,pool_started_at,reward,assignment_duration,acceptance_period,regular_subtype
0,Object recognition & detection,85377,False,Outline a road signs in images,True,AUTOMATIC,a841946b60a3fc816968533af5f8b371,michaelyuyangtong,False,True,True,False,1.84,32474078,2022-03-20T11:29:40.108,0.01,300,7.0,
1,[Toloka] English Grammar Test,105273,False,Take the test and get more paid tasks.\nAfter ...,True,AUTOMATIC,71daea4cffae4488067aebfb6583914a,Toloka,False,True,False,False,4.62,36185755,2022-11-03T19:07:12.911,0.0,900,,EXAM
2,Check the similarity between two products,78414,False,"In this task, you will see two product listing...",True,AUTOMATIC,dbaa749ed83b3ef412a3167f6c53c363,ADE4PA,False,True,False,True,4.34,35838995,2022-10-11T23:36:03.957,0.0,120,,
3,Сделайте и загрузите фото собаки/Take and upl...,114652,False,Сделайте фото собаки согласно инструкции и заг...,True,AUTOMATIC,631b832d247a0b9add115fb0a8346e6f,NtechLab,False,True,True,False,,36088797,2022-11-10T12:00:54.397,0.5,600000,7.0,


## Fetching User related content from Toloka

In [8]:
tolokaUserEndpoint = "https://toloka.yandex.com/api/users/current/worker"
head2 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres2 = requests.get(tolokaUserEndpoint, headers=head2)

In [9]:
# storing the fetched content into variable 'user_content'
user_content = poolsres2.json()

In [10]:
print(user_content)

{'uid': 1685129673, 'login': 'mitanshi', 'role': 'WORKER', 'userLang': 'EN', 'defaultEmail': 'mitanshi@yandex.com', 'connectionId': 't:3470263886', 'authorizationStatus': 'VALID', 'avatarId': '0/0-0', 'displayName': 'mitanshi', 'fullName': 'Mitanshi Vyas', 'firstName': 'Mitanshi', 'lastName': 'Vyas', 'isAccountOwner': True, 'actualUser': {'uid': 1685129673, 'idpAlias': 'YANDEX_ID_LEGACY', 'idpIdentity': '1685129673', 'enterpriseSsoIdpAlias': False, 'login': 'mitanshi', 'role': 'WORKER', 'userLang': 'EN', 'defaultEmail': 'mitanshi@yandex.com', 'displayName': 'mitanshi', 'readOnlyModeToActUnderAccount': False}, 'availableAccounts': [], 'createdDate': '2022-09-09', 'systemBan': False, 'birthDay': '1999-07-10', 'country': 'US', 'languages': ['EN'], 'adultAllowed': False, 'acceptedEula': 13, 'rating': 0, 'authoritiesInfo': {'issuedAuthorities': ['U_TRANSACTIONS_CREATE', 'APP', 'U_ASSIGNMENTS_SUBMIT', 'U_MESSAGES_VIEW', 'U_ASSIGNMENTS_HISTORY', 'APP_USER', 'U_FORUM_EDIT', 'U_ASSIGNMENTS_UNDE

In [11]:
type(user_content)

dict

In [115]:
def fetch_user_info(user_dict):
    user_id = []
    role = []
    user_language = []
    default_email = []
    connection_id = []
    full_name = []
    birth_day = []
    country = []
    
    user_ids = user_dict.get('uid')
    roles = user_dict.get('role')
    user_languages = user_dict.get('userLang')
    default_emails = user_dict.get('defaultEmail')
    connection_ids = user_dict.get('connectionId')
    full_names = user_dict.get('fullName')
    birth_days = user_dict.get('birthDay')
    countrys = user_dict.get('country')
    
    user_id.append(user_ids)
    role.append(roles)
    user_language.append(user_languages)
    default_email.append(default_emails)
    connection_id.append(connection_ids)
    full_name.append(full_names)
    birth_day.append(birth_days)
    country.append(countrys)
    return user_id, role, user_language, default_email, connection_id, full_name, birth_day, country

In [116]:
user_id, role, user_language, default_email, connection_id, full_name, birth_day, country = fetch_user_info(user_content)

In [25]:
user_id

[1685129673]

In [26]:
role

['WORKER']

In [27]:
user_language

['EN']

In [28]:
default_email

['mitanshi@yandex.com']

In [29]:
connection_id

['t:3470263886']

In [30]:
full_name

['Mitanshi Vyas']

In [31]:
birth_day

['1999-07-10']

In [32]:
country

['US']

In [33]:
user_data = pd.DataFrame({"user_id": user_id, "role": role, "user_language": user_language, "default_email": default_email, "connection_id": connection_id, "full_name": full_name, "birth_day": birth_day, "country": country})

In [34]:
user_data

Unnamed: 0,user_id,role,user_language,default_email,connection_id,full_name,birth_day,country
0,1685129673,WORKER,EN,mitanshi@yandex.com,t:3470263886,Mitanshi Vyas,1999-07-10,US


## Fetching Task Reciept related content from Toloka

In [39]:
tolokaIncomeLogEndpoint = "https://toloka.yandex.com/api/worker/finance/income-log"
head3 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres3 = requests.get(tolokaIncomeLogEndpoint, headers=head3)

In [40]:
# storing the fetched content into variable 'user_content'
income_content = poolsres3.json()

In [41]:
print(income_content)

[{'date': '2022-10-05', 'assignments': [{'date': '2022-10-05', 'requester': {'login': 'idrnd.inc', 'id': 'c69b008e44d038469bf39335173dce31', 'name': {'EN': 'ID R&D Inc'}, 'role': 'REQUESTER'}, 'project': {'id': 105540, 'name': 'Take a picture of your face on a webcam'}, 'income': '0.000', 'totalIncome': '0.000', 'blockedIncome': '0.000', 'detalizationMode': 'PROJECT', 'additionalReward': '0.000', 'additionalRewardTolokaFee': '0.000'}]}, {'date': '2022-10-18', 'assignments': [{'date': '2022-10-18', 'requester': {'login': 'ADE4PA', 'id': 'dbaa749ed83b3ef412a3167f6c53c363', 'name': {'EN': 'ADE4PA'}, 'role': 'REQUESTER'}, 'project': {'id': 78414, 'lang': 'EN', 'name': 'Check the similarity between two products'}, 'income': '0.000', 'totalIncome': '0.000', 'blockedIncome': '0.000', 'detalizationMode': 'PROJECT', 'additionalReward': '0.000', 'additionalRewardTolokaFee': '0.000'}, {'date': '2022-10-18', 'requester': {'login': 'zen-yndx', 'id': '71daea4cffae4488067aebfb6583914a', 'name': {'EN'

In [38]:
type(income_content)

dict

In [49]:
income_content[0]

{'date': '2022-10-05',
 'assignments': [{'date': '2022-10-05',
   'requester': {'login': 'idrnd.inc',
    'id': 'c69b008e44d038469bf39335173dce31',
    'name': {'EN': 'ID R&D Inc'},
    'role': 'REQUESTER'},
   'project': {'id': 105540,
    'name': 'Take a picture of your face on a webcam'},
   'income': '0.000',
   'totalIncome': '0.000',
   'blockedIncome': '0.000',
   'detalizationMode': 'PROJECT',
   'additionalReward': '0.000',
   'additionalRewardTolokaFee': '0.000'}]}

In [92]:
(income_content[0].get('assignments'))[0].get('date')

'2022-10-05'

## Fetching Income related content from Toloka

In [102]:
def fetch_income_info(income_dict):
    project_id = []
    title = []
    date = []
    
    for i in range(len(income_dict)):
        project_ids = income_dict[i].get('assignments')[0].get('project', {}).get('id')
        dates = income_dict[i].get('date')
        titles =  income_dict[i].get('assignments')[0].get('project', {}).get('name')
        
        project_id.append(project_ids)
        date.append(dates)
        title.append(titles)
    
    return project_id, date, title

In [103]:
project_id, date, title = fetch_income_info(income_content)

In [104]:
project_id

[105540, 78414]

In [105]:
date

['2022-10-05', '2022-10-18']

In [106]:
title

['Take a picture of your face on a webcam',
 'Check the similarity between two products']

In [118]:
user_id = [user_id[0]]*len(title)

In [120]:
income_data = pd.DataFrame({"user_id": user_id, "project_id": project_id, "date": date, "title": title})

In [121]:
income_data

Unnamed: 0,user_id,project_id,date,title
0,1685129673,105540,2022-10-05,Take a picture of your face on a webcam
1,1685129673,78414,2022-10-18,Check the similarity between two products


## Fetching Log Activity related content from Toloka

In [122]:
tolokaNewTasksEndpoint = "https://toloka.yandex.com/api/task-suite-pool-groups?userLangs=EN"
head4 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres4 = requests.get(tolokaNewTasksEndpoint, headers=head4)

In [123]:
# storing the fetched content into variable 'user_content'
log_content = poolsres4.json()

In [124]:
print(log_content)

[{'refUuid': 'e80c5cc5-0bf9-4c2d-9024-a4eafd605f52', 'groupUuid': 'ef8f2175-1225-47d4-93fc-3f673904dce5', 'projectId': 114652, 'mayContainAdultContent': False, 'lang': 'RU', 'title': 'Сделайте  и загрузите фото собаки/Take and upload photos of a dog', 'description': 'Сделайте фото собаки согласно инструкции и загрузите. Перед выполнением задания внимательно ознакомьтесь с инструкцией и просмотрите примеры. Все задания проходят нашу тщательную проверку!\nTake photos of the dog according to the instructions and upload it. Before completing the task, carefully read the instructions and view the examples. All tasks are carefully checked by us!\n', 'hasInstructions': True, 'snapshotMajorVersion': 4, 'snapshotMinorVersion': 10, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': '631b832d247a0b9add115fb0a8346e6f', 'name': {'EN': 'NtechLab'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': True, 'iframeSubdomain': '631b832d24

In [125]:
type(log_content)

list