# The purpose of this python script is to scrape task and task, user and log related information off of the freelancing platform Toloka and to further store these features in a DataFrame

### Importing Dependencies

In [None]:
import requests
from lxml import html
import pandas as pd

## Fetching the Tasks content from Toloka

In [None]:
myToken = "y0_AgAAAABkcQnJAACtpQAAAADO2AZOBndtdy51TQKGGGQ8qCdk0Z57kCs"

In [None]:
myUrl="https://toloka.yandex.com/api/task-suite-pool-groups?userLangs=en"
head = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres = requests.get(myUrl, headers=head)

In [None]:
# storing the fetched content into variable 'toloka_content'
toloka_content = poolsres.json()

In [None]:
# having a look at the content
print(toloka_content)

[{'refUuid': 'ef0b0733-165c-40c4-8e44-af1a1180e24d', 'groupUuid': '7f8a610c-7931-48f8-bdff-d21c19807d80', 'projectId': 78414, 'mayContainAdultContent': False, 'lang': 'EN', 'title': 'Check the similarity between two products', 'description': 'In this task, you will see two product listings. The PRODUCT is a product that a user has bookmarked for \nwhich we would like to show them. The RECOMMENDED PRODUCT offers a product we \nmight suggest to the user. The goal of this task is to classify these recommendations into one of the \nfollowing categories based on how closely the RECOMMENDED PRODUCT matches the PRODUCT.', 'hasInstructions': True, 'snapshotMajorVersion': 5, 'snapshotMinorVersion': 11, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': 'dbaa749ed83b3ef412a3167f6c53c363', 'name': {'EN': 'ADE4PA'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': False, 'iframeSubdomain': 'dbaa749ed83b3ef412a3167f6c53c363', 'trai

In [None]:
# creating a separate dictionary for separate tasks

def create_tasks_dictionary(content):
    dicts = []
    for i in range(len(content)):
        individual_dicts = dict(content[i])
        dicts.append(individual_dicts)
    return dicts

In [None]:
tasks_dictionary = create_tasks_dictionary(toloka_content)

print('The total number of tasks: ' + str(len(tasks_dictionary)))

The total number of tasks: 3


In [None]:
print(tasks_dictionary)

[{'refUuid': 'ef0b0733-165c-40c4-8e44-af1a1180e24d', 'groupUuid': '7f8a610c-7931-48f8-bdff-d21c19807d80', 'projectId': 78414, 'mayContainAdultContent': False, 'lang': 'EN', 'title': 'Check the similarity between two products', 'description': 'In this task, you will see two product listings. The PRODUCT is a product that a user has bookmarked for \nwhich we would like to show them. The RECOMMENDED PRODUCT offers a product we \nmight suggest to the user. The goal of this task is to classify these recommendations into one of the \nfollowing categories based on how closely the RECOMMENDED PRODUCT matches the PRODUCT.', 'hasInstructions': True, 'snapshotMajorVersion': 5, 'snapshotMinorVersion': 11, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': 'dbaa749ed83b3ef412a3167f6c53c363', 'name': {'EN': 'ADE4PA'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': False, 'iframeSubdomain': 'dbaa749ed83b3ef412a3167f6c53c363', 'trai

### Fetching the lists of Tasks from the Toloka Content

In [None]:
# creating a function to fetch the list of tasks from the tasks dictionary we created above
def fetch_tasks(task_dict):
    tasks = []
    for i in range(len(task_dict)):
        title = task_dict[i].get('title')
        tasks.append(title)
    return tasks

In [None]:
list_of_tasks = fetch_tasks(tasks_dictionary)

In [None]:
# printing the lists of tasks available on Toloka
print("The list of tasks on Toloka is provided below:")
print(' ')
for i in range(len(list_of_tasks)):
    print(str(i+1)+ ' '+ str(list_of_tasks[i]))

The list of tasks on Toloka is provided below:
 
1 Check the similarity between two products
2 Object recognition & detection
3 [Toloka] English Grammar Test


### Fetching other task related information from the Toloka Content

In [None]:
# creating a function to fetch other important task related information from the tasks dictionary we created above
def fetch_other_info(task_dict):
    project_id = []
    adult_content = []
    lang = []
    description = []
    instructions_flag = []
    assignement_issuing_type = []
    requester_id = []
    requester_name = []
    requester_trusted_flag = []
    availability = []
    post_accept_flag = []
    training = []
    project_stat = []
    avg_money_hourly = []
    money_med = []
    money_top = []
    money_max = []
    pool_id = []
    pool_started_at = []
    reward = []
    assignment_duration = []
    acceptance_period = []
    regular_subtype = []
    
#     'lang',
#  'moneyMed',
#  'moneyTop10',
#  'moneyMax3'
    
    for i in range(len(task_dict)):
        project_ids = task_dict[i].get('projectId')
        adult_content_flag = task_dict[i].get('mayContainAdultContent')
        langs = task_dict[i].get('lang')
        descriptions = task_dict[i].get('description')
        instructions_flags = task_dict[i].get('hasInstructions')
        assignement_issuing_types = task_dict[i].get('assignmentIssuingType')
        requester_ids = task_dict[i].get('requesterInfo', {}).get('id')
        requester_names = task_dict[i].get('requesterInfo', {}).get('name', {}).get('EN')
        requester_trusted_flags = task_dict[i].get('requesterInfo', {}).get('trusted')
        availabilitys = task_dict[i].get('availability', {}).get('available')
        post_accept_flags = task_dict[i].get('postAccept')
        trainings = task_dict[i].get('trainingDetails', {}).get('training')
        project_stats = task_dict[i].get('projectStats', {}).get('grade', {}).get('total_grade')
        avg_money_hour =  task_dict[i].get('projectStats', {}).get('moneyAvgHourly')
        money_meds =  task_dict[i].get('projectStats', {}).get('moneyMed')
        money_tops =  task_dict[i].get('projectStats', {}).get('moneyTop10')
        money_maxs =  task_dict[i].get('projectStats', {}).get('moneyMax3')
        pool_ids = task_dict[i].get('pools')[0].get('id')
        pool_started_ats = task_dict[i].get('pools')[0].get('startedAt')
        rewards = task_dict[i].get('pools')[0].get('reward')
        assignment_durations = task_dict[i].get('pools')[0].get('assignmentMaxDurationSeconds')
        acceptance_periods = task_dict[i].get('pools')[0].get('acceptancePeriodDays')
        regular_subtypes = task_dict[i].get('pools')[0].get('regularSubtype')
        
        project_id.append(project_ids)
        adult_content.append(adult_content_flag)
        lang.append(langs)
        description.append(descriptions)
        instructions_flag.append(instructions_flags)
        assignement_issuing_type.append(assignement_issuing_types)
        requester_id.append(requester_ids)
        requester_name.append(requester_names)
        requester_trusted_flag.append(requester_trusted_flags)
        availability.append(availabilitys) 
        post_accept_flag.append(post_accept_flags)
        training.append(trainings)
        project_stat.append(project_stats)
        avg_money_hourly.append(avg_money_hour)
        money_med.append(money_meds)
        money_top.append(money_tops)
        money_max.append(money_maxs)
        pool_id.append(pool_ids)
        pool_started_at.append(pool_started_ats)
        reward.append(rewards)
        assignment_duration.append(assignment_durations)
        acceptance_period.append(acceptance_periods)
        regular_subtype.append(regular_subtypes)
    return project_id, adult_content, lang, description, instructions_flag, assignement_issuing_type, requester_id, requester_name, requester_trusted_flag, availability, post_accept_flag, training, project_stat, avg_money_hourly, money_med, money_top, money_max, pool_id, pool_started_at, reward, assignment_duration, acceptance_period, regular_subtype

In [None]:
project_id, adult_content, lang, description, instructions_flag, assignement_issuing_type,requester_id, requester_name, requester_trusted_flag, availability, post_accept_flag, training, project_stat, avg_money_hourly,money_med, money_top, money_max, pool_id, pool_started_at, reward, assignment_duration, acceptance_period, regular_subtype = fetch_other_info(tasks_dictionary)

In [None]:
lang

['EN', None, 'EN']

In [None]:
project_id

[78414, 85377, 105273]

In [None]:
adult_content

[False, False, False]

In [None]:
description

['In this task, you will see two product listings. The PRODUCT is a product that a user has bookmarked for \nwhich we would like to show them. The RECOMMENDED PRODUCT offers a product we \nmight suggest to the user. The goal of this task is to classify these recommendations into one of the \nfollowing categories based on how closely the RECOMMENDED PRODUCT matches the PRODUCT.',
 'Outline a road signs in images',
 "Take the test and get more paid tasks.\nAfter you pass the test, you'll get access to more difficult tasks that require advanced language skills."]

In [None]:
instructions_flag

[True, True, True]

In [None]:
assignement_issuing_type

['AUTOMATIC', 'AUTOMATIC', 'AUTOMATIC']

In [None]:
requester_id

['dbaa749ed83b3ef412a3167f6c53c363',
 'a841946b60a3fc816968533af5f8b371',
 '71daea4cffae4488067aebfb6583914a']

In [None]:
requester_name

['ADE4PA', 'michaelyuyangtong', 'Toloka']

In [None]:
requester_trusted_flag

[False, False, False]

In [None]:
availability

[True, True, True]

In [None]:
post_accept_flag

[False, True, False]

In [None]:
training

[True, False, False]

In [None]:
project_stat

[4.31, 2.52, 4.51]

In [None]:
pool_id

[35838995, 32474078, 36185755]

In [None]:
pool_started_at

['2022-10-11T23:36:03.957',
 '2022-03-20T11:29:40.108',
 '2022-11-03T19:07:12.911']

In [None]:
reward

['0.000', '0.010', '0.000']

In [None]:
assignment_duration

[120, 300, 900]

In [None]:
acceptance_period

[None, 7, None]

In [None]:
regular_subtype

[None, None, 'EXAM']

In [None]:
avg_money_hourly

[0.0, None, 0.0]

In [None]:
len(task_data)

NameError: ignored

### Storing the above tasks related information in a DataFrame

In [None]:
c = [86400, 86400, 86400]

In [None]:
task_data = pd.DataFrame({"title": list_of_tasks, "project_id": project_id, "adult_content": adult_content, "lang": lang, "description": description, "instructions_flag": instructions_flag, "assignment_issuing_type": assignement_issuing_type, "requester_id": requester_id, "requester_name": requester_name, "requester_trusted_flag": requester_trusted_flag, "availability": availability, "post_accept_flag": post_accept_flag, "training": training, "project_stat": project_stat, "avg_money_hourly": avg_money_hourly, "moneyMed": money_med, "moneyTop10": money_top, "moneyMax3": money_max,"pool_id": pool_id, "pool_started_at": pool_started_at, "reward": reward, "assignment_duration": assignment_duration, "acceptance_period": acceptance_period, "regular_subtype": regular_subtype, "c": c})

In [None]:
len(task_data)

3

In [None]:
task_data['acceptance_period'] = task_data['acceptance_period']*task_data['c'].fillna(1)

In [None]:
task_data

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,assignment_issuing_type,requester_id,requester_name,requester_trusted_flag,...,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,reward,assignment_duration,acceptance_period,regular_subtype,c
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,AUTOMATIC,dbaa749ed83b3ef412a3167f6c53c363,ADE4PA,False,...,0.0,1.15,19.56,35838995,2022-10-11T23:36:03.957,0.0,120,,,86400
1,Object recognition & detection,85377,False,,Outline a road signs in images,True,AUTOMATIC,a841946b60a3fc816968533af5f8b371,michaelyuyangtong,False,...,,,,32474078,2022-03-20T11:29:40.108,0.01,300,604800.0,,86400
2,[Toloka] English Grammar Test,105273,False,EN,Take the test and get more paid tasks.\nAfter ...,True,AUTOMATIC,71daea4cffae4488067aebfb6583914a,Toloka,False,...,,,,36185755,2022-11-03T19:07:12.911,0.0,900,,EXAM,86400


## Fetching User related content from Toloka

In [None]:
tolokaUserEndpoint = "https://toloka.yandex.com/api/users/current/worker"
head2 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres2 = requests.get(tolokaUserEndpoint, headers=head2)

In [None]:
# storing the fetched content into variable 'user_content'
user_content = poolsres2.json()

In [None]:
print(user_content)

{'puid': 1685129673, 'uid': 1685129673, 'login': 'mitanshi', 'role': 'WORKER', 'userLang': 'EN', 'defaultEmail': 'mitanshi@yandex.com', 'connectionId': 't:3470263886', 'authorizationStatus': 'VALID', 'avatarId': '0/0-0', 'displayName': 'mitanshi', 'fullName': 'Mitanshi Vyas', 'firstName': 'Mitanshi', 'lastName': 'Vyas', 'isAccountOwner': True, 'actualUser': {'puid': 1685129673, 'uid': 1685129673, 'idpAlias': 'YANDEX_ID_LEGACY', 'idpIdentity': '1685129673', 'enterpriseSsoIdpAlias': False, 'login': 'mitanshi', 'role': 'WORKER', 'userLang': 'EN', 'defaultEmail': 'mitanshi@yandex.com', 'displayName': 'mitanshi', 'readOnlyModeToActUnderAccount': False}, 'availableAccounts': [], 'createdDate': '2022-09-09', 'systemBan': False, 'birthDay': '1999-07-10', 'country': 'US', 'languages': ['EN'], 'adultAllowed': False, 'acceptedEula': 13, 'rating': 0, 'authoritiesInfo': {'issuedAuthorities': ['U_TRANSACTIONS_CREATE', 'U_WALLETS_EDIT', 'U_ASSIGNMENTS_UNDERTAKE', 'U_ASSIGNMENTS_SUBMIT', 'U_PROFILE_VI

In [None]:
type(user_content)

dict

In [None]:
def fetch_user_info(user_dict):
    user_id = []
    role = []
    user_language = []
    default_email = []
    connection_id = []
    full_name = []
    birth_day = []
    country = []
    joined = []
    
    user_ids = user_dict.get('uid')
    roles = user_dict.get('role')
    user_languages = user_dict.get('userLang')
    default_emails = user_dict.get('defaultEmail')
    connection_ids = user_dict.get('connectionId')
    full_names = user_dict.get('fullName')
    birth_days = user_dict.get('birthDay')
    countrys = user_dict.get('country')
    joineds = user_dict.get('createdDate')
    
    user_id.append(user_ids)
    role.append(roles)
    user_language.append(user_languages)
    default_email.append(default_emails)
    connection_id.append(connection_ids)
    full_name.append(full_names)
    birth_day.append(birth_days)
    country.append(countrys)
    joined.append(joineds)
    return user_id, role, user_language, default_email, connection_id, full_name, birth_day, country, joined

In [None]:
user_id, role, user_language, default_email, connection_id, full_name, birth_day, country, joined = fetch_user_info(user_content)

In [None]:
user_id

[1685129673]

In [None]:
role

['WORKER']

In [None]:
user_language

['EN']

In [None]:
default_email

['mitanshi@yandex.com']

In [None]:
connection_id

['t:3470263886']

In [None]:
full_name

['Mitanshi Vyas']

In [None]:
birth_day

['1999-07-10']

In [None]:
country

['US']

In [None]:
user_data = pd.DataFrame({"user_id": user_id, "role": role, "user_language": user_language, "default_email": default_email, "connection_id": connection_id, "full_name": full_name, "birth_day": birth_day, "country": country, 'joined': joined})

In [None]:
user_data

Unnamed: 0,user_id,role,user_language,default_email,connection_id,full_name,birth_day,country,joined
0,1685129673,WORKER,EN,mitanshi@yandex.com,t:3470263886,Mitanshi Vyas,1999-07-10,US,2022-09-09


## Fetching Task Reciept related content from Toloka

In [None]:
tolokaIncomeLogEndpoint = "https://toloka.yandex.com/api/worker/finance/income-log"
head3 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres3 = requests.get(tolokaIncomeLogEndpoint, headers=head3)

In [None]:
# storing the fetched content into variable 'user_content'
income_content = poolsres3.json()

In [None]:
print(income_content)

[]


In [None]:
type(income_content)

list

In [None]:
income_content[0]

{'date': '2022-10-18',
 'assignments': [{'date': '2022-10-18',
   'requester': {'login': 'ADE4PA',
    'id': 'dbaa749ed83b3ef412a3167f6c53c363',
    'name': {'EN': 'ADE4PA'},
    'role': 'REQUESTER'},
   'project': {'id': 78414,
    'lang': 'EN',
    'name': 'Check the similarity between two products'},
   'income': '0.000',
   'totalIncome': '0.000',
   'blockedIncome': '0.000',
   'detalizationMode': 'PROJECT',
   'additionalReward': '0.000',
   'additionalRewardTolokaFee': '0.000'},
  {'date': '2022-10-18',
   'requester': {'login': 'zen-yndx',
    'id': '71daea4cffae4488067aebfb6583914a',
    'name': {'EN': 'Toloka'},
    'role': 'REQUESTER'},
   'project': {'id': 50751,
    'lang': 'EN',
    'name': '[Toloka] English Comprehension Test'},
   'income': '0.000',
   'totalIncome': '0.000',
   'blockedIncome': '0.000',
   'detalizationMode': 'PROJECT',
   'additionalReward': '0.000',
   'additionalRewardTolokaFee': '0.000'}]}

In [None]:
(income_content[0].get('assignments'))[0].get('date')

'2022-10-18'

## Fetching Income related/Task Receipt content from Toloka

In [None]:
def fetch_income_info(income_dict):
    project_id = []
#     title = []
    date = []
    income = []
    
    for i in range(len(income_dict)):
        project_ids = income_dict[i].get('assignments')[0].get('project', {}).get('id')
        dates = income_dict[i].get('date')
#         titles =  income_dict[i].get('assignments')[0].get('project', {}).get('name')
        incomes = income_dict[i].get('assignments')[0].get('income')
        
        project_id.append(project_ids)
        date.append(dates)
        income.append(incomes)
#         title.append(titles)
        
    return project_id, date, income

In [None]:
project_id, date, income = fetch_income_info(income_content)

In [None]:
project_id

[78414]

In [None]:
date

['2022-10-18']

In [None]:
income

['0.000']

In [None]:
user_id = [user_id[0]]*len(income)

In [None]:
task_receipt = pd.DataFrame({"user_id": user_id, "project_id": project_id, "date": date, "income": income})

In [None]:
task_receipt

Unnamed: 0,user_id,project_id,date,income
0,1685129673,78414,2022-10-18,0.0


## Fetching Log Activity related content from Toloka

In [None]:
tolokaNewTasksEndpoint = "https://toloka.yandex.com/api/task-suite-pool-groups?userLangs=EN"
head4 = {'Authorization': 'OAuth {}'.format(myToken)}
poolsres4 = requests.get(tolokaNewTasksEndpoint, headers=head4)

In [None]:
# storing the fetched content into variable 'user_content'
log_content = poolsres4.json()

In [None]:
print(log_content)

[{'refUuid': '787d08d5-ccde-4109-9b71-fac65f57ac14', 'groupUuid': '52662002-3473-4726-86c4-3b120d1e61c5', 'projectId': 78414, 'mayContainAdultContent': False, 'lang': 'EN', 'title': 'Check the similarity between two products', 'description': 'In this task, you will see two product listings. The PRODUCT is a product that a user has bookmarked for \nwhich we would like to show them. The RECOMMENDED PRODUCT offers a product we \nmight suggest to the user. The goal of this task is to classify these recommendations into one of the \nfollowing categories based on how closely the RECOMMENDED PRODUCT matches the PRODUCT.', 'hasInstructions': True, 'snapshotMajorVersion': 5, 'snapshotMinorVersion': 11, 'snapshotMajorVersionActual': True, 'assignmentIssuingType': 'AUTOMATIC', 'requesterInfo': {'id': 'dbaa749ed83b3ef412a3167f6c53c363', 'name': {'EN': 'ADE4PA'}, 'trusted': False}, 'availability': {'available': True}, 'postAccept': False, 'iframeSubdomain': 'dbaa749ed83b3ef412a3167f6c53c363', 'trai

In [None]:
type(log_content)

list

## Connecting to the MySQL database

In [None]:
# pip install pymysql

In [None]:
# import pymysql
# conn = pymysql.connect(host = 'ec2-user@ec2-44-202-247-223.compute-1.amazonaws.com',
# port = int(3306),
# user = 'root',
# passwd = 'Civic#AI@Lab22!',
# db = 'sample_logs')

## Tasks Data

In [None]:
import requests
# api-endpoint
URL = "https://toloka-tracer-study.forgigworkers.org:8080/api/civic_ai/private/secure/tasks/"
# sending get request and saving the response as response object
r = requests.get(url = URL, verify = False)
data = r.json()
print(data)

[{'project_id': 0, 'pool_id': 0, 'title': 'title', 'description': '', 'pool_startedAt': '2022-09-16T22:16:48.000Z', 'hasInstructions': 0, 'mayContainAdultContent': 1, 'requesterID': 'requesterID', 'requesterTrusted': 1, 'lang': 'EN', 'grade': 4.3, 'averageAcceptanceTimeSec': None, 'moneyAvgHourly': 0.54, 'moneyMed': None, 'moneyTop10': None, 'moneyMax3': None, 'reward': 0}, {'project_id': 2338, 'pool_id': 36616149, 'title': 'Ekran görüntüsünde yetişkin içerik var mı?', 'description': 'Bu, "Ekran görüntüsünde yetişkin içerik var mı?" projesinin sınavıdır.\nAna projeye erişim yetkisi almak için bu sınavdan geçmeniz gerekmektedir.\n(Dikkat! Görevde yetişkin içerik bulunabilir)', 'pool_startedAt': '2022-11-29T12:19:54.000Z', 'hasInstructions': 1, 'mayContainAdultContent': 1, 'requesterID': 'fa517ea3113e489138d249ff9844f572', 'requesterTrusted': 1, 'lang': None, 'grade': 4.52, 'averageAcceptanceTimeSec': None, 'moneyAvgHourly': 0, 'moneyMed': 0, 'moneyTop10': 0.09, 'moneyMax3': 1.19, 'rewar



In [None]:
task_data2 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')

In [None]:
task_data2

Unnamed: 0,project_id,pool_id,title,description,pool_startedAt,hasInstructions,mayContainAdultContent,requesterID,requesterTrusted,lang,grade,averageAcceptanceTimeSec,moneyAvgHourly,moneyMed,moneyTop10,moneyMax3,reward
0,0,0,title,,2022-09-16T22:16:48.000Z,0,1,requesterID,1,EN,4.30,,0.540000,,,,0.00
1,2338,36616149,Ekran görüntüsünde yetişkin içerik var mı?,"Bu, ""Ekran görüntüsünde yetişkin içerik var mı...",2022-11-29T12:19:54.000Z,1,1,fa517ea3113e489138d249ff9844f572,1,,4.52,,0.000000,0.00,0.09,1.19,0.00
2,2338,36760808,Ekran görüntüsünde yetişkin içerik var mı?,Değerlendirilen görsele en uygun kategoriyi se...,2022-12-08T22:27:28.000Z,1,1,fa517ea3113e489138d249ff9844f572,1,,4.54,,0.000000,0.00,0.29,1.69,0.00
3,2338,36774752,Ekran görüntüsünde yetişkin içerik var mı?,Değerlendirilen görsele en uygun kategoriyi se...,2022-12-09T21:50:47.000Z,1,1,fa517ea3113e489138d249ff9844f572,1,,4.51,,0.000000,0.00,0.09,1.19,0.00
4,6795,21626385,Тональность сообщений из социальных сетей,Разметка сообщений по эмоциональной окраске.,2022-10-23T22:24:44.000Z,1,1,81029b6e4b04badacad20d15ff3998e0,1,,4.89,,2.400000,0.04,0.14,,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,123557,36767932,Product Search Navigation,"Decide, how relevant a search filter is for a ...",2022-12-09T17:30:06.000Z,1,1,d95afdfb999a6136be1e935fe8e62b2a,0,,4.70,,,,,,0.00
624,123667,36774418,Outline all traffic signs with bounding boxes,Find and outline all traffic signs with boundi...,2022-12-09T22:31:38.000Z,1,0,707ca2c5c4bc0fc002c9cf281614d0b0,0,,,954.0,0.545455,,,,0.00
625,123668,36774597,Are the traffic signs outlined correctly?,Look at the image and decide whether or not th...,2022-12-09T21:26:57.000Z,1,0,707ca2c5c4bc0fc002c9cf281614d0b0,0,,,,,,,,0.00
626,123715,36778716,Named Entity Recognition (NER),Select one or more words in a text and tag them,2022-12-10T06:05:22.000Z,1,1,acbad9110e7d4d6afd89a1d7032767c7,0,,,,,,,,0.00


In [None]:
task_data2.to_csv('task_data.csv')

# Assignment Data

In [None]:
import requests
# api-endpoint
URL = "https://toloka-tracer-study.forgigworkers.org:8080/api/civic_ai/private/secure/assignments/"
# sending get request and saving the response as response object
r = requests.get(url = URL, verify = False)
data = r.json()
print(data)



[{'uid': 1316040758, 'project_id': 49390}, {'uid': 1316040758, 'project_id': 54170}, {'uid': 1316040758, 'project_id': 73686}, {'uid': 1316040758, 'project_id': 80759}, {'uid': 1316040758, 'project_id': 81112}, {'uid': 1316040758, 'project_id': 105897}, {'uid': 1316040758, 'project_id': 118365}, {'uid': 1316040758, 'project_id': 118439}, {'uid': 1316040758, 'project_id': 121728}, {'uid': 1429235843, 'project_id': 42346}, {'uid': 1429235843, 'project_id': 43625}, {'uid': 1429235843, 'project_id': 54170}, {'uid': 1429235843, 'project_id': 59047}, {'uid': 1429235843, 'project_id': 64825}, {'uid': 1429235843, 'project_id': 73686}, {'uid': 1429235843, 'project_id': 80759}, {'uid': 1429235843, 'project_id': 81112}, {'uid': 1429235843, 'project_id': 81351}, {'uid': 1429235843, 'project_id': 83739}, {'uid': 1429235843, 'project_id': 86740}, {'uid': 1429235843, 'project_id': 89409}, {'uid': 1429235843, 'project_id': 91297}, {'uid': 1429235843, 'project_id': 93167}, {'uid': 1429235843, 'project_

In [None]:
assignment_data = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')

## Log Data

In [None]:
import pandas as pd
import requests

In [None]:
URL = "https://toloka-tracer-study.forgigworkers.org:8080/api/civic_ai/private/secure/logs/"
# sending get request and saving the response as response object
r = requests.get(url = URL, verify = False)
data = r.json()
print(data)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
log_data2 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')

In [None]:
log_data2

Unnamed: 0,id,uid,event,timestamp,url,tab_id,scroll_count,blur_count,focus_count,click_count,keypress_count
0,232,1699065816,TABUPDATED,1668886726463,https://toloka.yandex.com/signup,48,0.0,0.0,0.0,0.0,0.0
1,233,1699065816,WEBREQUEST,1668886726797,https://toloka.yandex.com/signup,48,,,,,
2,234,1699065816,TABUPDATED,1668886727665,https://toloka.yandex.com/,48,0.0,0.0,0.0,0.0,0.0
3,235,1699065816,WEBREQUEST,1668886727848,https://toloka.yandex.com/,48,,,,,
4,236,1699065816,TABUPDATED,1668886728408,https://toloka.yandex.com/tasks,48,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
153890,154858,1675921760,TABACTIVATED,1674551858647,https://toloka.yandex.com/task/37259679/000238...,1655521996,271.0,1.0,1.0,0.0,0.0
153891,154859,1675921760,TABACTIVATED,1674551860520,https://www.overdrive.com/,1655522135,,,,,
153892,154860,1675921760,TABACTIVATED,1674551883244,https://toloka.yandex.com/task/37259679/000238...,1655521996,0.0,0.0,1.0,3.0,0.0
153893,154861,1675921760,TABUPDATED,1674551894428,https://toloka.yandex.com/tasks,1655521996,,,,,


In [None]:
log_data2.columns

Index(['id', 'uid', 'event', 'timestamp', 'url', 'tab_id', 'scroll_count',
       'blur_count', 'focus_count', 'click_count', 'keypress_count'],
      dtype='object')

In [None]:
log_data2.to_csv('log_data.csv')

## User Data

In [None]:
URL = "https://toloka-tracer-study.forgigworkers.org:8080/api/civic_ai/private/secure/workers/"
# sending get request and saving the response as response object
r = requests.get(url = URL, verify = False)
data = r.json()
print(data)

[{'uid': 1316040758, 'country': 'EG', 'language': 'AR,EN,', 'birthdate': '1984-03-11T00:00:00.000Z', 'joined': '2021-01-13T00:00:00.000Z'}, {'uid': 1429235843, 'country': 'KE', 'language': 'EN,KI,SW,', 'birthdate': '1993-03-01T00:00:00.000Z', 'joined': '2021-05-27T00:00:00.000Z'}, {'uid': 1430094402, 'country': 'PK', 'language': 'DE,EN,ES,ID,PT,RU,', 'birthdate': '1989-09-24T00:00:00.000Z', 'joined': '2021-05-29T00:00:00.000Z'}, {'uid': 1458049517, 'country': 'TR', 'language': 'DE,EN,FR,RU,TR,', 'birthdate': '1996-12-16T00:00:00.000Z', 'joined': '2021-07-27T00:00:00.000Z'}, {'uid': 1490584733, 'country': 'ID', 'language': 'EN,ID,', 'birthdate': '1982-03-13T00:00:00.000Z', 'joined': '2021-09-29T00:00:00.000Z'}, {'uid': 1648608319, 'country': 'PE', 'language': 'DE,EN,FR,IT,NL,RU,', 'birthdate': '1993-06-03T00:00:00.000Z', 'joined': '2022-06-17T00:00:00.000Z'}, {'uid': 1674749132, 'country': 'KE', 'language': 'EN,', 'birthdate': '1999-01-09T00:00:00.000Z', 'joined': '2022-08-14T00:00:00.0



In [None]:
user_data2 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')

In [None]:
user_data2

Unnamed: 0,uid,country,language,birthdate,joined
0,1316040758,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
1,1429235843,KE,"EN,KI,SW,",1993-03-01T00:00:00.000Z,2021-05-27T00:00:00.000Z
2,1430094402,PK,"DE,EN,ES,ID,PT,RU,",1989-09-24T00:00:00.000Z,2021-05-29T00:00:00.000Z
3,1458049517,TR,"DE,EN,FR,RU,TR,",1996-12-16T00:00:00.000Z,2021-07-27T00:00:00.000Z
4,1490584733,ID,"EN,ID,",1982-03-13T00:00:00.000Z,2021-09-29T00:00:00.000Z
5,1648608319,PE,"DE,EN,FR,IT,NL,RU,",1993-06-03T00:00:00.000Z,2022-06-17T00:00:00.000Z
6,1674749132,KE,"EN,",1999-01-09T00:00:00.000Z,2022-08-14T00:00:00.000Z
7,1675921760,PH,"EN,ES,PT,PT-BR,TL,",1986-11-18T00:00:00.000Z,2022-08-17T00:00:00.000Z
8,1684019585,ET,"AM,EN,",1998-10-04T00:00:00.000Z,2022-09-06T00:00:00.000Z
9,1699065816,US,EN,2000-07-07T00:00:00.000Z,2022-10-11T00:00:00.000Z


In [None]:
user_data2.to_csv('user_data.csv')

 ## Income Data/ Task receipts

In [None]:
URL = "https://toloka-tracer-study.forgigworkers.org:8080/api/civic_ai/private/secure/reciepts/"
# sending get request and saving the response as response object
r = requests.get(url = URL, verify = False)
data = r.json()
print(data)

[{'project_id': 1246, 'uid': 1458049517, 'date': '2022-12-25T00:00:00.000Z', 'income': 0.03}, {'project_id': 1246, 'uid': 1703905621, 'date': '2022-10-30T00:00:00.000Z', 'income': 0.03}, {'project_id': 1246, 'uid': 1714073899, 'date': '2022-11-27T00:00:00.000Z', 'income': 0.03}, {'project_id': 2151, 'uid': 1458049517, 'date': '2022-12-19T00:00:00.000Z', 'income': 0.02}, {'project_id': 2338, 'uid': 1458049517, 'date': '2022-12-25T00:00:00.000Z', 'income': 0.06}, {'project_id': 7050, 'uid': 1703905621, 'date': '2022-12-04T00:00:00.000Z', 'income': 0.08}, {'project_id': 7053, 'uid': 1714073899, 'date': '2022-12-08T00:00:00.000Z', 'income': 0.035}, {'project_id': 7738, 'uid': 1458049517, 'date': '2022-12-28T00:00:00.000Z', 'income': 0}, {'project_id': 9950, 'uid': 1648608319, 'date': '2022-12-02T00:00:00.000Z', 'income': 0}, {'project_id': 10013, 'uid': 1458049517, 'date': '2022-12-29T00:00:00.000Z', 'income': 0}, {'project_id': 10013, 'uid': 1648608319, 'date': '2022-11-27T00:00:00.000Z',



In [None]:
task_receipts2 = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')

In [None]:
task_receipts2

Unnamed: 0,project_id,uid,date,income
0,1246,1458049517,2022-12-25T00:00:00.000Z,0.03
1,1246,1703905621,2022-10-30T00:00:00.000Z,0.03
2,1246,1714073899,2022-11-27T00:00:00.000Z,0.03
3,2151,1458049517,2022-12-19T00:00:00.000Z,0.02
4,2338,1458049517,2022-12-25T00:00:00.000Z,0.06
...,...,...,...,...
1330,126890,1675921760,2023-01-06T00:00:00.000Z,0.00
1331,127064,1429235843,2022-12-30T00:00:00.000Z,0.01
1332,127631,1429235843,2023-01-04T00:00:00.000Z,0.20
1333,127631,1675921760,2023-01-04T00:00:00.000Z,0.20


In [None]:
task_receipts2.to_csv('income_data.csv')

### Combining the Data

In [None]:
task_data.head()

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,assignment_issuing_type,requester_id,requester_name,requester_trusted_flag,...,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,reward,assignment_duration,acceptance_period,regular_subtype,c
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,AUTOMATIC,dbaa749ed83b3ef412a3167f6c53c363,ADE4PA,False,...,0.0,1.15,19.56,35838995,2022-10-11T23:36:03.957,0.0,120,,,86400
1,Object recognition & detection,85377,False,,Outline a road signs in images,True,AUTOMATIC,a841946b60a3fc816968533af5f8b371,michaelyuyangtong,False,...,,,,32474078,2022-03-20T11:29:40.108,0.01,300,604800.0,,86400
2,[Toloka] English Grammar Test,105273,False,EN,Take the test and get more paid tasks.\nAfter ...,True,AUTOMATIC,71daea4cffae4488067aebfb6583914a,Toloka,False,...,,,,36185755,2022-11-03T19:07:12.911,0.0,900,,EXAM,86400


In [None]:
cols = list(task_data.columns.values)
print(cols)

['title', 'project_id', 'adult_content', 'lang', 'description', 'instructions_flag', 'assignment_issuing_type', 'requester_id', 'requester_name', 'requester_trusted_flag', 'availability', 'post_accept_flag', 'training', 'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10', 'moneyMax3', 'pool_id', 'pool_started_at', 'reward', 'assignment_duration', 'acceptance_period', 'regular_subtype', 'c']


In [None]:
task_data1 =task_data[['title', 'project_id', 'adult_content', 'lang', 'description', 'instructions_flag','requester_id', 'requester_trusted_flag', 'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10', 'moneyMax3', 'pool_id', 'pool_started_at', 'acceptance_period']]

In [None]:
task_data1.shape

(3, 16)

In [None]:
task_data2.shape

(628, 17)

In [None]:
task_data2 = task_data2[['title',
 'project_id','mayContainAdultContent', 'lang',
 'description','hasInstructions','requesterID', 'requesterTrusted','grade','moneyAvgHourly','moneyMed','moneyTop10', 'moneyMax3','pool_id',
 'pool_startedAt', 'averageAcceptanceTimeSec']]

In [None]:
task_data2.columns = ['title', 'project_id', 'adult_content', 'lang', 'description', 'instructions_flag','requester_id', 'requester_trusted_flag', 'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10', 'moneyMax3', 'pool_id', 'pool_started_at', 'acceptance_period']

In [None]:
task_data2.head()

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period
0,title,0,1,EN,,0,requesterID,1,4.3,0.54,,,,0,2022-09-16T22:16:48.000Z,
1,Ekran görüntüsünde yetişkin içerik var mı?,2338,1,,"Bu, ""Ekran görüntüsünde yetişkin içerik var mı...",1,fa517ea3113e489138d249ff9844f572,1,4.52,0.0,0.0,0.09,1.19,36616149,2022-11-29T12:19:54.000Z,
2,Ekran görüntüsünde yetişkin içerik var mı?,2338,1,,Değerlendirilen görsele en uygun kategoriyi se...,1,fa517ea3113e489138d249ff9844f572,1,4.54,0.0,0.0,0.29,1.69,36760808,2022-12-08T22:27:28.000Z,
3,Ekran görüntüsünde yetişkin içerik var mı?,2338,1,,Değerlendirilen görsele en uygun kategoriyi se...,1,fa517ea3113e489138d249ff9844f572,1,4.51,0.0,0.0,0.09,1.19,36774752,2022-12-09T21:50:47.000Z,
4,Тональность сообщений из социальных сетей,6795,1,,Разметка сообщений по эмоциональной окраске.,1,81029b6e4b04badacad20d15ff3998e0,1,4.89,2.4,0.04,0.14,,21626385,2022-10-23T22:24:44.000Z,


In [None]:
task_data1.head()

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995,2022-10-11T23:36:03.957,
1,Object recognition & detection,85377,False,,Outline a road signs in images,True,a841946b60a3fc816968533af5f8b371,False,2.52,,,,,32474078,2022-03-20T11:29:40.108,604800.0
2,[Toloka] English Grammar Test,105273,False,EN,Take the test and get more paid tasks.\nAfter ...,True,71daea4cffae4488067aebfb6583914a,False,4.51,0.0,,,,36185755,2022-11-03T19:07:12.911,


In [None]:

frames = [task_data1, task_data2]
  
task_data_final = pd.concat(frames)

In [None]:
task_data_final.shape

(631, 16)

In [None]:
task_data_final.head()

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995,2022-10-11T23:36:03.957,
1,Object recognition & detection,85377,False,,Outline a road signs in images,True,a841946b60a3fc816968533af5f8b371,False,2.52,,,,,32474078,2022-03-20T11:29:40.108,604800.0
2,[Toloka] English Grammar Test,105273,False,EN,Take the test and get more paid tasks.\nAfter ...,True,71daea4cffae4488067aebfb6583914a,False,4.51,0.0,,,,36185755,2022-11-03T19:07:12.911,
0,title,0,1,EN,,0,requesterID,1,4.3,0.54,,,,0,2022-09-16T22:16:48.000Z,
1,Ekran görüntüsünde yetişkin içerik var mı?,2338,1,,"Bu, ""Ekran görüntüsünde yetişkin içerik var mı...",1,fa517ea3113e489138d249ff9844f572,1,4.52,0.0,0.0,0.09,1.19,36616149,2022-11-29T12:19:54.000Z,


In [None]:
user_data

Unnamed: 0,user_id,role,user_language,default_email,connection_id,full_name,birth_day,country,joined
0,1685129673,WORKER,EN,mitanshi@yandex.com,t:3470263886,Mitanshi Vyas,1999-07-10,US,2022-09-09


In [None]:
cols = list(user_data.columns.values)
print(cols)

['user_id', 'role', 'user_language', 'default_email', 'connection_id', 'full_name', 'birth_day', 'country', 'joined']


In [None]:
user_data1 = user_data[['user_id',  'country', 'user_language', 'birth_day', 'joined']]

In [None]:
user_data1

Unnamed: 0,user_id,country,user_language,birth_day,joined
0,1685129673,US,EN,1999-07-10,2022-09-09


In [None]:
user_data2.columns = ['user_id',  'country', 'user_language', 'birth_day', 'joined']

In [None]:
user_data2

Unnamed: 0,user_id,country,user_language,birth_day,joined
0,1316040758,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
1,1429235843,KE,"EN,KI,SW,",1993-03-01T00:00:00.000Z,2021-05-27T00:00:00.000Z
2,1430094402,PK,"DE,EN,ES,FR,ID,RU,",1989-09-24T00:00:00.000Z,2021-05-29T00:00:00.000Z
3,1458049517,TR,"EN,TR,",1996-12-16T00:00:00.000Z,2021-07-27T00:00:00.000Z
4,1490584733,ID,"EN,ID,",1982-03-13T00:00:00.000Z,2021-09-29T00:00:00.000Z
5,1648608319,PE,"DE,EN,FR,IT,NL,RU,",1993-06-03T00:00:00.000Z,2022-06-17T00:00:00.000Z
6,1674749132,KE,"EN,",1999-01-09T00:00:00.000Z,2022-08-14T00:00:00.000Z
7,1675921760,PH,"EN,ES,PT,PT-BR,TL,",1986-11-18T00:00:00.000Z,2022-08-17T00:00:00.000Z
8,1684019585,ET,"AM,EN,",1998-10-04T00:00:00.000Z,2022-09-06T00:00:00.000Z
9,1699065816,US,EN,2000-07-07T00:00:00.000Z,2022-10-11T00:00:00.000Z


In [None]:
frames = [user_data1, user_data2]
  
user_data_final = pd.concat(frames)

In [None]:
user_data_final

Unnamed: 0,user_id,country,user_language,birth_day,joined
0,1685129673,US,EN,1999-07-10,2022-09-09
0,1316040758,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
1,1429235843,KE,"EN,KI,SW,",1993-03-01T00:00:00.000Z,2021-05-27T00:00:00.000Z
2,1430094402,PK,"DE,EN,ES,FR,ID,RU,",1989-09-24T00:00:00.000Z,2021-05-29T00:00:00.000Z
3,1458049517,TR,"EN,TR,",1996-12-16T00:00:00.000Z,2021-07-27T00:00:00.000Z
4,1490584733,ID,"EN,ID,",1982-03-13T00:00:00.000Z,2021-09-29T00:00:00.000Z
5,1648608319,PE,"DE,EN,FR,IT,NL,RU,",1993-06-03T00:00:00.000Z,2022-06-17T00:00:00.000Z
6,1674749132,KE,"EN,",1999-01-09T00:00:00.000Z,2022-08-14T00:00:00.000Z
7,1675921760,PH,"EN,ES,PT,PT-BR,TL,",1986-11-18T00:00:00.000Z,2022-08-17T00:00:00.000Z
8,1684019585,ET,"AM,EN,",1998-10-04T00:00:00.000Z,2022-09-06T00:00:00.000Z


In [None]:
task_receipts2.shape

(1208, 4)

In [None]:
task_receipts2.head(2)

Unnamed: 0,project_id,uid,date,income
0,1246,1703905621,2022-10-30T00:00:00.000Z,0.03
1,1246,1714073899,2022-11-27T00:00:00.000Z,0.03


In [None]:
task_receipts2.columns = [ 'project_id', 'user_id','date', 'income']

In [None]:
task_receipts2 = task_receipts2[[ 'user_id','project_id','date', 'income']]

In [None]:
task_receipts2.head()

Unnamed: 0,user_id,project_id,date,income
0,1703905621,1246,2022-10-30T00:00:00.000Z,0.03
1,1714073899,1246,2022-11-27T00:00:00.000Z,0.03
2,1458049517,2151,2022-12-12T00:00:00.000Z,0.02
3,1458049517,2338,2022-12-15T00:00:00.000Z,0.03
4,1703905621,7050,2022-12-04T00:00:00.000Z,0.08


In [None]:
cols = list(task_receipt.columns.values)
print(cols)

['user_id', 'project_id', 'date', 'income']


In [None]:
task_receipts1 = task_receipt[['user_id', 'project_id', 'date', 'income']]

In [None]:
# task_receipts2.columns = ['user_id', 'project_id', 'date', 'income']

In [None]:
task_receipts1

Unnamed: 0,user_id,project_id,date,income
0,1685129673,78414,2022-10-18,0.0


In [None]:
task_receipts2.head()

Unnamed: 0,user_id,project_id,date,income
0,1703905621,1246,2022-10-30T00:00:00.000Z,0.03
1,1714073899,1246,2022-11-27T00:00:00.000Z,0.03
2,1458049517,2151,2022-12-12T00:00:00.000Z,0.02
3,1458049517,2338,2022-12-15T00:00:00.000Z,0.03
4,1703905621,7050,2022-12-04T00:00:00.000Z,0.08


In [None]:
frames = [task_receipts1, task_receipts2]
  
task_receipts_final = pd.concat(frames)

In [None]:
task_receipts_final.head()

Unnamed: 0,user_id,project_id,date,income
0,1685129673,78414,2022-10-18,0.0
0,1703905621,1246,2022-10-30T00:00:00.000Z,0.03
1,1714073899,1246,2022-11-27T00:00:00.000Z,0.03
2,1458049517,2151,2022-12-12T00:00:00.000Z,0.02
3,1458049517,2338,2022-12-15T00:00:00.000Z,0.03


In [None]:
cols = list(log_data2.columns.values)
print(cols)

['id', 'uid', 'event', 'timestamp', 'url', 'tab_id', 'scroll_count', 'blur_count', 'focus_count', 'click_count', 'keypress_count']


In [None]:
log_data2.columns = ['id', 'user_id', 'event', 'timestamp', 'url', 'tab_id', 'scroll_count', 'blur_count', 'focus_count', 'click_count', 'keypress_count']

In [None]:
task_data_final.head(1)

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995,2022-10-11T23:36:03.957,


In [None]:
user_data_final.head(1)

Unnamed: 0,user_id,country,user_language,birth_day,joined
0,1685129673,US,EN,1999-07-10,2022-09-09


In [None]:
task_receipts_final.head(1)

Unnamed: 0,user_id,project_id,date,income
0,1685129673,78414,2022-10-18,0.0


In [None]:
log_data2.head(1)

Unnamed: 0,id,user_id,event,timestamp,url,tab_id,scroll_count,blur_count,focus_count,click_count,keypress_count
0,232,1699065816,TABUPDATED,1668886726463,https://toloka.yandex.com/signup,48,0.0,0.0,0.0,0.0,0.0


In [None]:
# newdf = task_data_final.merge(task_receipts_final, how='outer', on='project_id')

In [None]:
task_receipts_final

Unnamed: 0,user_id,project_id,date,income
0,1685129673,78414,2022-10-18,0.000
0,1703905621,1246,2022-10-30T00:00:00.000Z,0.03
1,1714073899,1246,2022-11-27T00:00:00.000Z,0.03
2,1458049517,2151,2022-12-12T00:00:00.000Z,0.02
3,1458049517,2338,2022-12-15T00:00:00.000Z,0.03
...,...,...,...,...
1203,1684019585,123529,2022-12-09T00:00:00.000Z,2.0
1204,1714073899,123529,2022-12-09T00:00:00.000Z,2.0
1205,1684019585,123668,2022-12-09T00:00:00.000Z,0.005
1206,1675921760,124175,2022-12-15T00:00:00.000Z,0.02


In [None]:
task_receipts_final['task_attempted_flag'] = 'TRUE'

In [None]:
task_receipts_final.shape

(1209, 5)

In [None]:
task_receipts_final.columns

Index(['user_id', 'project_id', 'date', 'income', 'task_attempted_flag'], dtype='object')

In [None]:
newdf = task_data_final.merge(task_receipts_final[['user_id','project_id','task_attempted_flag']], how='outer', on='project_id')

In [None]:
newdf

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period,user_id,task_attempted_flag
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.685130e+09,TRUE
1,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.490585e+09,TRUE
2,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.675922e+09,TRUE
3,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.703906e+09,TRUE
4,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.706440e+09,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276,,123443,,,,,,,,,,,,,,,1.703906e+09,TRUE
2277,,123485,,,,,,,,,,,,,,,1.648608e+09,TRUE
2278,,123519,,,,,,,,,,,,,,,1.714074e+09,TRUE
2279,,124175,,,,,,,,,,,,,,,1.675922e+09,TRUE


In [None]:
newdf['task_attempted_flag'].isna().sum()

174

In [None]:
newdf['task_attempted_flag'] = newdf['task_attempted_flag'].fillna('FALSE')

In [None]:
newdf['task_attempted_flag'].value_counts()

TRUE     2107
FALSE     174
Name: task_attempted_flag, dtype: int64

In [None]:
newdf

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period,user_id,task_attempted_flag
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.685130e+09,TRUE
1,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.490585e+09,TRUE
2,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.675922e+09,TRUE
3,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.703906e+09,TRUE
4,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1.706440e+09,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276,,123443,,,,,,,,,,,,,,,1.703906e+09,TRUE
2277,,123485,,,,,,,,,,,,,,,1.648608e+09,TRUE
2278,,123519,,,,,,,,,,,,,,,1.714074e+09,TRUE
2279,,124175,,,,,,,,,,,,,,,1.675922e+09,TRUE


In [None]:
task_data_final.shape

(631, 16)

In [None]:
task_receipts_final.shape

(1209, 5)

In [None]:
newdf2 = newdf.merge(user_data_final, how = 'outer', on='user_id')

In [None]:
newdf2

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,...,moneyMax3,pool_id,pool_started_at,acceptance_period,user_id,task_attempted_flag,country,user_language,birth_day,joined
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.00000,...,19.56,35838995.0,2022-10-11T23:36:03.957,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
1,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,1.07234,...,20.26,35838995.0,2022-10-11T23:36:04.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
2,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,0.00000,...,19.75,35859153.0,2022-10-13T07:13:58.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
3,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,0.00000,...,19.75,36777013.0,2022-12-10T02:07:23.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
4,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.00000,...,19.56,35838995.0,2022-10-11T23:36:03.957,,1.490585e+09,TRUE,ID,"EN,ID,",1982-03-13T00:00:00.000Z,2021-09-29T00:00:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647181.0,2022-12-10T04:14:09.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2277,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647238.0,2022-12-01T12:26:01.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2278,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647307.0,2022-12-10T04:16:51.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2279,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647422.0,2022-12-01T12:23:18.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z


In [None]:
newdf2

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,...,moneyMax3,pool_id,pool_started_at,acceptance_period,user_id,task_attempted_flag,country,user_language,birth_day,joined
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.00000,...,19.56,35838995.0,2022-10-11T23:36:03.957,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
1,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,1.07234,...,20.26,35838995.0,2022-10-11T23:36:04.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
2,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,0.00000,...,19.75,35859153.0,2022-10-13T07:13:58.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
3,Check the similarity between two products,78414,0,EN,"In this task, you will see two product listing...",1,dbaa749ed83b3ef412a3167f6c53c363,0,4.33,0.00000,...,19.75,36777013.0,2022-12-10T02:07:23.000Z,,1.685130e+09,TRUE,US,EN,1999-07-10,2022-09-09
4,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.00000,...,19.56,35838995.0,2022-10-11T23:36:03.957,,1.490585e+09,TRUE,ID,"EN,ID,",1982-03-13T00:00:00.000Z,2021-09-29T00:00:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2276,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647181.0,2022-12-10T04:14:09.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2277,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647238.0,2022-12-01T12:26:01.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2278,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647307.0,2022-12-10T04:16:51.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z
2279,Take a picture of your face,105540,0,,Take a picture of yourself using our service,1,c69b008e44d038469bf39335173dce31,0,,8.45070,...,,36647422.0,2022-12-01T12:23:18.000Z,225028.0,1.725202e+09,TRUE,US,"EN,ES,",2002-03-16T00:00:00.000Z,2022-12-08T00:00:00.000Z


In [None]:
## newdf2 contains task_reciept data in the form of task_attempted_flag, task_data, user_data
## still haven't added log_data to the complete dataframe, not sure how to add that

# Combining tables using assignment data

In [None]:
# task_receipts_final['task_attempted_flag'] = 'TRUE'

In [None]:
exp_df = task_data_final.merge(task_receipts_final[['user_id','project_id','task_attempted_flag']], how='outer', on='project_id')

In [None]:
task_data_final.shape

(631, 16)

In [None]:
task_receipts_final.shape

(1209, 5)

In [None]:
exp_df.shape

(2281, 18)

In [None]:
exp_df['user_id'].isnull().sum()

174

In [None]:
exp_df.head()

Unnamed: 0,title,project_id,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,avg_money_hourly,moneyMed,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period,user_id,task_attempted_flag
0,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1685130000.0,True
1,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1490585000.0,True
2,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1675922000.0,True
3,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1703906000.0,True
4,Check the similarity between two products,78414,False,EN,"In this task, you will see two product listing...",True,dbaa749ed83b3ef412a3167f6c53c363,False,4.31,0.0,0.0,1.15,19.56,35838995.0,2022-10-11T23:36:03.957,,1706440000.0,True


In [None]:
assignment_data.head()

Unnamed: 0,uid,project_id
0,1316040758,49390
1,1316040758,54170
2,1316040758,73686
3,1316040758,80759
4,1316040758,81112


In [None]:
assignment_data.columns = ['uid', 'project_id']

In [None]:
assignment_data.shape

(1053, 2)

In [None]:
df1 = assignment_data.merge(exp_df, how = 'inner', on='project_id')

In [None]:
df1.shape

(9917, 19)

In [None]:
exp_df.columns

Index(['title', 'project_id', 'adult_content', 'lang', 'description',
       'instructions_flag', 'requester_id', 'requester_trusted_flag',
       'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10',
       'moneyMax3', 'pool_id', 'pool_started_at', 'acceptance_period',
       'user_id', 'task_attempted_flag'],
      dtype='object')

In [None]:
assignment_data.columns

Index(['uid', 'project_id'], dtype='object')

In [None]:
df1.columns

Index(['uid', 'project_id', 'title', 'adult_content', 'lang', 'description',
       'instructions_flag', 'requester_id', 'requester_trusted_flag',
       'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10',
       'moneyMax3', 'pool_id', 'pool_started_at', 'acceptance_period',
       'user_id', 'task_attempted_flag'],
      dtype='object')

In [None]:
df_drop = df1.drop(['user_id'], axis = 1)

In [None]:
df_drop.shape

(10361, 18)

In [None]:
df1.shape

(10361, 19)

In [None]:
df_drop.columns = ['user_id', 'project_id', 'title', 'adult_content', 'lang', 'description',
       'instructions_flag', 'requester_id', 'requester_trusted_flag',
       'project_stat', 'avg_money_hourly', 'moneyMed', 'moneyTop10',
       'moneyMax3', 'pool_id', 'pool_started_at', 'acceptance_period', 'task_attempted_flag']

In [None]:
assignment_data.columns = ['user_id', 'project_id']

In [None]:
df2 = df_drop.merge(user_data_final, how = 'outer', on='user_id')

In [None]:
df2.head()

Unnamed: 0,user_id,project_id,title,adult_content,lang,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,...,moneyTop10,moneyMax3,pool_id,pool_started_at,acceptance_period,task_attempted_flag,country,user_language,birth_day,joined
0,1316041000.0,49390.0,Entity image selection,1,,From the selection of images for the entity(pe...,1,81ae1299dd10f764a6e59ac1c08a8a18,0,4.92,...,0.06,,36770631.0,2022-12-09T15:07:41.000Z,,True,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
1,1316041000.0,49390.0,Entity image selection,1,,From the selection of images for the entity(pe...,1,81ae1299dd10f764a6e59ac1c08a8a18,0,4.92,...,0.06,,36770631.0,2022-12-09T15:07:41.000Z,,True,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
2,1316041000.0,49390.0,Entity image selection,1,,From the selection of images for the entity(pe...,1,81ae1299dd10f764a6e59ac1c08a8a18,0,4.92,...,0.06,,36770631.0,2022-12-09T15:07:41.000Z,,True,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
3,1316041000.0,49390.0,Entity image selection,1,,From the selection of images for the entity(pe...,1,81ae1299dd10f764a6e59ac1c08a8a18,0,4.92,...,0.06,,36775179.0,2022-12-09T22:25:32.000Z,,True,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z
4,1316041000.0,49390.0,Entity image selection,1,,From the selection of images for the entity(pe...,1,81ae1299dd10f764a6e59ac1c08a8a18,0,4.92,...,0.06,,36775179.0,2022-12-09T22:25:32.000Z,,True,EG,"AR,EN,",1984-03-11T00:00:00.000Z,2021-01-13T00:00:00.000Z


In [None]:
df2.shape

(10363, 22)

In [None]:
df2.isnull().sum()

user_id                    444
project_id                   2
title                      394
adult_content              394
lang                      5497
description                394
instructions_flag          394
requester_id               394
requester_trusted_flag     394
project_stat              1653
avg_money_hourly           576
moneyMed                  1939
moneyTop10                1939
moneyMax3                 4564
pool_id                    394
pool_started_at            394
acceptance_period         8852
task_attempted_flag        508
country                    444
user_language              444
birth_day                  444
joined                     444
dtype: int64

In [None]:
df_new = df2.to_csv('uncleaned_data.csv')

In [None]:
from google.colab import files
uploaded = files.upload()

Saving cleaned_data__2_.csv to cleaned_data__2_.csv


In [None]:
df = pd.read_csv('cleaned_data__2_.csv')

In [None]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,title,project_id,adult_content,description,instructions_flag,requester_id,requester_trusted_flag,project_stat,...,moneyTop10,moneyMax3,pool_id,pool_started_at,user_id,country,user_language,birth_day,joined,age
0,0,0,[Toloka] English Grammar Test,105273.0,0.0,Take the test and get more paid tasks.\nAfter ...,1.0,71daea4cffae4488067aebfb6583914a,0.0,4.75,...,0.0,0.0,36185755.0,2022-11-03T19:07:12.911,1709755000.0,US,"EN,ES,FR,JA,RU,",1995-06-13,2022-11-06T00:00:00.000Z,27
1,1,1,[Toloka] English Grammar Test,105273.0,0.0,Take the test and get more paid tasks.\nAfter ...,1.0,71daea4cffae4488067aebfb6583914a,0.0,4.76,...,0.0,0.0,36185755.0,2022-11-03T19:07:13.000Z,1709755000.0,US,"EN,ES,FR,JA,RU,",1995-06-13,2022-11-06T00:00:00.000Z,27
2,2,2,Check the similarity between two products,78414.0,0.0,"In this task, you will see two product listing...",1.0,dbaa749ed83b3ef412a3167f6c53c363,0.0,4.33,...,0.69,20.03,35838995.0,2022-10-11T23:36:03.957,1709755000.0,US,"EN,ES,FR,JA,RU,",1995-06-13,2022-11-06T00:00:00.000Z,27
3,3,3,Check the similarity between two products,78414.0,0.0,"In this task, you will see two product listing...",1.0,dbaa749ed83b3ef412a3167f6c53c363,0.0,4.35,...,0.23,10.62,35838995.0,2022-10-11T23:36:04.000Z,1709755000.0,US,"EN,ES,FR,JA,RU,",1995-06-13,2022-11-06T00:00:00.000Z,27
4,4,4,Check the similarity between two products,78414.0,0.0,"In this task, you will see two product listing...",1.0,dbaa749ed83b3ef412a3167f6c53c363,0.0,4.35,...,0.23,10.62,35859153.0,2022-10-13T07:13:58.000Z,1709755000.0,US,"EN,ES,FR,JA,RU,",1995-06-13,2022-11-06T00:00:00.000Z,27


In [None]:
df1 = df.iloc[: , 2:]

In [None]:
df1.to_csv('toloka_worker_data.csv')