---
# gokart task run
---
running sample task

In [1]:
import os

# Point TASK_WORKSPACE_DIRECTORY at the local folder that holds the task outputs.
os.environ.update(TASK_WORKSPACE_DIRECTORY='./resources')

In [2]:
# Print the contents of task.py, the script invoked by the `python task.py ...` cells.
!cat ./task.py

# define tasks
import gokart
import luigi
from luigi.util import requires
from logging import getLogger

# Module-level logger named after this module.
logger = getLogger(__name__)


class SampleTask(gokart.TaskOnKart):
    """Leaf task that dumps one sample message per ``(name, number)`` pair."""

    task_namespace = 'sample'
    name = luigi.Parameter()  # directory component of the output path
    number = luigi.IntParameter()  # model number embedded in the dumped message

    def requires(self):
        """Declare that this task has no upstream dependencies.

        Luigi's dependency hook is spelled ``requires``; a method named
        ``require`` is never called by the scheduler. An empty list mirrors
        luigi's default return value.
        """
        return []

    def output(self):
        """Return the target for ``<name>/sample.pkl``."""
        return self.make_target(f'{self.name}/sample.pkl')

    def run(self):
        """Dump the sample message that contains ``number``."""
        self.dump(f'this is sample output. model number: {self.number}')

        
@requires(SampleTask)
class SecondTask(gokart.TaskOnKart):
    """Downstream task that appends a note to the output of ``SampleTask``.

    ``@requires(SampleTask)`` makes ``SampleTask`` the upstream dependency, so
    this task is invoked with ``--name`` and ``--number`` in addition to
    ``--param``.
    """

    task_namespace = 'sample'
    param = luigi.Parameter()  # text appended to the upstream message

    def output(self):
        """Return the target for ``SECOND_TASK/task.pkl``."""
        # Plain literal: the path has no placeholders, so no f-string is needed.
        return self.make_target('SECOND_TASK/task.pkl')

    def run(self):
        """Load the upstream message and dump it with ``add task: <param>`` appended."""
        sample = self.load()
        self.dump(sample + f'add task: {self.param}')
        
# Command-line entry point, used as `python task.py <namespace>.<TaskName> --<param>=...`.
gokart.run()

In [3]:
# sample task run
!python task.py sample.SampleTask --name='EXAMPLE' --number=1 --local-scheduler 2> /dev/null
!python task.py sample.SampleTask --name='EXAMPLE' --number=2 --local-scheduler 2> /dev/null
!python task.py sample.SampleTask --name='EXAMPLE' --number=3 --local-scheduler 2> /dev/null
!python task.py sample.SampleTask --name='TEMP' --number=1 --local-scheduler 2> /dev/null
!python task.py sample.SampleTask --name='TEMP' --number=2 --local-scheduler 2> /dev/null
!python task.py sample.SecondTask --name='TEMP' --number=2 --param='RUN' --local-scheduler --local-temporary-directory='./resource' 2> /dev/null

In [4]:
# List the files the task runs wrote under the workspace directory.
!tree ./resources/

[0;34m./resources/[00m
├── [0;34mEXAMPLE[00m
│   ├── sample_84b0b9c5a39bce072271599c9f730660.pkl
│   ├── sample_944fc52ef5011b71b5839f035f4d7e48.pkl
│   └── sample_e883bcfad65f5fb68259d1cd4691f384.pkl
├── [0;34mSECOND_TASK[00m
│   └── task_ea1806322904199b2455d6e115c525ea.pkl
├── [0;34mTEMP[00m
│   ├── sample_d05a2ab961781d3d8eca3e2e5f0d608b.pkl
│   └── sample_d57cff8074e2560896974850e5d3174d.pkl
└── [0;34mlog[00m
    ├── [0;34mprocessing_time[00m
    │   ├── SampleTask_84b0b9c5a39bce072271599c9f730660.pkl
    │   ├── SampleTask_944fc52ef5011b71b5839f035f4d7e48.pkl
    │   ├── SampleTask_d05a2ab961781d3d8eca3e2e5f0d608b.pkl
    │   ├── SampleTask_d57cff8074e2560896974850e5d3174d.pkl
    │   ├── SampleTask_e883bcfad65f5fb68259d1cd4691f384.pkl
    │   └── SecondTask_ea1806322904199b2455d6e115c525ea.pkl
    ├── [0;34mtask_log[00m
    │   ├── SampleTask_84b0b9c5a39bce072271599c9f730660.pkl
    │   ├── SampleTask_944fc52ef5011b71b5839f035f4d7e48.pkl
    │   ├── SampleTask_d05a

---
# Init Thunderbolt
---
using thunderbolt

In [5]:
from thunderbolt import Thunderbolt

In [6]:
# Thunderbolt's 1st argument is gokart's TASK_WORKSPACE_DIRECTORY.
# Read it back from the environment so it matches the value the tasks were run with.

task_dir = os.environ['TASK_WORKSPACE_DIRECTORY']
tb = Thunderbolt(task_dir) 

100%|██████████| 6/6 [00:00<00:00, 5310.37it/s]


## Check tasks param
checking thunderbolt's task_id

In [7]:
# One row per task found in the workspace: task_id, task_name, last_modified, task_params.
df = tb.get_task_df()

In [8]:
import pandas as pd

# Allow up to 200 characters per cell so long values such as task_params are not cut off.
pd.options.display.max_colwidth = 200
df

Unnamed: 0,task_id,task_name,last_modified,task_params
0,0,SecondTask,2019-08-18 10:15:48.289674,"{'name': 'TEMP', 'number': '2', 'param': 'RUN'}"
1,1,SampleTask,2019-08-18 10:15:46.210884,"{'name': 'TEMP', 'number': '1'}"
2,2,SampleTask,2019-08-18 10:15:45.179609,"{'name': 'EXAMPLE', 'number': '3'}"
3,3,SampleTask,2019-08-18 10:15:47.255227,"{'name': 'TEMP', 'number': '2'}"
4,4,SampleTask,2019-08-18 10:15:43.040046,"{'name': 'EXAMPLE', 'number': '1'}"
5,5,SampleTask,2019-08-18 10:15:44.158041,"{'name': 'EXAMPLE', 'number': '2'}"


## thunderbolt filter
The 2nd argument, `task_filters`, is a str or a list.  
Each filter is a partial (substring) match against task names, for example:
 - 'Tag' -> HogeTag, NormalizeHogeTag, TagTask, ...
 - ['Train', 'Tag'] -> TrainModel, TrainData, HogeTag, NormalizeHogeTag, TagTask, ...

In [9]:
# Keep only tasks whose name contains 'Sample'.
tb = Thunderbolt(task_dir, task_filters='Sample') 

100%|██████████| 6/6 [00:00<00:00, 3103.83it/s]


In [10]:
# all_data=True also returns the task_hash and task_log columns.
tb.get_task_df(all_data=True)

Unnamed: 0,last_modified,task_hash,task_id,task_log,task_name,task_params
0,2019-08-18 10:15:46.210884,d57cff8074e2560896974850e5d3174d,1,{'file_path': ['./resources/TEMP/sample_d57cff8074e2560896974850e5d3174d.pkl']},SampleTask,"{'name': 'TEMP', 'number': '1'}"
1,2019-08-18 10:15:45.179609,944fc52ef5011b71b5839f035f4d7e48,2,{'file_path': ['./resources/EXAMPLE/sample_944fc52ef5011b71b5839f035f4d7e48.pkl']},SampleTask,"{'name': 'EXAMPLE', 'number': '3'}"
2,2019-08-18 10:15:47.255227,d05a2ab961781d3d8eca3e2e5f0d608b,3,{'file_path': ['./resources/TEMP/sample_d05a2ab961781d3d8eca3e2e5f0d608b.pkl']},SampleTask,"{'name': 'TEMP', 'number': '2'}"
3,2019-08-18 10:15:43.040046,e883bcfad65f5fb68259d1cd4691f384,4,{'file_path': ['./resources/EXAMPLE/sample_e883bcfad65f5fb68259d1cd4691f384.pkl']},SampleTask,"{'name': 'EXAMPLE', 'number': '1'}"
4,2019-08-18 10:15:44.158041,84b0b9c5a39bce072271599c9f730660,5,{'file_path': ['./resources/EXAMPLE/sample_84b0b9c5a39bce072271599c9f730660.pkl']},SampleTask,"{'name': 'EXAMPLE', 'number': '2'}"


---
# Data Load
---
using load method
- arg: thunderbolt's task_id
- return : data list

In [11]:
# load() returns a list of the data dumped by the task with this task_id; print each item.
for loaded in tb.load(task_id=3):
    print(loaded)

this is sample output. model number: 2


### newest data load example

In [12]:
# Load the output of the most recently modified task whose name matches 'Second'.
tb = Thunderbolt(task_dir, 'Second')
# Sort newest-first so that iloc[0] is the latest run; the default ascending
# order would put the oldest task first.
newest_task_id = (
    tb.get_task_df()
    .sort_values('last_modified', ascending=False)
    .task_id.iloc[0]
)
tb.load(task_id=newest_task_id)

100%|██████████| 6/6 [00:00<00:00, 8200.01it/s]


['this is sample output. model number: 2add task: RUN']