# Welcome to mt5-thai-QG
This is notebook detailing how to finetune **mt5 for question-generation in the Thai language** 


First, we will mount our google drive so our models don't get deleted (╯°□°)╯︵ ┻━┻ 


In [1]:
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


# Setup

Now we will install some requirements

In [2]:
# Remove "sample_data" in colab
!rm -rf sample_data
# Solve some protobuf problems
!export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION='python'
# Install programs

## Install ONNX
#%pip install torch-ort onnxruntime-training -f https://download.onnxruntime.ai/onnxruntime_stable_cu111.html
#!apt install ninja-build
#!python -m torch_ort.configure

## Install other stuff
%pip install ijson pandas torchmetrics lightning-bolts transformers sentencepiece protobuf beautifulsoup4 pytorch-lightning 
%pip install pythainlp epitran
%pip install -U nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ijson
  Downloading ijson-3.1.4-cp37-cp37m-manylinux2010_x86_64.whl (126 kB)
[K     |████████████████████████████████| 126 kB 7.5 MB/s 
Collecting torchmetrics
  Downloading torchmetrics-0.9.1-py3-none-any.whl (419 kB)
[K     |████████████████████████████████| 419 kB 60.6 MB/s 
[?25hCollecting lightning-bolts
  Downloading lightning_bolts-0.5.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 62.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 33.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 30.6 MB/s 
Collecting pytorch-lightning
  Downloading pytorch_lightning-1.6.4-py3-none-any.whl (585 kB)
[K     |██████

Then we will import all the things we need

In [3]:
import math
import urllib.request
import os
import ijson
import json
import re
from typing import Optional

import torch

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# from pl_bolts.callbacks import ORTCallback

from zipfile import ZipFile
from bs4 import BeautifulSoup
from transformers import (
    MT5ForConditionalGeneration,
    MT5TokenizerFast,
)


# Gather & Process datasets
- xquad-thai
- iapp-wiki-qa-dataset
- thaiqa

In [4]:
def download_dataset(url, file_name):
    urllib.request.urlretrieve(
        url,
        os.path.join("dataset/", file_name),
        reporthook=(
            lambda count, block, total: print(
                f"Downloading {file_name}: {math.floor((count * block) / total * 100)}%",
                end="\r",
            )
        ),
    )
    print(f"Downloaded {file_name} from {url}")


# Check if the dataset already exists
if not (
    os.path.exists("dataset/xquad.json")
    and os.path.exists("dataset/iapp-thai-wikipedia-qa.json")
):
    os.mkdir("dataset")
    # Download all datasets
    download_dataset(
        "https://github.com/deepmind/xquad/raw/master/xquad.th.json", "xquad.json"
    )
    download_dataset(
        "https://raw.githubusercontent.com/iapp-technology/iapp-wiki-qa-dataset/main/iapp-thai-wikipedia-qa-1961-docs-9170-questions.json",
        "iapp-thai-wikipedia-qa.json",
    )
    download_dataset(
        "https://github.com/PyThaiNLP/thaiqa_squad/raw/main/data.zip", "thaiqa.zip"
    )
    with ZipFile("dataset/thaiqa.zip") as zipfile:
        os.mkdir("dataset/thaiqa")
        zipfile.extractall("dataset/thaiqa/")

# This list will store all the Q&A
source_list = []
target_list = []

# Start cleaning data
squad = open(os.path.join("dataset/", "xquad.json"))
iapp = open(os.path.join("dataset/", "iapp-thai-wikipedia-qa.json"))
iapp_keys = open(os.path.join("dataset/", "iapp-thai-wikipedia-qa.json"))
thaiqa = open(os.path.join("dataset/thaiqa/data/train.jsonl"))

squad_json = ijson.items(squad, "data.item")
iapp_json = json.load(iapp)
iapp_keys = ijson.kvitems(iapp_keys, "db")
thaiqa_df = pd.read_json(thaiqa, lines=True)

# Get data from xquad
for obj in squad_json:
    paragraphs = obj["paragraphs"]
    for p in paragraphs:
        context = p["context"]
        qas = [p for p in p["qas"] if len(p) > 0]

        source_text = f"สร้าง {len(qas)} คำถาม: {context}"
        target_text = ""

        for number, qa in enumerate(qas):
            target_text += (
                f"{number + 1}. {qa['question']} A: {qa['answers'][0]['text']} "
            )

        source_list.append(source_text.strip())
        target_list.append(target_text.strip())

# Get dataset from iapp
for key in iapp_keys:
    try:
        obj = iapp_json["db"][key[0]]
        context = obj["detail"]
        qas = obj["QA"]
        target_text = ""

        qa_amount = 0

        for number, qa in enumerate(qas):
            if len(qa["a"]) != 0 and len(qa["q"]) != 0:
                target_text += f"{number + 1}. {qa['q']} A: {qa['a'][0]} "
                qa_amount += 1

        source_text = f"สร้าง {qa_amount} คำถาม: {context}"
        source_list.append(source_text.strip())
        target_list.append(target_text.strip())

    except KeyError as e:
        # Due to the dataset, there will always be a keyerror on "detail" which is the dataset's metadata
        if str(e) != "'detail'":
            print(f"KeyError: {e}")

# Get data from thaiqa
article_ids = set(thaiqa_df["article_id"])
for id in article_ids:
    questions = thaiqa_df[thaiqa_df["article_id"] == id]

    # Remove html markup
    soup = BeautifulSoup(questions["context"].iloc[0])

    # Remove parenthesis because some are empty
    context = re.sub(r"\(\)", "", soup.text)

    # Remove double spaces resulting from removing parenthesis
    context = re.sub(r"\s\s+", " ", context)

    source_text = f"สร้าง {len(questions)} คำถาม: {context}"
    target_text = ""

    qa_number = 1
    for _, question in questions.iterrows():
        target_text += f"{qa_number}. {question['question']} A: {question['answer']} "
        qa_number += 1

    source_list.append(source_text.strip())
    target_list.append(target_text.strip())

dataframe = pd.DataFrame({"source_text": source_list, "target_text": target_list})
dataframe


Downloaded xquad.json from https://github.com/deepmind/xquad/raw/master/xquad.th.json
Downloaded iapp-thai-wikipedia-qa.json from https://raw.githubusercontent.com/iapp-technology/iapp-wiki-qa-dataset/main/iapp-thai-wikipedia-qa-1961-docs-9170-questions.json
Downloaded thaiqa.zip from https://github.com/PyThaiNLP/thaiqa_squad/raw/main/data.zip


Unnamed: 0,source_text,target_text
0,สร้าง 14 คำถาม: ﻿ทีมรับของแพนเธอร์สถอดใจที่คะแ...,1. ทีมรับของแพนเธอร์สยอมแพ้ที่คะแนนเท่าไร A: 3...
1,สร้าง 16 คำถาม: ทีมบรอนคอส เอาชนะทีม พิตต์สเบิ...,1. ใครพ่ายแพ้ให้แก่ทีมบรอนคอสในรอบดิวิชั่น A: ...
2,สร้าง 17 คำถาม: เพย์ตัน แมนนิง กลายเป็นควอเตอร...,1. เพย์ตัน แมนนิง อายุเท่าไรตอนที่เขาเล่นในซูเ...
3,สร้าง 12 คำถาม: เลดีกากา ซึ่งชนะรางวัลแกรมมี ห...,1. เลดีกากา ชนะแกรมมีกี่รางวัล A: หก 2. เลดีกา...
4,สร้าง 15 คำถาม: ขณะที่เหลือเวลาอีก 4:51 นาทีแค...,1. แคโรไลนาเริ่มเล่นที่เส้นหลาที่เท่าไรเมื่อมี...
...,...,...
4485,สร้าง 2 คำถาม: ปกเกล้า อนันต์ สิบตำรวจตรี ปกเก...,1. ในปี 2560 ปกเกล้า อนันต์ เล่นในตำแหน่งกองกล...
4486,สร้าง 2 คำถาม: ธนาคารกสิกรไทย ธนาคารกสิกรไทย จ...,1. ธนาคารกสิกรไทยเป็นธนาคารในประเทศไทย มีสำนัก...
4487,สร้าง 2 คำถาม: ดิเรกสิน รัตนสิน พันโท ดิเรกสิน...,1. พันโท ดิเรกสิน รัตนสิน เกิดเมื่อวันที่เท่าไ...
4488,สร้าง 2 คำถาม: วรพล ทองคำชู วรพล ทองคำชู นักเท...,1. นักเทนนิสชายไทย วรพล ทองคำชู ได้เหรียญทองจา...


And then split our dataframe into train, valid, and test sets.

In [5]:
index_train_split = math.floor(dataframe.shape[0] * 0.8)
train_df, valid_test = (
    dataframe.iloc[
        :index_train_split,
    ],
    dataframe.iloc[
        index_train_split:,
    ],
)

index_test_split = math.floor(valid_test.shape[0] * 0.5)
valid_df, test_df = (
    valid_test.iloc[
        :index_test_split,
    ],
    valid_test.iloc[
        index_test_split:,
    ],
)


# Training
Now we can finally get to the nitty-gritty and start defining our training loops

In [6]:
pl.seed_everything(16)
torch.cuda.empty_cache()


class MT5Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.data = df.reset_index()
        self.tokenizer = tokenizer
        self.source_max_len = 1024
        self.target_max_len = 1024

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_row = self.data.iloc[idx]
        source, target = data_row["source_text"], data_row["target_text"]

        source_encoding = self.tokenizer(
            source,
            padding="max_length",
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        target_encoding = self.tokenizer(
            target,
            padding="max_length",
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Ensure labels are correct (see huggingface T5 training documentation)
        labels = target_encoding.input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            input_ids=source_encoding.input_ids.flatten(),
            attention_mask=source_encoding.attention_mask.flatten(),
            decoder_input_ids=labels.flatten(),
            decoder_attention_mask=target_encoding.attention_mask.flatten(),
        )


class MT5DataModule(pl.LightningDataModule):
    def __init__(
        self,
        tokenizer,
        train_df,
        valid_df,
        test_df,
        batch_size: int = 1,
        num_workers: int = 2,
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df
        self.tokenizer = tokenizer

    def setup(self, stage: Optional[str] = None, batch_size=1):
        self.batch_size = batch_size
        if stage == "fit" or stage is None:
            self.train_data = MT5Dataset(self.train_df, self.tokenizer)
            self.valid_data = MT5Dataset(self.valid_df, self.tokenizer)

        if stage == "test" or stage is None:
            self.test_data = MT5Dataset(self.test_df, self.tokenizer)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(
            self.train_data, batch_size=self.batch_size, shuffle=True
        )

    def val_dataloader(self):
        return torch.utils.data.DataLoader(
            self.valid_data, batch_size=self.batch_size, shuffle=False
        )

    def test_dataloader(self):
        return torch.utils.data.DataLoader(
            self.test_data, batch_size=self.batch_size, shuffle=False
        )


class MT5Lightning(pl.LightningModule):
    def __init__(self, model, tokenizer):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.avg_training_loss = None
        self.avg_val_loss = None

    def forward(
        self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask
    ):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_input_ids = batch["decoder_input_ids"]
        decoder_attention_mask = batch["decoder_attention_mask"]

        output = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.log("loss", output[0], prog_bar=True, on_step=True, on_epoch=True)

        return output[0]

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_input_ids = batch["decoder_input_ids"]
        decoder_attention_mask = batch["decoder_attention_mask"]

        output = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.log("val_loss", output[0], prog_bar=True, on_step=True, on_epoch=True)

        return output[0]

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_input_ids = batch["decoder_input_ids"]
        decoder_attention_mask = batch["decoder_attention_mask"]

        output = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
        )

        self.log("test_loss", output.loss, prog_bar=True)

        return output.loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=3e-4)

    def training_epoch_end(self, training_step_outputs):
        self.avg_training_loss = np.round(
            torch.mean(torch.stack([x["loss"] for x in training_step_outputs])).item(),
            4,
        )
        path = ""
        if os.path.exists("drive"):
            path += "drive/MyDrive/mt5-thai-qg/"
        else:
            path += "outputs/"
        path += f"mt5-qg-epoch-{self.current_epoch}-train-loss-{self.avg_training_loss}-val-loss-{self.avg_val_loss}"
        self.tokenizer.save_pretrained(path)
        self.model.save_pretrained(path)

    def validation_epoch_end(self, validation_step_outputs):
        _loss = [x.cpu() for x in validation_step_outputs]
        self.avg_val_loss = np.round(
            torch.mean(torch.stack(_loss)).item(),
            4,
        )


Global seed set to 16


## Actually Train
Start actually training

In [None]:
model = MT5ForConditionalGeneration.from_pretrained(
    "google/mt5-small", return_dict=True
)
tokenizer = MT5TokenizerFast.from_pretrained("google/mt5-small")
dataset = MT5DataModule(tokenizer, train_df, valid_df, test_df, batch_size=1)

MT5Model = MT5Lightning(model, tokenizer)

callbacks = []
callbacks.append(EarlyStopping(monitor="val_loss", mode="min"))
# callbacks.append(ORTCallback())

trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    logger=True,
    max_epochs=20,
    log_every_n_steps=1,
    callbacks=callbacks,
    profiler="simple",
)
trainer.fit(MT5Model, dataset)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


# Testing & Inferencing
Now we will load up our model and do some inference.

## Inferencing

In [7]:
model = MT5ForConditionalGeneration.from_pretrained(
    "drive/MyDrive/mt5-thai-qg/mt5-qg-epoch-4-train-loss-0.961-val-loss-0.9701",
    return_dict=True,
)
tokenizer = MT5TokenizerFast.from_pretrained(
    "drive/MyDrive/mt5-thai-qg/mt5-qg-epoch-4-train-loss-0.961-val-loss-0.9701"
)

model.cuda()


def predict(text):
    with torch.no_grad():
        input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)

        input_ids = input_ids.cuda()

        generated_ids = model.generate(
            input_ids=input_ids,
            num_beams=2,
            max_length=1024,
            repetition_penalty=1.5,
            length_penalty=1.0,
            early_stopping=True,
            top_p=50,
            top_k=0.95,
            num_return_sequences=1,
        )

        preds = [
            tokenizer.decode(
                g,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for g in generated_ids
        ]
    return preds


Run this cell below to try out the model!

In [None]:
text_to_predict = """สร้าง 3 คำถาม: ดันเจียนซีจ (อังกฤษ: Dungeon Siege) เป็นเกมแอ็กชันเล่นตามบทบาทที่พัฒนาโดยแก๊สเพาเวิร์ดเกมส์ ซึ่งไมโครซอฟท์ได้จัดจำหน่ายบนแพลตฟอร์มไมโครซอฟท์ วินโดวส์ ในเดือนเมษายน ค.ศ. 2002 และเดสทิเนียร์ได้จัดจำหน่ายบนแพลตฟอร์มแมคโอเอสเท็นในปีถัดไป โดยมีฉากอยู่ในอาณาจักรยุคกลางสมมติ ชื่อ เอห์บ เกมนี้ยังจัดเป็นแนวแฟนตาซีระดับสูงที่เดินเรื่องตามชาวไร่หนุ่มคนหนึ่งและเพื่อนร่วมทางขณะที่พวกเขาออกเดินทางเพื่อกำจัดกองกำลังที่รุกราน ในตอนแรก กลุ่มตัวเอกเพียงต้องการเตือนเมืองใกล้เคียงเกี่ยวกับการรุกรานของเผ่าพันธุ์สิ่งมีชีวิตที่ชื่อครุก และในอีกไม่นาน ชาวไร่คนดังกล่าวและเพื่อนร่วมทางกับเขาตกอยู่ในสถานการณ์หาทางเอาชนะเผ่าพันธุ์อื่นที่เรียกว่าเซกอย่างหลีกเลี่ยงไม่ได้ ซึ่งฟื้นคืนพลังใหม่หลังจากถูกคุมขังอยู่ 300 ปี โลกของดันเจียนซีจไม่ใช้ระบบเลเวลเหมือนกับวิดีโอเกมเล่นตามบทบาทอื่น ๆ ในยุคนั้น หากแต่เป็นพื้นที่เดียวที่ต่อเนื่อง โดยปราศจากการโหลดหน้าจอ ซึ่งผู้เล่นเดินทางผ่านเพื่อต่อสู้กับฝูงศัตรู นอกจากนี้ แทนที่จะกำหนดคลาสตัวละครและควบคุมตัวละครทั้งหมดในกลุ่มด้วยตนเอง ผู้เล่นจะควบคุมกลยุทธ์และอาวุธ ตลอดจนการใช้เวทมนตร์โดยรวมของพวกเขา ซึ่งกำกับการเติบโตของตัวละคร"""
print(predict(text_to_predict))


## Evaluate Our Model

Setup our dataset for testm mode

In [9]:
dataset = MT5DataModule(tokenizer, train_df, valid_df, test_df, batch_size=4)
dataset.setup(stage="test")
test_loader = dataset.test_dataloader()


Define a preprocessing function so that our metrics don't die

In [10]:
from pythainlp import word_tokenize


def pre_process(texts):
    final = []
    for text in texts:
        final.append(" ".join(word_tokenize(text, keep_whitespace=False)))
    return final


Our evaluation will use the following metrics:
* ROUGE
* CHRF
* GLEU
* METEOR

Begin the evaluation!

In [15]:
import nltk
from torchmetrics.text.rouge import ROUGEScore
from torchmetrics import CHRFScore
from nltk.translate import meteor_score, gleu_score, bleu_score

model.cuda()

nltk.download("punkt")

rouge = ROUGEScore()
chrf = CHRFScore()

bleu_avg = 0
meteor_avg = 0
gleu_avg = 0

labels_collect = []
preds_collect = []

for batch in test_loader:
    generated_ids = model.generate(
        input_ids=batch["input_ids"].cuda(),
        num_beams=2,
        max_length=1024,
        repetition_penalty=1.5,
        length_penalty=1.0,
        early_stopping=True,
        top_p=50,
        top_k=0.95,
        num_return_sequences=1,
    )

    preds = [
        tokenizer.decode(
            g,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for g in generated_ids
    ]

    batch["decoder_input_ids"][
        batch["decoder_input_ids"] == -100
    ] = tokenizer.pad_token_id

    labels = [
        tokenizer.decode(
            g,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for g in batch["decoder_input_ids"]
    ]

    preds = pre_process(preds)
    labels = pre_process(labels)

    for p, l in zip(preds, labels):
        meteor_avg += meteor_score.single_meteor_score(p.split(" "), l.split(" "))
        gleu_avg += gleu_score.sentence_gleu(p.split(" "), l.split(" "))
        bleu_avg += bleu_score.sentence_bleu([p.split(" ")], l.split(" "), weights=(0.25, 0.25, 0.25, 0.25))

    chrf(preds, labels)
    rouge(preds, labels)
    preds_collect.append(preds)
    labels_collect.append(labels)

print("------")
print(f"Meteor: {meteor_avg / (4 * 113)}")
print(f"GLEU: {gleu_avg / (4 * 113)}")
print(f"BLEU: {bleu_avg / (4 * 113)}")
print(f"CHRF: {chrf.compute().item()}")
print(f"ROUGE: {rouge.compute()}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  total_n_grams[n] = tensor(sum(n_grams_counts[n].values()))
  for n_gram in hyp_n_grams_counts[n]


------
Meteor: 0.5409517550337343
GLEU: 0.011632674913384072
BLEU: 1.40121632714385e-157
CHRF: 0.4429878294467926
ROUGE: {'rouge1_fmeasure': tensor(0.8655), 'rouge1_precision': tensor(0.8883), 'rouge1_recall': tensor(0.8775), 'rouge2_fmeasure': tensor(0.7003), 'rouge2_precision': tensor(0.7207), 'rouge2_recall': tensor(0.7169), 'rougeL_fmeasure': tensor(0.8457), 'rougeL_precision': tensor(0.8684), 'rougeL_recall': tensor(0.8574), 'rougeLsum_fmeasure': tensor(0.8599), 'rougeLsum_precision': tensor(0.8825), 'rougeLsum_recall': tensor(0.8720)}


# Misc
Below are some other useful chunks of code

## Export prediction & labels

In [None]:
print(labels_collect[0][1])
print(preds_collect[0][1])

label_final = [i for x in labels_collect for i in x]
pred_final = [i for x in preds_collect for i in x]


export = pd.DataFrame(
    data=zip(label_final, pred_final), columns=["Labels", "Predictions"]
)
export.to_json("output.json")


## Zip up Outputs
Zip up the outputs & logs folder (with the models) to prepare them for exporting using zstd.

### Installing dependencies

In [None]:
!apt install tar zstd

### Zipping up everything

In [None]:
!tar -c -I "zstd -19 -T0" -f "mt5-thai-qg.tar.zst" outputs/ lightning_logs/