In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from bs4 import BeautifulSoup
import os

# Load dataset
dataset = load_dataset("HuggingFaceM4/WebSight", "v0.2", split="train[:1%]")

# Tokenizer and Model Initialization
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Data Preprocessing
def preprocess_function(examples):
    inputs = examples['image']  # Placeholder for image preprocessing
    targets = tokenizer(examples['html'], truncation=True, padding="max_length", max_length=512)
    return {"input_ids": inputs, "labels": targets["input_ids"]}

# Preprocess dataset
processed_dataset = dataset.map(preprocess_function, batched=True)

# Split dataset into train and eval
train_data, eval_data = train_test_split(processed_dataset, test_size=0.1)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True
)

# Trainer Setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data
)

# Model Training
trainer.train()

# Save the trained model
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# Evaluation with BLEU Score and Structural Validity
def evaluate_model(trainer, eval_data):
    predictions = trainer.predict(eval_data)
    decoded_preds = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(predictions.label_ids, skip_special_tokens=True)

    bleu_scores = [sentence_bleu([label.split()], pred.split()) for label, pred in zip(decoded_labels, decoded_preds)]
    structural_validity = [is_valid_html(pred) for pred in decoded_preds]

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    valid_html_ratio = sum(structural_validity) / len(structural_validity)

    print(f"Average BLEU Score: {avg_bleu}")
    print(f"Valid HTML Ratio: {valid_html_ratio}")

# Check if generated HTML is structurally valid
def is_valid_html(html):
    try:
        BeautifulSoup(html, "html.parser")
        return True
    except Exception as e:
        return False

# Run Evaluation
evaluate_model(trainer, eval_data)

# Demonstration on Google Colab
def generate_html(image_input):
    inputs = tokenizer(image_input, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example of generating HTML from an image input
example_input = "path_to_example_image.jpg"  # Replace with actual image input
generated_html = generate_html(example_input)
print(generated_html)

# Fine-tuning (Optional)
def fine_tune_model(train_data, eval_data):
    fine_tune_args = Seq2SeqTrainingArguments(
        output_dir="./fine_tuned_results",
        evaluation_strategy="epoch",
        learning_rate=1e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=5,
        predict_with_generate=True,
        fp16=True
    )

    fine_tune_trainer = Seq2SeqTrainer(
        model=model,
        args=fine_tune_args,
        train_dataset=train_data,
        eval_dataset=eval_data
    )

    fine_tune_trainer.train()
    fine_tune_trainer.save_model("./fine_tuned_model")

# Uncomment to fine-tune the model
# fine_tune_model(train_data, eval_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/738 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/738 [00:00<?, ?files/s]

(…)-00000-of-00738-80a58552f2fb3344.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00001-of-00738-b79d703dc79dce91.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00002-of-00738-0ef7f6c3b72ade6f.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00003-of-00738-e6289cb9751c0dda.parquet:   0%|          | 0.00/425M [00:00<?, ?B/s]

(…)-00004-of-00738-13efd81ccba16eae.parquet:   0%|          | 0.00/427M [00:00<?, ?B/s]

(…)-00005-of-00738-89cf78e53b934db0.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

(…)-00006-of-00738-ba36f1dbd3143674.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00007-of-00738-00b0a9a4836cf7a5.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00008-of-00738-e6a16adc30e4c153.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00009-of-00738-0b53c7e1ac5d45d6.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00010-of-00738-4dca33e27cbd5c4f.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00011-of-00738-bf556baee08afe6a.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00012-of-00738-b12bfb20dcee3543.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00013-of-00738-5a12a666a27ef1f6.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

(…)-00014-of-00738-74fd0875ce2d2fcc.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00015-of-00738-d31cc11854e1805a.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00016-of-00738-df484770f55906ef.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00017-of-00738-393991c3d348b7c9.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00018-of-00738-9b87a4a0ab4ddbf1.parquet:   0%|          | 0.00/436M [00:00<?, ?B/s]

(…)-00019-of-00738-85728518fd2883e2.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00020-of-00738-aff0b2603c73cb96.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00021-of-00738-9c190a91c336c25d.parquet:   0%|          | 0.00/429M [00:00<?, ?B/s]

(…)-00022-of-00738-4cd069bfb2bf3527.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00023-of-00738-f34f24afab9ba31b.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00024-of-00738-6f617dbc7a477341.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

(…)-00025-of-00738-448ddf2a5d4ea2e2.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00026-of-00738-5a22fb55fe17ba2e.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00027-of-00738-cd883eafa8ac906d.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

(…)-00028-of-00738-feab2f7b8bd1e884.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00029-of-00738-2aa07322f4bd9eba.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00030-of-00738-bfeb012279cb6e80.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00031-of-00738-fae7605ef5979445.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00032-of-00738-0af23369b04d2faf.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

(…)-00033-of-00738-3af11c054498f1d6.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00034-of-00738-986637b4c7ef1828.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00035-of-00738-89945e891d180a1d.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00036-of-00738-49694d70d2d118c4.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00037-of-00738-2b7df7282bc6fafa.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

(…)-00038-of-00738-8a9aa29e7e780534.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00039-of-00738-16af679bc8e4640c.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00040-of-00738-73ce64888f12bff8.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00041-of-00738-ff79f866e059ce76.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00042-of-00738-e7e0f82b0dd070a9.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

(…)-00043-of-00738-aa297bbff45ec2e7.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00044-of-00738-38a6d4d6d9c9ea26.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00045-of-00738-8a9b892404876d2f.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

(…)-00046-of-00738-6c87d61e238677de.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00047-of-00738-2e5a3d25090105cd.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

(…)-00048-of-00738-0c458c4e482d1f60.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

(…)-00049-of-00738-91b0a2b2fee82d00.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00050-of-00738-58e03e70bce06a4f.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00051-of-00738-937813887b2bbd07.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00052-of-00738-9b18c2c0d5abfa8c.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00053-of-00738-66e6d0a7df51c121.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00054-of-00738-81b928a2cf507d49.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00055-of-00738-5bdfc740cc459e2f.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00056-of-00738-6ad018a3f537d055.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00057-of-00738-cbb8dfdc8ddc9eca.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00058-of-00738-7e71b587ad7e9e86.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00059-of-00738-e241ac1c11911749.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

(…)-00060-of-00738-7c0c327810199d09.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00061-of-00738-35bb68034e3b78f4.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

(…)-00062-of-00738-2b35fcf636b14d7c.parquet:   0%|          | 0.00/434M [00:00<?, ?B/s]

(…)-00063-of-00738-4ad5f7338aa800a0.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00064-of-00738-a8c8edbe292d593b.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00065-of-00738-2bb05f0de2407cb7.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

(…)-00066-of-00738-3303d7b164b7315a.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00067-of-00738-b1341bf00c6cc2c8.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00068-of-00738-3179a031b20bc6a1.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00069-of-00738-37ffd158fe2dcba4.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00070-of-00738-601fa2118fa27b21.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

(…)-00071-of-00738-f1792ff2b337dcb1.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00072-of-00738-7245856e610ea3aa.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00073-of-00738-8c370735679d0039.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00074-of-00738-ac62ecf7baf5b06a.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00075-of-00738-0e126ecc4babb058.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00076-of-00738-6ed820aab22bce19.parquet:   0%|          | 0.00/427M [00:00<?, ?B/s]

(…)-00077-of-00738-dcd1ed820ce9c13a.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00078-of-00738-583ac236a1c82fe2.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

(…)-00079-of-00738-f8bf78be96ba9af5.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00080-of-00738-ee740248a2abc4b2.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00081-of-00738-d9c2abc0afc32646.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00082-of-00738-69973ef14928e568.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00083-of-00738-fe4a4801a2ae2cae.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

(…)-00084-of-00738-00b77298bb7907e4.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

(…)-00085-of-00738-904a7266ccd67053.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00086-of-00738-d028d79634b2a9fe.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00087-of-00738-642ca2d354e117a9.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00088-of-00738-f32a17ccb7c6f337.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00089-of-00738-89247eed4e13de18.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00090-of-00738-1da0dc2bdfb30775.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

(…)-00091-of-00738-9feaacd3eb99a35d.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00092-of-00738-a696e516c6777aaf.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00093-of-00738-bb00ff35a4618ba6.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00094-of-00738-7a9a553724178dd5.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

(…)-00095-of-00738-29b618e637109de7.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

(…)-00096-of-00738-cbac3cac3967b554.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00097-of-00738-30725e453818e1b4.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00098-of-00738-b176a344a22fdeef.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00099-of-00738-51d02f4685c26fef.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00100-of-00738-a2842b42e290ac34.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00101-of-00738-7f97b90373eb4a35.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00102-of-00738-569302a737ad6f7f.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00103-of-00738-da3d56bc74219b84.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00104-of-00738-41c68bcc5daf3df7.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00105-of-00738-ae5c67bbafd6b071.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00106-of-00738-b8d926a4aff66410.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00107-of-00738-a0d1a728017a3944.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00108-of-00738-9afd7448324d82c4.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

(…)-00109-of-00738-b8b861dc04695181.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00110-of-00738-d93e245e95a258ec.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00111-of-00738-11b979549b77618a.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00112-of-00738-03f7f1719fbfcfe8.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00113-of-00738-83f919ad75e421a3.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00114-of-00738-23bbd3cd842468e3.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00115-of-00738-c3abfdd14fdf8ad7.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00116-of-00738-42139474d8a6193a.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00117-of-00738-075b932c68cd86ed.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

(…)-00118-of-00738-0b83c8c51a8c0f40.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00119-of-00738-b355286a9d191da7.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00120-of-00738-18c8c8aa3872fd43.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

(…)-00121-of-00738-b67252a8b9705f9a.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00122-of-00738-2ed7f0d12ef3658e.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00123-of-00738-a2891474944817a9.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00124-of-00738-18c055300b48fec0.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00125-of-00738-b9dff8caba5a03d2.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00126-of-00738-b807dacca1affbe9.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00127-of-00738-5f319b80a8a73fdd.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00128-of-00738-bcc806598dac2b38.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

(…)-00129-of-00738-7107302dcc234181.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00130-of-00738-07859e5316ba3d6b.parquet:   0%|          | 0.00/425M [00:00<?, ?B/s]

(…)-00131-of-00738-6524bf9d00dec272.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00132-of-00738-d6ad5ebe896375b9.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00133-of-00738-c70295c37bc694f9.parquet:   0%|          | 0.00/429M [00:00<?, ?B/s]

(…)-00134-of-00738-af6f9aae1a33bedc.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00135-of-00738-b0d03b1214b46857.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

(…)-00136-of-00738-47ce33ad483b309b.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00137-of-00738-872d1c13a80b7eee.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00138-of-00738-ea686dd617b3c9a1.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00139-of-00738-5553d669cc901261.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00140-of-00738-b1fa8112254a5ee5.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00141-of-00738-2f0df5c78ca2f665.parquet:   0%|          | 0.00/430M [00:00<?, ?B/s]

(…)-00142-of-00738-90d06e13940f9c3d.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00143-of-00738-cc44e1f9bbce244d.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

(…)-00144-of-00738-c8763b520b7e44ae.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00145-of-00738-32902f37a5f36167.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00146-of-00738-f11c9eda7744c643.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

(…)-00147-of-00738-14ab34bd71b034be.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00148-of-00738-0c810c1dbc5d0a82.parquet:   0%|          | 0.00/425M [00:00<?, ?B/s]

(…)-00149-of-00738-dc0833976f1ca2fa.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00150-of-00738-25214382641eebe9.parquet:   0%|          | 0.00/407M [00:00<?, ?B/s]

(…)-00151-of-00738-24b5c7b0b13f1132.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00152-of-00738-7a1e90f96b3ff080.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

(…)-00153-of-00738-ba6c2aaf2b9be434.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

(…)-00154-of-00738-9344190f9e38725c.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

(…)-00155-of-00738-1d4901d110ab7c2a.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00156-of-00738-00d1ceb3bd693f4e.parquet:   0%|          | 0.00/408M [00:00<?, ?B/s]

(…)-00157-of-00738-aea9c7b8519b8455.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

(…)-00158-of-00738-05086716ffaac492.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00159-of-00738-2266146b26c82a8e.parquet:   0%|          | 0.00/432M [00:00<?, ?B/s]

(…)-00160-of-00738-eb7100d5bfd8df93.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

(…)-00161-of-00738-164c130f60c8a9a8.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

(…)-00162-of-00738-efa81bac6324bf65.parquet:   0%|          | 0.00/402M [00:00<?, ?B/s]

(…)-00163-of-00738-70d91fa73f28beca.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00164-of-00738-a038fcb5f2ea2f85.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00165-of-00738-c409762c17c0fbfe.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00166-of-00738-e7bddf2f4ec9d720.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00167-of-00738-2c078ab9d284a1b6.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00168-of-00738-772263b262c3e5dc.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00169-of-00738-f1f26a17c467f554.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

(…)-00170-of-00738-f91d3275cba834f4.parquet:   0%|          | 0.00/422M [00:00<?, ?B/s]

(…)-00171-of-00738-b515faef144fc386.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00172-of-00738-58aeee814d44cd6a.parquet:   0%|          | 0.00/409M [00:00<?, ?B/s]

(…)-00173-of-00738-43cfe31b32150796.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00174-of-00738-8899c8b12942ea2f.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00175-of-00738-b74113ea563afe4e.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00176-of-00738-a37076fea6e44ea9.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00177-of-00738-08f93ba92dbe8e46.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00178-of-00738-c4522beabceabab5.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00179-of-00738-327b45b17021e805.parquet:   0%|          | 0.00/418M [00:00<?, ?B/s]

(…)-00180-of-00738-a7b88e28054bd53a.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

(…)-00181-of-00738-17260b1dc79ae69f.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]

(…)-00182-of-00738-70cdfbc2a0cb0198.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00183-of-00738-51d67dd8205e3023.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

(…)-00184-of-00738-8e5bea4b451e9801.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00185-of-00738-a7e916302563ba00.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00186-of-00738-9dfa0a2e4842f125.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00187-of-00738-fb363cbb3208fa8f.parquet:   0%|          | 0.00/429M [00:00<?, ?B/s]

(…)-00188-of-00738-ae106b6ce82b5547.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00189-of-00738-7d9c243d45a9f695.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00190-of-00738-f182c0e90a1ad94e.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

(…)-00191-of-00738-6e0da8b8a5bf7386.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00192-of-00738-d017ffffa33d881c.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00193-of-00738-c38a9587e147a9c3.parquet:   0%|          | 0.00/427M [00:00<?, ?B/s]

(…)-00194-of-00738-b14857330f68da04.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

(…)-00195-of-00738-ee93e8242f708fe2.parquet:   0%|          | 0.00/416M [00:00<?, ?B/s]

(…)-00196-of-00738-ff90e1ce5d8d6006.parquet:   0%|          | 0.00/420M [00:00<?, ?B/s]

(…)-00197-of-00738-8404fe895ca09723.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

(…)-00198-of-00738-26a9e239f4917cc8.parquet:   0%|          | 0.00/411M [00:00<?, ?B/s]

(…)-00199-of-00738-aec0a5dc2f138c8b.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00200-of-00738-fcdea91675d5371f.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00201-of-00738-ec37751a1d4eac58.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00202-of-00738-f2e985be28875937.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00203-of-00738-7fd0d43e9b2c7e35.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00204-of-00738-4c839ae5fc730673.parquet:   0%|          | 0.00/417M [00:00<?, ?B/s]

(…)-00205-of-00738-79330673af6e0fa4.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00206-of-00738-13549ee5d1d3b229.parquet:   0%|          | 0.00/419M [00:00<?, ?B/s]

(…)-00207-of-00738-17ead1d9df5987ff.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00208-of-00738-22667f4ecff17e55.parquet:   0%|          | 0.00/423M [00:00<?, ?B/s]

(…)-00209-of-00738-666130bf820f4022.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]

(…)-00210-of-00738-a8067b71f9621e81.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00211-of-00738-ba0969f249fe2d10.parquet:   0%|          | 0.00/406M [00:00<?, ?B/s]

(…)-00212-of-00738-e9c7bc4768e01f14.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

(…)-00213-of-00738-1e7e955e708514e6.parquet:   0%|          | 0.00/428M [00:00<?, ?B/s]



(…)-00214-of-00738-7dc0267fbae957a4.parquet:   0%|          | 0.00/415M [00:00<?, ?B/s]



(…)-00215-of-00738-9c52e50647fcbf94.parquet:   0%|          | 0.00/413M [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device