In [1]:
from datasets import load_dataset, DatasetDict
from collections import Counter
import re

In [2]:
# Load the "train" split of the stories dataset by its Hub repository id.
stories_dataset = load_dataset("pauhidalgoo/patufet-stories", split="train")

Downloading readme:   0%|          | 0.00/6.49k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/200M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/200M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/199313 [00:00<?, ? examples/s]

In [3]:
# Load the "train" split of the code dataset by its Hub repository id.
code_dataset = load_dataset("pauhidalgoo/patufet-code", split="train")

Downloading readme:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/64842 [00:00<?, ? examples/s]

In [4]:
# Load the "train" split of the textbooks dataset by its Hub repository id.
textbooks_dataset = load_dataset("pauhidalgoo/patufet-textbooks", split="train")

Downloading readme:   0%|          | 0.00/8.75k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/241M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/278M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/271284 [00:00<?, ? examples/s]

In [13]:


def detect_repetitions(text, ngram_size=5, repetition_threshold=15):
    """Report whether any word n-gram occurs too many times in ``text``.

    The text is split into word tokens with the regex ``\\b\\w+\\b`` and every
    window of ``ngram_size`` consecutive tokens is counted.

    Args:
        text: String to inspect. ``None`` or an empty string is treated as
            non-repetitive instead of raising.
        ngram_size: Number of consecutive words that form one n-gram.
        repetition_threshold: Minimum number of occurrences of a single
            n-gram for the text to be flagged.

    Returns:
        True if some n-gram occurs at least ``repetition_threshold`` times,
        False otherwise.
    """
    # Missing or empty text has no n-grams at all; avoid re.findall(None) raising.
    if not text:
        return False

    words = re.findall(r'\b\w+\b', text)

    ngram_counts = Counter()
    for start in range(len(words) - ngram_size + 1):
        # Tokens matched by \w+ never contain spaces, so a tuple of tokens
        # identifies an n-gram exactly as the space-joined string would.
        ngram = tuple(words[start:start + ngram_size])
        ngram_counts[ngram] += 1
        # Stop as soon as one n-gram reaches the threshold; the remaining
        # text cannot change the answer.
        if ngram_counts[ngram] >= repetition_threshold:
            return True

    return False

def filter_repetitive_examples(example, text_col='solution'):
    """Predicate for ``Dataset.filter``: keep examples whose text is not repetitive.

    Args:
        example: Mapping-like dataset row.
        text_col: Name of the column holding the text to check. Defaults to
            ``'solution'``; pass another name (for instance through
            ``Dataset.filter(..., fn_kwargs={'text_col': ...})``) for datasets
            that store their text in a different column.

    Returns:
        True when ``detect_repetitions`` does not flag the text (keep the row),
        False when it does (drop the row).
    """
    return not detect_repetitions(example[text_col])


In [10]:
# The stories text is in the 'Story' column, so run the repetition check on it
# directly. filter_repetitive_examples reads 'solution', a column this dataset
# does not have, and would raise KeyError when the notebook is run top to bottom.
stories_dataset = stories_dataset.filter(
    lambda example: not detect_repetitions(example['Story'])
)

Filter:   0%|          | 0/199313 [00:00<?, ? examples/s]

In [11]:
# Show (rows, columns) after filtering, to compare with the row count before the filter.
stories_dataset.shape

(199296, 2)

In [14]:
# Drop code examples whose 'solution' text is flagged by detect_repetitions.
code_dataset = code_dataset.filter(filter_repetitive_examples)

Filter:   0%|          | 0/64842 [00:00<?, ? examples/s]

In [15]:
# Show (rows, columns) after filtering, to compare with the row count before the filter.
code_dataset.shape

(64763, 6)

In [16]:
# Load the "train" split of the educat dataset by its Hub repository id.
# It is concatenated later as-is, with no repetition filter or column renaming applied.
educat_dataset = load_dataset("pauhidalgoo/patufet-educat", split="train")

Downloading readme:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/30 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/3468535 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [18]:
from datasets import load_dataset, DatasetDict, concatenate_datasets


def select_and_rename_columns(example, text_col, other_cols):
    """Build a row that exposes the main text under the key ``'text'``.

    Args:
        example: Mapping-like row; must contain ``text_col`` and support ``.get``.
        text_col: Name of the column whose value becomes ``'text'``.
        other_cols: Iterable of additional column names to carry over.

    Returns:
        A dict with ``'text'`` first, followed by one entry per name in
        ``other_cols``; names absent from ``example`` map to ``None``.
    """
    extras = {name: example.get(name, None) for name in other_cols}
    return {'text': example[text_col], **extras}

# Extra columns to carry over from each source next to the unified 'text' column.
textbooks_columns = ['field', 'audience']
stories_columns = ['Prompt']
code_columns = ['exercise', 'style']

# Each source stores its main text under a different column name; pass that
# name and the extra columns to the shared helper through fn_kwargs.
dataset_textbooks = textbooks_dataset.map(
    select_and_rename_columns,
    fn_kwargs={'text_col': 'text', 'other_cols': textbooks_columns},
)
dataset_stories = stories_dataset.map(
    select_and_rename_columns,
    fn_kwargs={'text_col': 'Story', 'other_cols': stories_columns},
)
dataset_code = code_dataset.map(
    select_and_rename_columns,
    fn_kwargs={'text_col': 'solution', 'other_cols': code_columns},
)


Map:   0%|          | 0/271284 [00:00<?, ? examples/s]

Map:   0%|          | 0/199296 [00:00<?, ? examples/s]

Map:   0%|          | 0/64763 [00:00<?, ? examples/s]

In [23]:
# Drop textbook-only columns; presumably so the schema lines up with the other
# sources before concatenation (confirm against the final feature set).
dataset_textbooks = dataset_textbooks.remove_columns(["subtopic","chapter","subunit"])

In [26]:
# 'Story' was already copied into 'text' by the map step, so drop the original
# column, then rename 'Prompt' to lowercase 'prompt'.
dataset_stories = (
    dataset_stories
    .remove_columns("Story")
    .rename_column("Prompt", "prompt")
)

In [28]:
# 'solution' was already copied into 'text' by the map step; drop it together with 'topic' and 'style'.
# NOTE(review): 'style' is listed in code_columns for the map step but removed here; confirm which is intended.
dataset_code = dataset_code.remove_columns(["topic", "style", "solution"])

In [29]:
# Stack the four sources row-wise into one dataset.
# NOTE(review): assumes the four datasets have compatible columns at this point; verify.
combined_dataset = concatenate_datasets([educat_dataset, dataset_textbooks, dataset_stories, dataset_code])

In [31]:
# Mix rows from the different sources. A fixed seed makes the resulting order,
# and therefore the uploaded dataset, reproducible across runs.
combined_dataset = combined_dataset.shuffle(seed=42)

In [32]:
from huggingface_hub import create_repo

# Target dataset repository, defined once so creation and upload cannot drift apart.
repo_id = "pauhidalgoo/patufet-pretrain"

# exist_ok=True keeps this cell re-runnable: without it create_repo fails once
# the repository already exists.
repo_url = create_repo(repo_id, repo_type="dataset", private=False, exist_ok=True)

combined_dataset.push_to_hub(repo_id)

# Reached only if the upload above did not raise.
print("Dataset uploaded successfully to Hugging Face!")

Uploading the dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Dataset uploaded successfully to Hugging Face!
