In [1]:
import os
from pathlib import Path

import polars as pl

In [2]:
dfs = []
folder = Path(os.getcwd()).parent / 'data/merged_dataset/synthetic'
files = os.listdir(folder)
print(files)
for f in files:
    if f.endswith('.tsv') and not f.startswith('_'):
        dfs.append(pl.read_csv(folder / f, separator='\t', quote_char='\"', infer_schema=False))

['.DS_Store', '_test_failed.tsv', 'medium_old', 'test.tsv', 'test_pc.tsv', 'test-alt.tsv', 'test-m3.tsv']


In [3]:
dfs

[shape: (301_645, 5)
 ┌─────────────────────┬─────────────────────┬─────────────────────┬──────────────┬─────────────────┐
 │ input               ┆ output              ┆ instruct            ┆ dataset_type ┆ dataloader_name │
 │ ---                 ┆ ---                 ┆ ---                 ┆ ---          ┆ ---             │
 │ str                 ┆ str                 ┆ str                 ┆ str          ┆ str             │
 ╞═════════════════════╪═════════════════════╪═════════════════════╪══════════════╪═════════════════╡
 │ input               ┆ output              ┆ instruct            ┆ dataset_type ┆ dataloader_name │
 │ Йдеться про         ┆ <p t="VERB">Йдеться ┆ Використай наведені ┆ 8            ┆ PapersDataset   │
 │ традицію феноменол… ┆ </p> <p t="…        ┆ нижче інст…         ┆              ┆                 │
 │ 3 З огляду на       ┆ <p t="NUM">3</p> <p ┆ Використай наведені ┆ 8            ┆ PapersDataset   │
 │ багатозначність …   ┆ t="ADP">З<…         ┆ нижче інст…   

In [4]:
merged = pl.concat(dfs)
shuffled_df = merged.sample(fraction=1, shuffle=True)

In [5]:
import math

train_df = shuffled_df.head(math.ceil(len(shuffled_df) * 0.60))
not_train_df = shuffled_df.tail(math.ceil(len(shuffled_df) * 0.40))
valid_df = not_train_df.head(math.ceil(len(not_train_df) * 0.40))
test_df = not_train_df.tail(math.ceil(len(not_train_df) * 0.60))


In [6]:
len(test_df), len(valid_df), len(not_train_df), len(train_df)

(383235, 255490, 638724, 958086)

In [8]:
large_folder = folder / 'large'
train_df.write_csv(large_folder / 'train.tsv', separator='\t', quote_char='\"')
valid_df.write_csv(large_folder / 'valid.tsv', separator='\t', quote_char='\"')
test_df.write_csv(large_folder / 'test.tsv', separator='\t', quote_char='\"')

In [9]:
medium_folder = folder / 'medium'
medium_coef = 0.5
train_df.sample(fraction=medium_coef, shuffle=True).write_csv(medium_folder / 'train.tsv', separator='\t', quote_char='\"')
valid_df.sample(fraction=medium_coef, shuffle=True).write_csv(medium_folder / 'valid.tsv', separator='\t', quote_char='\"')
test_df.sample(fraction=medium_coef, shuffle=True).write_csv(medium_folder / 'test.tsv', separator='\t', quote_char='\"')


In [10]:
small_folder = folder / 'small'
small_coef = 0.1
train_df.sample(fraction=small_coef, shuffle=True).write_csv(small_folder / 'train.tsv', separator='\t', quote_char='\"')
valid_df.sample(fraction=small_coef, shuffle=True).write_csv(small_folder / 'valid.tsv', separator='\t', quote_char='\"')
test_df.sample(fraction=small_coef, shuffle=True).write_csv(small_folder / 'test.tsv', separator='\t', quote_char='\"')

#### Golden dataset processing

In [1]:
import os
from pathlib import Path

import polars as pl

In [2]:
folder = Path(os.getcwd()).parent / 'data/merged_dataset/golden'
files = os.listdir(folder)
train_df = pl.read_csv(folder / 'train.tsv', separator='\t', quote_char='\"', infer_schema=False)
test_df = pl.read_csv(folder / 'test.tsv', separator='\t', quote_char='\"', infer_schema=False)
train_df, test_df

(shape: (267_449, 5)
 ┌────────────────────┬────────────────────┬────────────────────┬──────────────┬────────────────────┐
 │ input              ┆ output             ┆ instruct           ┆ dataset_type ┆ dataloader_name    │
 │ ---                ┆ ---                ┆ ---                ┆ ---          ┆ ---                │
 │ str                ┆ str                ┆ str                ┆ str          ┆ str                │
 ╞════════════════════╪════════════════════╪════════════════════╪══════════════╪════════════════════╡
 │ Byte for France    ┆ Byte for France    ┆ Виправ граматичні  ┆ 1            ┆ UaGecDataset       │
 │ або “Мій досві…    ┆ або “Мій досві…    ┆ помилки в по…      ┆              ┆                    │
 │ Сьогодні розповім  ┆ Сьогодні розповім  ┆ Виправ граматичні  ┆ 1            ┆ UaGecDataset       │
 │ про те як і …      ┆ про те<g ed=…      ┆ помилки в по…      ┆              ┆                    │
 │ Моє бачення        ┆ Моє бачення        ┆ Виправ граматичн

In [8]:
import math

golden_shuffled = train_df.sample(fraction=1, shuffle=True)

val_df = golden_shuffled.tail(math.ceil(len(golden_shuffled) * 0.20))
train_df = golden_shuffled.head(math.ceil(len(golden_shuffled) * 0.80))
test_df = test_df.sample(fraction=1, shuffle=True)

In [9]:
golden_split_folder = folder / 'golden_split'
import os
if not golden_split_folder.exists():
    os.mkdir(golden_split_folder)

In [11]:
train_df.write_csv(golden_split_folder / 'train.tsv', separator='\t', quote_char='\"')
val_df.write_csv(golden_split_folder / 'valid.tsv', separator='\t', quote_char='\"')
test_df.write_csv(golden_split_folder / 'test.tsv', separator='\t', quote_char='\"')