In [3]:
import torch
from datasets import load_dataset, Dataset
import pandas as pd

In [18]:
# Adjust the path as necessary to point to your specific split file.
# Relative to the notebook's working directory, this is client 1's file as
# written by the splitting cell further down (test/clients/<i>/dataset.pt).
split_data_path = 'test/clients/1/dataset.pt'
# The file holds a pickled datasets.Dataset rather than plain tensors, so
# weights_only must be False (PyTorch >= 2.6 defaults it to True and rejects
# such files). Unpickling can run arbitrary code: only load files you created.
split_data = torch.load(split_data_path, weights_only=False)


In [19]:
split_data

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 1246
})

In [6]:
# Fetch the first half of the training split of knkarthick/dialogsum
dataset = load_dataset(
    "knkarthick/dialogsum",
    split="train[:50%]",
)


In [5]:
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 1246
})

In [7]:
# Convert to pandas DataFrame.
# Built from `dataset` (the 50% split loaded above): the name `data` is only
# bound by a cell further down, so it is undefined here on a fresh
# top-to-bottom run.
df = pd.DataFrame(dataset)

# Shuffle the DataFrame: frac=1 samples every row in random order, and the
# fixed random_state makes the resulting per-client splits reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert back to datasets.Dataset
shuffled_data = Dataset.from_pandas(df)

In [8]:
from transformers import  AutoTokenizer

# Checkpoint name; its tokenizer encodes both the prompts and the summaries
# in tokenize_function.
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
def tokenize_function(example):
    """Tokenize a batch of dialogue/summary pairs for summarization.

    Each dialogue is wrapped in a fixed instruction prompt and encoded into
    ``input_ids``; each summary is encoded into ``labels``. Both are padded to
    the tokenizer's maximum length and truncated, using the module-level
    ``tokenizer``.

    Args:
        example: A batch mapping with ``"dialogue"`` and ``"summary"`` keys,
            each holding a list of strings (as supplied by ``map`` with
            ``batched=True``).

    Returns:
        The same ``example`` mapping, with ``"input_ids"`` and ``"labels"``
        added in place.
    """
    instruction = 'Summarize the following conversation.\n\n'
    answer_cue = '\n\nSummary: '

    # Build one full prompt per dialogue in the batch.
    prompts = []
    for dialogue in example["dialogue"]:
        prompts.append(instruction + dialogue + answer_cue)

    # Identical encoding options for the inputs and the targets.
    encode_options = dict(padding="max_length", truncation=True, return_tensors="pt")

    example['input_ids'] = tokenizer(prompts, **encode_options).input_ids
    example['labels'] = tokenizer(example["summary"], **encode_options).input_ids
    return example

In [10]:
# Apply the tokenizer batch-wise: tokenize_function receives lists of rows.
tokenized_datasets = shuffled_data.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/6230 [00:00<?, ? examples/s]

In [11]:
tokenized_datasets

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'labels'],
    num_rows: 6230
})

In [12]:
# Drop every column except the two produced by tokenization.
unused_columns = ["id", "topic", "dialogue", "summary"]
tokenized_datasets = tokenized_datasets.remove_columns(unused_columns)

In [13]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 6230
})

In [14]:
# Total number of tokenized examples; the bare name displays it as cell output.
n = tokenized_datasets.num_rows
n

6230

In [15]:
# Root output directory; per-client files go under <out_dir>/clients/<i>/
out_dir = "test"

In [16]:
import os


# Create the parent directory for all client splits. exist_ok=True makes the
# cell safe to re-run and avoids a separate check-then-create step.
os.makedirs(f'{out_dir}/clients', exist_ok=True)


In [17]:

# Number of clients to split the tokenized data across
num_clients = 5

# Size of each section (integer division; any remainder goes to the last one)
section_size = n // num_clients

# Create one section per client: write each to <out_dir>/clients/<i>/dataset.pt
# and also keep the in-memory subsets in `sections`.
sections = []
for i in range(num_clients):
    subdir = f'{out_dir}/clients/{i + 1}'
    # exist_ok=True keeps the cell re-runnable
    os.makedirs(subdir, exist_ok=True)
    start_idx = i * section_size
    # For the last section, take all remaining data
    end_idx = (i + 1) * section_size if i != num_clients - 1 else n
    section = tokenized_datasets.select(range(start_idx, end_idx))
    # torch.save pickles the Dataset object; it is read back with torch.load
    torch.save(section, f'{subdir}/dataset.pt')
    sections.append(section)

# Now you have 5 sections stored in 'sections'


In [68]:
from datasets import load_dataset

# Fetch the first 20% of the training split of knkarthick/dialogsum
# (this rebinds `dataset`)
dataset = load_dataset(
    "knkarthick/dialogsum",
    split="train[:20%]",
)


In [69]:
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    num_rows: 2492
})

In [81]:
# Expose the dataset as an iterable split into 5 shards
iterable_dataset = dataset.to_iterable_dataset(num_shards=5)

# Shuffled view of that iterable, with a fixed seed and a 1000-element buffer
shuffled_iterable_dataset = iterable_dataset.shuffle(
    seed=42,
    buffer_size=1000,
)

In [82]:
shuffled_iterable_dataset

IterableDataset({
    features: ['id', 'dialogue', 'summary', 'topic'],
    n_shards: 5
})

In [84]:
import itertools

# Assuming shuffled_iterable_dataset has already been created
num_shards = 5
shards = [[] for _ in range(num_shards)]  # One empty list per shard

# Iterate over the *shuffled* stream (the unshuffled `iterable_dataset` would
# leave the shuffle above unused) and assign elements to shards
for i, element in enumerate(shuffled_iterable_dataset):
    shard_index = i % num_shards  # Assign elements to shards in a round-robin fashion
    shards[shard_index].append(element)

# Now you have 5 lists, each containing the elements of the corresponding shard
shard_0, shard_1, shard_2, shard_3, shard_4 = shards

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load the first half of the training split of the dataset named below
dataset = "knkarthick/dialogsum"
data = load_dataset(dataset, split="train[:50%]")

# Shuffle via pandas: sample every row (frac=1) in random order, then
# renumber the index from 0
df = pd.DataFrame(data).sample(frac=1).reset_index(drop=True)

# Wrap the shuffled frame back into a datasets.Dataset
shuffled_data = Dataset.from_pandas(df)

