In [1]:
import pandas as pd
from datasets import load_dataset, Dataset

# Load the "train" split of every source dataset as a pandas DataFrame
def _train_split_as_frame(repo_id, *config):
    """Return the train split of ``repo_id`` (optional config name) as a DataFrame."""
    return load_dataset(repo_id, *config, split="train").to_pandas()

gsm8k_data = _train_split_as_frame("openai/gsm8k", "main")
dolly_data = _train_split_as_frame("philschmid/dolly-15k-oai-style")
pubmed_data = _train_split_as_frame("fedml/PubMedQA_instruction")
chatdoctor_data = _train_split_as_frame("LinhDuong/chatdoctor-200k")

# Preprocess datasets
def preprocess_gsm8k(df):
    """Convert a GSM8K frame into the shared ``inputs``/``outputs`` layout.

    Args:
        df: DataFrame with text columns ``question`` and ``answer``.

    Returns:
        A new DataFrame, indexed like ``df``, whose ``inputs`` column is
        ``"Question: " + question`` and whose ``outputs`` column is
        ``"Answer: " + answer``. ``df`` itself is not modified.
    """
    # Build a fresh frame instead of adding columns to ``df`` so the caller's
    # raw data is left exactly as it was loaded.
    return pd.DataFrame({
        'inputs': "Question: " + df['question'],
        'outputs': "Answer: " + df['answer'],
    })

def preprocess_dolly(df):
    """Convert a chat-style Dolly frame into the shared ``inputs``/``outputs`` layout.

    Args:
        df: DataFrame with a ``messages`` column; each cell is a sequence of
            dicts carrying ``role`` and ``content`` keys.

    Returns:
        A new DataFrame, indexed like ``df``. ``inputs`` is the content of the
        first message when its role is ``'user'``; ``outputs`` is the content
        of the second message when its role is ``'assistant'``. A cell is
        missing (``None``) when the message at that position has a different
        role or does not exist. ``df`` itself is not modified.
    """
    def _content_at(messages, position, role):
        # A conversation that is absent or too short yields a missing value
        # instead of raising IndexError.
        if messages is None or len(messages) <= position:
            return None
        message = messages[position]
        return message['content'] if message['role'] == role else None

    # One pass per output column; results are attached to df's own index so
    # row alignment is the same as with the original data.
    user_content = [_content_at(m, 0, 'user') for m in df['messages']]
    assistant_content = [_content_at(m, 1, 'assistant') for m in df['messages']]
    return pd.DataFrame(
        {'inputs': user_content, 'outputs': assistant_content},
        index=df.index,
    )

def preprocess_pubmed(df):
    """Convert a PubMedQA instruction frame into the shared ``inputs``/``outputs`` layout.

    Args:
        df: DataFrame with text columns ``instruction``, ``context`` and
            ``response``.

    Returns:
        A new DataFrame, indexed like ``df``, whose ``inputs`` column is
        ``instruction`` and ``context`` joined by a single space and whose
        ``outputs`` column is ``response``. ``df`` itself is not modified.
    """
    # Build a fresh frame instead of adding columns to ``df`` so the caller's
    # raw data is left exactly as it was loaded.
    return pd.DataFrame({
        'inputs': df['instruction'] + " " + df['context'],
        'outputs': df['response'],
    })

def preprocess_chatdoctor(df):
    """Convert a ChatDoctor frame into the shared ``inputs``/``outputs`` layout.

    Args:
        df: DataFrame with text columns ``instruction``, ``input`` and
            ``output``.

    Returns:
        A new DataFrame, indexed like ``df``, whose ``inputs`` column is
        ``instruction`` and ``input`` joined by a single space and whose
        ``outputs`` column is ``output``. ``df`` itself is not modified.
    """
    # Build a fresh frame instead of adding columns to ``df`` so the caller's
    # raw data is left exactly as it was loaded.
    return pd.DataFrame({
        'inputs': df['instruction'] + " " + df['input'],
        'outputs': df['output'],
    })

# Map every raw frame onto the common two-column (inputs, outputs) layout
gsm8k_preprocessed = preprocess_gsm8k(gsm8k_data)
dolly_preprocessed = preprocess_dolly(dolly_data)
pubmed_preprocessed = preprocess_pubmed(pubmed_data)
chatdoctor_preprocessed = preprocess_chatdoctor(chatdoctor_data)

# Stack the four frames end to end under a fresh 0..n-1 index
preprocessed_frames = [
    gsm8k_preprocessed,
    dolly_preprocessed,
    pubmed_preprocessed,
    chatdoctor_preprocessed,
]
combined_df = pd.concat(preprocessed_frames, ignore_index=True)

# Wrap the merged frame in a Datasets object
combined_dataset = Dataset.from_pandas(combined_df)

# Persist the merged dataset to disk
combined_dataset.save_to_disk("C:/AI_Stuff/data_preprocessed")

# Print the dataset summary (features and row count) as a sanity check
print(combined_dataset)


Saving the dataset (0/2 shards):   0%|          | 0/502410 [00:00<?, ? examples/s]

Dataset({
    features: ['inputs', 'outputs'],
    num_rows: 502410
})


In [6]:
# Spot-check a single record of the combined dataset by row index
combined_dataset[10726]

{'inputs': 'Could you plan a canoe camping trip in Michigan? I want to canoe the river from start to end, and need specific camping locations for each night.',
 'outputs': 'I would recommend a canoe camping trip on the Au Sable River in Michigan. The river is about 114 miles long beginning in Grayling and ending in Oscoda. The river features numerous campgrounds which you will be able to camp at each night. \n\nDay 1\nWhitepine Campground\n\nDay 2\nParmalee Campground\n\nDay 3\nMio Campground\n\nDay 4\nAlcona Dam Campground\n\nDay 5\nLoud Dam Campground\n\nDay 6\nEnd at Lake Huron'}

In [7]:
import os
from datasets import load_from_disk, Dataset
from transformers import AutoTokenizer

def tokenize_and_save_dataset(preprocessed_path, processed_path, model_id="microsoft/phi-1_5", max_length=1024):
    """Tokenize a saved ``inputs``/``outputs`` dataset and write the result to disk.

    Args:
        preprocessed_path: Directory readable by ``load_from_disk`` that holds
            a dataset with ``inputs`` and ``outputs`` text columns.
        processed_path: Directory the tokenized dataset is saved to.
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Every sequence is truncated or padded to this many tokens.

    The saved dataset gains ``input_ids`` and ``attention_mask`` (tokenized
    from ``inputs``) and ``labels`` (tokenized from ``outputs``), and is set to
    the ``"torch"`` format. Padded positions in ``labels`` hold ``-100`` so a
    loss that uses that ignore index does not score padding. Rows whose
    ``inputs`` or ``outputs`` is not a string are skipped and counted in a
    printed message.

    NOTE(review): ``labels`` is tokenized from ``outputs`` independently of
    ``input_ids``. A causal-LM loss that expects labels aligned position by
    position with ``input_ids`` would need prompt and response concatenated
    into one sequence instead - confirm against the training code.
    """
    # Label value skipped by PyTorch's cross-entropy loss (its default ignore_index).
    ignore_index = -100

    # Load the preprocessed dataset
    preprocessed_dataset = load_from_disk(preprocessed_path)

    # Load the tokenizer; EOS doubles as the padding token and padding goes on the left
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'left'

    # The tokenizer rejects non-string entries, so rows with a missing prompt
    # or response are dropped up front rather than failing the whole map.
    def has_text(examples):
        return [
            isinstance(prompt, str) and isinstance(response, str)
            for prompt, response in zip(examples['inputs'], examples['outputs'])
        ]

    usable_dataset = preprocessed_dataset.filter(has_text, batched=True)
    skipped = len(preprocessed_dataset) - len(usable_dataset)
    if skipped:
        print(f"Skipped {skipped} rows with missing inputs or outputs")

    # Tokenize function
    def tokenize_function(examples):
        input_encodings = tokenizer(examples['inputs'], padding="max_length", truncation=True, max_length=max_length)
        output_encodings = tokenizer(examples['outputs'], padding="max_length", truncation=True, max_length=max_length)
        # Mask padding via the attention mask, not the token id: pad and EOS
        # share an id here, so matching on the id would also hide real EOS tokens.
        labels = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(output_encodings['input_ids'], output_encodings['attention_mask'])
        ]
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'labels': labels
        }

    # Tokenize the dataset
    tokenized_dataset = usable_dataset.map(tokenize_function, batched=True)

    # Set format to PyTorch tensors
    tokenized_dataset.set_format("torch")

    # Save the tokenized dataset
    tokenized_dataset.save_to_disk(processed_path)

# Source directory of the preprocessed text dataset and target directory for the tokenized copy
preprocessed_path = "C:/AI_Stuff/data_preprocessed"
processed_path = "C:/AI_Stuff/data_processed"

# Tokenize with the function's default model and sequence length
tokenize_and_save_dataset(
    preprocessed_path=preprocessed_path,
    processed_path=processed_path,
)


Map:   0%|          | 0/502410 [00:00<?, ? examples/s]

Saving the dataset (0/15 shards):   0%|          | 0/502410 [00:00<?, ? examples/s]