In [5]:
import re
import json
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

In [6]:
# Load and Clean Text
def load_and_clean_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces, tabs, and newlines
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-printable characters
    return text

# Split Text into Chunks
def chunk_text(text, chunk_size=3000):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current_chunk = [], ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Tokenization
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=4096)


# Load, process, and save dataset
def preprocess_dataset(file_path, model_name="mistralai/Mistral-7B-v0.1"):
    text = load_and_clean_text(file_path)
    text_chunks = chunk_text(text)
    
    dataset_dict = {"text": text_chunks}
    dataset = Dataset.from_dict(dataset_dict)
    
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    dataset = dataset.map(tokenize_function, batched=True)
    
    dataset.save_to_disk("processed_dataset")
    print("Dataset saved successfully!")



In [10]:
file_path = "D:\\project\\output.txt"  # Replace with actual file
preprocess_dataset(file_path)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|                                                                               | 0/14 [00:00<?, ? examples/s]


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [13]:
# Load and Clean Text
def load_and_clean_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    
    text = re.sub(r"\s+", " ", text)  # Remove extra spaces, tabs, and newlines
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-printable characters
    return text

# Split Text into Chunks
def chunk_text(text, chunk_size=3000):
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current_chunk = [], ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Tokenization
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=4096)

# Load, process, and save dataset
def preprocess_dataset(file_path, model_name="mistralai/Mistral-7B-v0.1"):
    text = load_and_clean_text(file_path)
    text_chunks = chunk_text(text)
    
    dataset_dict = {"text": text_chunks}
    dataset = Dataset.from_dict(dataset_dict)
    
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Fix: Set padding token
    tokenizer.pad_token = tokenizer.eos_token
    
    dataset = dataset.map(tokenize_function, batched=True)
    
    dataset.save_to_disk("D:\\project\\processed_dataset")
    print("Dataset saved successfully!")




Map: 100%|█████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 333.51 examples/s]
Saving the dataset (1/1 shards): 100%|████████████████████████████████████████| 14/14 [00:00<00:00, 1306.90 examples/s]

Dataset saved successfully!





In [None]:
# Run preprocessing
file_path = "D:\\project\\output.txt"  # Replace with actual file
preprocess_dataset(file_path)