In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
import torch
import os
from datasets import load_dataset

# Load variables from a local .env file into the process environment
# (for example HF_TOKEN, if one is defined there). The "gpt2" checkpoint
# below is loaded without passing a token explicitly.
_ = load_dotenv()

# Load the tokenizer that matches the "gpt2" checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse the EOS token as the padding token and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Load GPT-2 with half-precision (float16) weights to save memory.
# Note: this is float16, not 8-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,  # keep the model's PAD id in sync with the tokenizer
    torch_dtype=torch.float16,
    device_map="auto",
)

In [2]:
file_name = "../data/dialogs.txt"
text_dataset = []

# Each usable line of the dialog file has the form "<question>\t<answer>".
# Explicit UTF-8 so parsing does not depend on the platform default encoding.
with open(file_name, "r", encoding="utf-8") as infile:
    for count, line in enumerate(infile):
        # Keep only every other line (even 0-based line numbers).
        if count % 2 != 0:
            continue

        # partition() never raises: a blank or malformed line simply has no
        # separator and is skipped instead of aborting the whole cell.
        q, sep, a = line.partition("\t")
        if not sep:
            continue

        # Capitalize the first letter of every ". "-separated sentence
        # (str.capitalize also lower-cases the rest of each sentence).
        q = ". ".join(x.capitalize() for x in q.split(". "))
        a = ". ".join(x.capitalize() for x in a.split(". "))

        # `a` still carries the line's trailing newline, so each entry is
        # already newline-terminated when it is written out later.
        text = f"Human: {q} Bot: {a}"
        text_dataset.append(text)

In [3]:
file_name = "./text_dataset1.txt"

# Write one example per line. writelines() adds no separators itself, so
# append a newline to any entry that does not already end with one.
# Explicit UTF-8 so the file does not depend on the platform default encoding.
with open(file_name, "w", encoding="utf-8") as outfile:
    outfile.writelines(
        entry if entry.endswith("\n") else entry + "\n"
        for entry in text_dataset
    )

In [4]:
# Read the whole text file as one dataset (exposed under the "train" key)
dataset = load_dataset(path="text", data_files="text_dataset1.txt")

# Hold out 20% of the examples for validation; the fixed seed keeps the split repeatable
split_dataset = dataset["train"].train_test_split(seed=42, test_size=0.2)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Pull the two halves of the split out under descriptive names
# (the held-out "test" part serves as the validation set)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

In [6]:
# Tokenization helper applied to the datasets
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping whose "text" entry holds the raw strings to encode.

    Returns:
        The tokenizer's output for those strings, truncated to at most
        512 tokens per example.
    """
    return tokenizer(examples["text"], max_length=512, truncation=True)

# Tokenize both splits batch-wise with identical settings, dropping the raw "text" column
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/1490 [00:00<?, ? examples/s]

Map:   0%|          | 0/373 [00:00<?, ? examples/s]

In [None]:
# Persist each split to disk as line-delimited JSON records (optional)
train_dataset.to_json("train_dataset.json", lines=True, orient="records")
val_dataset.to_json("val_dataset.json", lines=True, orient="records")

# Read both files back in a single call, keyed by split name
reloaded_dataset = load_dataset(
    path="json",
    data_files=dict(train="train_dataset.json", test="val_dataset.json"),
)