In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
import os
from datasets import load_dataset

# Pull variables from a local .env file into the process environment;
# the return value is intentionally discarded.
_ = load_dotenv()

# NOTE(review): the token is read and immediately thrown away, and it is not
# passed to any call in this cell. Presumably the Hugging Face libraries pick
# HF_TOKEN up from the environment on their own — confirm, or drop this line.
_ = os.getenv("HF_TOKEN")

# Load the pretrained GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Reuse the EOS token as the padding token and pad on the right-hand side
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


# Load GPT-2 as a causal language model.
# NOTE(review): an earlier comment claimed the model is loaded "in 8-bit", but
# no quantization argument (e.g. load_in_8bit / quantization_config) is passed
# here, so no 8-bit loading is requested by this call.
# pad_token_id mirrors the tokenizer's EOS-as-PAD choice above.
# device_map="auto" leaves device placement to the library
# (presumably requires `accelerate` to be installed — TODO confirm).
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto"
)

In [None]:
file_name = "../data/dialogs.txt"
text_dataset = []


def _capitalize_sentences(text):
    """Capitalize every ". "-separated sentence in ``text``.

    Relies on ``str.capitalize``, so the remainder of each sentence is
    lower-cased as a side effect.
    """
    return ". ".join(sentence.capitalize() for sentence in text.split(". "))


# Each record is expected to look like "question<TAB>answer".
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open(file_name, "r", encoding="utf-8") as infile:
    # Blank lines carry no record, so they are dropped before counting.
    records = (line for line in infile if line.strip())
    for index, line in enumerate(records):
        # Only every other record (0, 2, 4, ...) is kept.
        # NOTE(review): presumably consecutive lines overlap in the source
        # data (one line's answer is the next line's question) — confirm.
        if index % 2:
            continue

        fields = line.rstrip("\r\n").split("\t")
        if len(fields) != 2:
            # Fail loudly, showing the offending line, instead of an opaque
            # tuple-unpacking error.
            raise ValueError(
                f"{file_name}: expected 'question<TAB>answer', got {line!r}"
            )

        q, a = map(_capitalize_sentences, fields)
        # Every entry ends with exactly one newline so that a later
        # writelines() call emits one example per line, even when the last
        # line of the input file has no trailing newline.
        text_dataset.append(f"Human: {q} Bot: {a}\n")
    

In [None]:
file_name = "./text_dataset.txt"

# Write the formatted dialog examples to disk, one example per line.
# UTF-8 is requested explicitly so the file does not depend on the platform's
# default encoding when it is read back by other tools.
with open(file_name, "w", encoding="utf-8") as outfile:
    # writelines() inserts no separators of its own, so make sure each entry
    # is newline-terminated; otherwise adjacent examples would be fused.
    outfile.writelines(
        entry if entry.endswith("\n") else entry + "\n"
        for entry in text_dataset
    )

In [46]:
# Read the whole corpus with the generic "text" loader; the examples are
# accessed below under the "train" key.
# NOTE(review): this reads "test_text.txt", whereas the preceding cell writes
# "./text_dataset.txt" — confirm which file is the intended input.
dataset = load_dataset(
    path="text",
    data_files="test_text.txt",
)

# Hold out 20% of the examples for validation; the fixed seed keeps the
# partition reproducible across runs.
split_dataset = (
    dataset["train"]
    .train_test_split(test_size=0.2, seed=42)
)

Generating train split: 0 examples [00:00, ? examples/s]

In [47]:
# Bind the two partitions to their own names; the "test" partition produced
# by the split serves as the validation set.
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

In [None]:
# Optional: persist each partition as line-delimited JSON records.
train_dataset.to_json("train_dataset.json", lines=True, orient="records")
val_dataset.to_json("val_dataset.json", lines=True, orient="records")

# Read both files back in one call; the validation file is exposed under the
# "test" key of the reloaded dataset.
reloaded_dataset = load_dataset(
    "json",
    data_files=dict(train="train_dataset.json", test="val_dataset.json"),
)