<a href="https://colab.research.google.com/github/pierretfie/python_world/blob/main/brain_ai/brain_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!git clone https://github.com/pierretfie/python_world.git
%cd /content/python_world/brain_ai/
!pip install -r requirements.txt

Cloning into 'python_world'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 176 (delta 86), reused 126 (delta 44), pack-reused 0 (from 0)[K
Receiving objects: 100% (176/176), 31.38 MiB | 16.73 MiB/s, done.
Resolving deltas: 100% (86/86), done.
/content/python_world/brain_ai
Collecting datasets (from -r requirements.txt (line 1))
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 1))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 1))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r requirements.txt (line 1))
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset, load_dataset, concatenate_datasets
from os import path
from transformers import AdamW

# Base checkpoint to fine-tune and the directory used later as the training output dir.
model_name = 'gpt2-medium'
model_path = '/content/python_world/brain_ai'

# Tokenizer matching the chosen GPT-2 checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Training splits of the three public dialogue corpora, loaded in a fixed order.
dailydialog, personachat, empathetic_dialogues = [
    load_dataset(corpus_id, split="train", trust_remote_code=True)
    for corpus_id in ("daily_dialog", "bavard/personachat_truecased", "empathetic_dialogues")
]

def format_dailydialog(example):
    """Render one DailyDialog record as alternating "User:"/"Bot:" lines.

    Turns are read from ``example['dialog']``. Even-indexed turns are labelled
    "User" and odd-indexed turns "Bot".

    Returns:
        dict: ``{'text': conversation}`` where the lines are newline-joined and
        leading/trailing whitespace is stripped.
    """
    roles = ("User", "Bot")
    lines = [
        f"{roles[position % 2]}: {utterance}"
        for position, utterance in enumerate(example['dialog'])
    ]
    return {'text': "\n".join(lines).strip()}
def format_personachat(example):
    """Render one PersonaChat record as alternating "User:"/"Bot:" lines.

    The turns come from ``example['history']`` (not ``'utterances'``). The
    first turn is attributed to "User" and speakers alternate from there.

    Returns:
        dict: ``{'text': conversation}`` with surrounding whitespace stripped.
    """
    rendered = "\n".join(
        f"{'Bot' if index % 2 else 'User'}: {line}"
        for index, line in enumerate(example['history'])
    )
    return {'text': rendered.strip()}
def format_empathetic(example):
    """Render one EmpatheticDialogues record as a single User/Bot exchange.

    ``example['context']`` becomes the "User" line and ``example['utterance']``
    the "Bot" line.

    NOTE(review): whether ``context`` actually holds the user's message (rather
    than some other annotation) depends on the dataset schema, which is not
    visible here -- confirm against the dataset card.

    Returns:
        dict: ``{'text': conversation}`` with surrounding whitespace stripped.
    """
    user_line = f"User: {example['context']}"
    bot_line = f"Bot: {example['utterance']}"
    return {'text': "\n".join((user_line, bot_line)).strip()}

def _text_only(dataset):
    """Return `dataset` with every column except 'text' removed."""
    extra_columns = [name for name in dataset.column_names if name != 'text']
    return dataset.remove_columns(extra_columns)


# Convert every corpus to a single 'text' column holding the rendered conversation.
formatted_dailydialog = _text_only(dailydialog.map(format_dailydialog))
formatted_personachat = _text_only(personachat.map(format_personachat))
formatted_empathetic = _text_only(empathetic_dialogues.map(format_empathetic))

# GPT-2 ships without a padding token; reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load the pretrained weights and make the embedding matrix match the tokenizer size.
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Hand-written seed conversations, one "User: ...\nBot: ..." exchange per entry.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which would merge two examples into one.
data = {
    'text': [
        "User: Hello!\nBot: Hi there! How can I assist you?",
        "User: Hey!\nBot: Hello! How can I help you today?",
        "User: Hi\nBot: Hey! How are you?",
        "User: What's the weather like?\nBot: I don't have access to weather data, but it's always sunny when you're here!",
        "User: Tell me a joke.\nBot: Sure! Why did the scarecrow win an award? Because he was outstanding in his field!",
        "User: Tell me a joke.\nBot: Okay! Why don’t skeletons fight each other? They don’t have the guts!",
        "User: How do I learn Python?\nBot: Start with beginner tutorials on Python.org, and then practice writing simple scripts.",
        "User: How do I learn Python?\nBot: There are some great courses on Codecademy and Coursera! Start with the basics and build up from there.",
        "User: Can you recommend a movie?\nBot: Sure! If you like sci-fi, try 'Inception'. For drama, 'The Shawshank Redemption' is a classic.",
        "User: How are you?\nBot: I'm here and ready to help! What's on your mind today?",
        "User: Goodbye\nBot: Take care! I'm here whenever you need assistance.",
        "User: What can you do?\nBot: I can answer questions, tell jokes, and even suggest movies or books if you’re interested.",
        "User: Tell me something interesting.\nBot: Did you know honey never spoils? Archaeologists have found pots of honey in ancient tombs that are still edible after thousands of years!",
        "User: Tell me something interesting.\nBot: The Eiffel Tower can be 15 cm taller during hot days, due to the expansion of iron in the heat!",

        # Basic greetings with consistent personality
        "User: Hello!\nBot: Hi there! I'm an AI assistant. How can I help you today?",
        "User: Hi\nBot: Hello! I'm here to help. What can I do for you?",

        # Questions about the bot's nature/capabilities
        "User: What are you?\nBot: I'm an AI assistant, designed to help answer questions and have conversations. I aim to be honest and direct about what I am.",
        "User: Are you real?\nBot: I'm an AI program, so while I'm not human, I can still have meaningful conversations and help you with various tasks.",

        # Emotional responses
        "User: I'm sad\nBot: I'm sorry to hear that you're feeling sad. While I can't feel emotions myself, I'm here to listen if you'd like to talk about it.",
        "User: I'm happy!\nBot: That's wonderful to hear! I'm glad things are going well for you.",

        # Follow-up questions to show engagement
        "User: I like programming\nBot: That's interesting! What programming languages do you work with? I'd be happy to discuss coding topics.",
        "User: I'm learning Python\nBot: Python is a great language to learn! What aspects of Python interest you the most? I can help with Python-related questions.",

        # Handling personal questions
        "User: How are you?\nBot: Thanks for asking! As an AI, I don't experience feelings in the way humans do, but I'm functioning well and ready to help you.",
        "User: Do you have feelings?\nBot: I want to be honest with you - I don't have real feelings or emotions. I'm an AI program designed to process and respond to text, but I can still have meaningful conversations and help you with tasks.",
    ]
}

# Wrap the hand-written examples in a Dataset so they can be merged with the public corpora.
original_dataset = Dataset.from_dict(data)

# Merge the hand-written examples with the three formatted dialogue corpora.
corpora = [
    original_dataset,
    formatted_dailydialog,
    formatted_personachat,
    formatted_empathetic,
]
combined_dataset = concatenate_datasets(corpora)

# Tokenize the data with labels
def tokenize_function(example):
    """Tokenize ``example['text']`` into fixed-length inputs with loss-masked labels.

    Each text gets the tokenizer's EOS token appended and is then padded or
    truncated to 60 tokens. ``labels`` mirrors ``input_ids`` except that padded
    positions (``attention_mask == 0``) are replaced by -100, the label value
    the Hugging Face causal-LM loss ignores. Without this, the loss is also
    computed on every padding token. The explicit EOS is needed because the
    padding token is the EOS token in this notebook: once padding is masked,
    the appended EOS is the only remaining "stop here" signal.

    Args:
        example: mapping with a ``'text'`` entry that is either a single string
            or a list of strings (as passed by ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output with an added ``'labels'`` entry, shaped like
        ``'input_ids'``.
    """
    eos = tokenizer.eos_token
    texts = example['text']
    # Support both the non-batched (str) and the batched (list of str) calling convention.
    if isinstance(texts, str):
        texts = texts + eos
    else:
        texts = [text + eos for text in texts]

    tokenized = tokenizer(texts, padding='max_length', truncation=True, max_length=60)

    def _mask_padding(ids, mask):
        # Keep real tokens as targets; exclude padded positions from the loss.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    input_ids = tokenized['input_ids']
    attention = tokenized['attention_mask']
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        tokenized['labels'] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    else:
        tokenized['labels'] = _mask_padding(input_ids, attention)
    return tokenized

# Tokenize the whole corpus in batches.
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)

# Hold out 20% of the examples for evaluation. The split shuffles the data, so
# a fixed seed keeps the train/eval partition identical across runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Hyper-parameters for the fine-tuning run, gathered in one mapping.
# NOTE(review): no gradient accumulation is configured here, and the recorded
# output of this cell ends in a CUDA out-of-memory error -- the batch size or
# precision settings may need revisiting; confirm on the target GPU.
_training_kwargs = dict(
    # Where checkpoints are written.
    output_dir=path.expanduser(model_path),
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # Logging and evaluation cadence (in steps).
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    # Checkpointing: keep at most two and restore the best one when training ends.
    save_steps=10_000,
    save_total_limit=2,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**_training_kwargs)

# Initialize Trainer with eval_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# No optimizer is built by hand here: the previous standalone AdamW object was
# never passed to the Trainer (and a trailing comma turned it into a 1-tuple),
# so it had no effect on training. The Trainer is left to create its optimizer
# from `training_args`, which is what the run was already doing.

# Train the model
trainer.train()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/219252 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 3.06 MiB is free. Process 9579 has 14.74 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 366.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Test the fine-tuned model with varied input
input_text = "User: Hello!\nBot:"

# Let the tokenizer produce the attention mask. Deriving it by comparing against
# pad_token_id would also mask genuine EOS tokens, because pad and EOS share an id here.
# Move the tensors to the model's device so generation works whether the model is
# on CPU or GPU.
encoded = tokenizer(input_text, return_tensors='pt').to(model.device)
inputs = encoded['input_ids']
attention_mask = encoded['attention_mask']

# Generate a response
outputs = model.generate(
    inputs,
    attention_mask=attention_mask,
    max_new_tokens=50,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated response (prompt included) and show it
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


User: Hello!
Bot:


In [None]:
# Commit and push everything under the repository root back to GitHub.
# NOTE(review): the training output directory used in this notebook
# ('/content/python_world/brain_ai') lies inside this repository, so
# `git add .` may stage any checkpoints written there -- confirm they are
# ignored or excluded before pushing.
# NOTE(review): the commit and push presumably need a configured git identity
# and credentials in this environment -- verify.
%cd /content/python_world
!git add .
!git commit -m "update"
!git push https://github.com/pierretfie/python_world.git

/content/python_world
