In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Training examples, written as (question, answer) pairs so each answer sits
# next to its question; they are split back into the 'question' and 'answer'
# columns when the dataset is built.
train_pairs = [
    (
        "What is Machine Learning?",
        "Machine learning is a subset of artificial intelligence that enables systems to learn from data and improve over time without being explicitly programmed.",
    ),
    (
        "How does Power BI help in data analysis?",
        "Power BI helps in data analysis by allowing users to connect, analyze, and visualize data from multiple sources to gain insights and make informed decisions.",
    ),
    (
        "What is the difference between supervised and unsupervised learning?",
        "Supervised learning is a type of machine learning where the model is trained on labeled data, while unsupervised learning works with unlabeled data and aims to find patterns or structures.",
    ),
    (
        "What are the key features of Python?",
        "Python is a versatile programming language known for its simplicity, readability, and extensive libraries, making it suitable for web development, data analysis, and machine learning.",
    ),
    (
        "Explain the importance of data preprocessing in machine learning?",
        "Data preprocessing involves cleaning, transforming, and organizing raw data to improve the accuracy and efficiency of machine learning models.",
    ),
    (
        "How can we measure model performance in machine learning?",
        "Model performance in machine learning can be measured using metrics like accuracy, precision, recall, F1-score, and AUC, depending on the type of model and task.",
    ),
    (
        "What is a neural network?",
        "A neural network is a computational model inspired by the human brain, consisting of layers of nodes (neurons) that process and learn from data to make predictions or classifications.",
    ),
    (
        "How does deep learning differ from machine learning?",
        "Deep learning is a subset of machine learning that uses neural networks with many layers (deep networks) to analyze large and complex datasets, often used for image and speech recognition.",
    ),
    (
        "What is Natural Language Processing (NLP)?",
        "Natural Language Processing (NLP) is a field of AI that focuses on enabling computers to understand, interpret, and generate human language in a way that is both meaningful and useful.",
    ),
    (
        "What is the role of a Data Scientist in machine learning?",
        "A Data Scientist in machine learning is responsible for collecting, analyzing, and interpreting large datasets, building models, and deriving actionable insights for business decision-making.",
    ),
]
train_data = Dataset.from_dict({
    'question': [question for question, _ in train_pairs],
    'answer': [answer for _, answer in train_pairs],
})

# Held-out evaluation examples, kept as (question, answer) pairs and split
# into the 'question' and 'answer' columns when the dataset is built.
eval_pairs = [
    (
        "What are the key features of Power BI?",
        "Power BI features include data visualization, business intelligence dashboards, real-time data streaming, integration with multiple data sources, and sharing reports.",
    ),
    (
        "How do Data Engineers use Python?",
        "Data Engineers use Python to automate data pipelines, process large datasets, and write code to clean, transform, and load data into databases or data lakes.",
    ),
    (
        "What is overfitting in machine learning?",
        "Overfitting occurs when a machine learning model learns not only the underlying pattern in the training data but also the noise, leading to poor performance on new, unseen data.",
    ),
    (
        "What are decision trees in machine learning?",
        "Decision trees are a type of model used in machine learning that splits data into smaller and smaller subsets based on feature values to make predictions or classifications.",
    ),
    (
        "How does reinforcement learning work?",
        "Reinforcement learning involves training models to make decisions by rewarding them for correct actions and punishing them for incorrect ones, based on their interaction with the environment.",
    ),
    (
        "What is the role of the activation function in a neural network?",
        "The activation function in a neural network determines whether a neuron should be activated or not, introducing non-linearity and allowing the network to learn complex patterns.",
    ),
    (
        "What is K-means clustering?",
        "K-means clustering is an unsupervised learning algorithm used to partition data into clusters, with each cluster having its center (mean) based on the features of the data.",
    ),
    (
        "What is a confusion matrix in machine learning?",
        "A confusion matrix is a performance measurement tool for classification models that shows the actual versus predicted classifications, helping to identify false positives and negatives.",
    ),
    (
        "What is data normalization?",
        "Data normalization is the process of scaling data into a specific range to ensure consistency and to help machine learning models perform better, especially those sensitive to feature magnitude.",
    ),
    (
        "What are the steps in a typical machine learning pipeline?",
        "A typical machine learning pipeline involves data collection, preprocessing, model training, evaluation, and deployment.",
    ),
]
eval_data = Dataset.from_dict({
    'question': [question for question, _ in eval_pairs],
    'answer': [answer for _, answer in eval_pairs],
})

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
checkpoint = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Tokenization function with padding and truncation
def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq training.

    Args:
        examples: Batch mapping with 'question' and 'answer' entries, each a
            list of strings (this function is passed to `Dataset.map` with
            `batched=True`).

    Returns:
        The tokenizer output for the questions, plus a 'labels' entry holding
        the tokenized answers with every padding position replaced by -100.
    """
    inputs = tokenizer(examples['question'], padding=True, truncation=True, max_length=512)
    targets = tokenizer(examples['answer'], padding=True, truncation=True, max_length=512)

    # Padding is kept so all label rows in the batch have the same length, but
    # the pad ids themselves must not be scored: -100 is the index the model's
    # loss ignores. Leaving pad ids in place trains the model to emit padding
    # and inflates/distorts the reported loss.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets['input_ids']
    ]
    return inputs

# Apply the tokenizer to both splits in batched mode (train first, then eval)
train_data, eval_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, eval_data)
)

# Sanity check: list the columns each split ended up with
for heading, split in (("Train columns:", train_data), ("Eval columns:", eval_data)):
    print(heading, split.column_names)

# Define training arguments
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version before renaming.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints and logs are written
    evaluation_strategy="epoch",     # run evaluation at the end of every epoch
    logging_strategy="epoch",        # log training loss every epoch; the default
                                     # step-based interval is never reached in a
                                     # run this short, so the loss column would
                                     # otherwise show "No log"
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
)

# Build the Trainer from the model, its training arguments, the two tokenized
# splits and the tokenizer used to encode them.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory.
# The tokenizer save stays last so a notebook still echoes its return value.
save_dir = "./trained_t5_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Train columns: ['question', 'answer', 'input_ids', 'attention_mask', 'labels']
Eval columns: ['question', 'answer', 'input_ids', 'attention_mask', 'labels']


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,5.848193
2,No log,5.781467
3,No log,5.75457


('./trained_t5_model\\tokenizer_config.json',
 './trained_t5_model\\special_tokens_map.json',
 './trained_t5_model\\spiece.model',
 './trained_t5_model\\added_tokens.json')