In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the data
# The CSV is ';'-separated and its first row is a header; header=0 replaces that
# row with the fixed column names below instead of reading it as a data row.
# dtype=str keeps both columns as text so the .str accessors below always work.
file_path = 'Sample_Training_Data.csv'
df = pd.read_csv(file_path, sep=';', header=0, names=['Input', 'Value'], dtype=str)

# Clean the columns that are actually used for training: strip surrounding
# whitespace (generate_response strips its input too, so train and inference
# see the same text) and drop rows where either side is missing or empty,
# because the tokenization step cannot handle NaN/None entries.
df['Input'] = df['Input'].str.strip()
df['Value'] = df['Value'].str.strip()
df = df.dropna(subset=['Input', 'Value'])
df = df[(df['Input'] != '') & (df['Value'] != '')].reset_index(drop=True)

# Combined "input ; value" sequence (not consumed by the training code in this
# cell; kept so any later cell relying on the column still works)
df['text'] = df['Input'] + " ; " + df['Value']

# Split the data into training and validation sets (80% train, 20% validation).
# A fixed random_state makes the split identical on every Restart & Run All.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets; preserve_index=False keeps the shuffled pandas index from
# being added as an extra '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df[['Input', 'Value']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['Input', 'Value']], preserve_index=False)

# Load pre-trained T5 tokenizer
model_name = "t5-base"  # alternatives: "t5-small" (faster) or "t5-large" (bigger model)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of Input/Value pairs for T5 sequence-to-sequence training.

    Args:
        examples: Batch mapping with an "Input" list (source texts) and a
            "Value" list (target texts) of equal length.

    Returns:
        The tokenizer output for the prefixed inputs, with an added "labels"
        entry holding the target token ids. Padding positions in the labels
        are replaced by -100 so the loss ignores them.
    """
    prompts = ["translate Input to Output: " + inp for inp in examples["Input"]]
    targets = list(examples["Value"])

    # Both sides are truncated/padded to a fixed length of 250 tokens.
    model_inputs = tokenizer(prompts, max_length=250, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=250, truncation=True, padding="max_length")

    # Padded label positions must not contribute to the loss: -100 is the
    # index the cross-entropy loss skips. Leaving the pad id in place trains
    # the model to emit padding and dilutes the signal from the real tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the datasets
# Drop every original column (not just 'Input'/'Value') so the tokenized sets
# contain only model inputs, even if the source dataset carries extra columns
# such as a preserved pandas index.
train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
val_tokenized = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
)

# Load pre-trained T5 model (same checkpoint name as the tokenizer)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Define training arguments
# Evaluation and checkpointing both happen once per epoch. A step-based
# `save_steps` value has no effect under save_strategy="epoch", so it is not set.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",                       # Evaluate on the validation set after every epoch
    save_strategy="epoch",                       # Checkpoint after every epoch (must match eval_strategy for best-model loading)
    save_total_limit=2,                          # Keep only the most recent checkpoints (plus the best) instead of one per epoch
    load_best_model_at_end=True,                 # After training, restore the checkpoint with the best validation loss
    metric_for_best_model="eval_loss",
    greater_is_better=False,                     # Lower validation loss is better
    learning_rate=3e-4,
    per_device_train_batch_size=8,               # Larger batch size if memory allows
    per_device_eval_batch_size=8,
    num_train_epochs=30,                         # More epochs for better convergence
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Wire the model, the training arguments and both tokenized splits into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in one directory
output_model_dir = './trained_model_t5_V2'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_model_dir)

# Example function to generate a response based on user input
def generate_response(input_text):
    """Generate the model's output for a single input string.

    Args:
        input_text: Raw input text; surrounding whitespace is stripped and the
            same task prefix used during training is prepended.

    Returns:
        The decoded generation (special tokens removed) as a string.
    """
    prompt = "translate Input to Output: " + input_text.strip()
    encoded = tokenizer(prompt, return_tensors="pt", max_length=250, truncation=True)  # Max length = 250

    # Training may have moved the model to an accelerator; the inputs must live
    # on the same device or generate() fails with a device mismatch.
    encoded = encoded.to(model.device)

    # Switch off dropout so generation is deterministic for a given input.
    model.eval()
    outputs = model.generate(**encoded, max_length=150, num_beams=2, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke-test the fine-tuned model on one hand-written rule
# NOTE(review): "EInschluss" has a capital "I" — presumably a typo for
# "Einschluss"; confirm before relying on this result.
test_input = "EInschluss: Alter zw. 26 und 80"
response = generate_response(test_input)
print(f"Generated Output: {response}")


In [None]:
# Test the token length of the longest input-output pair
longest_example = "Einschluss: Produkt Mitgliedschaft mit Pannenhilfe, Motiv unbezahlt, Kündigungsdatum : 01.06.2022 – 31.03.2023;isIncludedWithDate(listProducts_Benificiary_latestCancellationReasons, '101-breakdownVariant-MOT','2022-06-01') and isIncludedWithDate(listProducts_Benificiary_latestCancellationReasons, '101-cancellationReason-motif54,101-cancellationReason-motif16','2022-06-01') and !isIncludedWithDate(listProducts_Benificiary_latestCancellationReasons, '101-breakdownVariant-MOT','2023-03-31')"

# Tokenize and print the token length of the combined "input;output" string
tokenized_example = tokenizer(longest_example, return_tensors="pt")
print(f"Token length of the longest example: {len(tokenized_example['input_ids'][0])}")

# Training tokenizes the prefixed input and the target separately, each capped
# at 250 tokens, and generation is capped at 150 tokens. Measure both sides on
# their own so truncation on either side is visible.
example_input, example_output = longest_example.split(";", 1)
prompt_token_count = len(tokenizer("translate Input to Output: " + example_input.strip())["input_ids"])
target_token_count = len(tokenizer(example_output.strip())["input_ids"])
print(f"Prompt token length (with task prefix): {prompt_token_count}")
print(f"Target token length: {target_token_count}")
