In [None]:
# Step 1: Clean chats avoiding white lines
#

import os
from dotenv import load_dotenv
load_dotenv()

# Clean chats avoiding white lines

# Source (raw export) and destination (cleaned) chat files, read from the environment
input_file_path = os.environ.get("ORIGINAL_CHAT_FILE")
cleaned_file_path = os.environ.get("CLEANED_CHAT_FILE")

# Function to process and concatenate messages
def process_chat(input_path, output_path):
    """
    Collapse a raw chat export into one message per output line.

    A line whose first visible character is "[" (the start of a timestamp)
    begins a new message. Any other non-blank line is treated as a
    continuation of the current message and is appended to it with ". ".
    Blank lines are skipped so they never add empty ". " fragments.

    Args:
        input_path: Path of the UTF-8 chat export to read.
        output_path: Path of the UTF-8 file to write (overwritten).
    """
    # Left-to-right mark that may sit in front of the "[" of a timestamp;
    # str.strip() does not remove it because it is not whitespace.
    direction_mark = '\u200e'

    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        previous_line = None

        for line in infile:
            stripped_line = line.strip()

            # White lines carry no content: ignore them entirely
            if not stripped_line:
                continue

            # Check if the line starts with "[", indicating a timestamp
            # (the mark is only ignored for detection; the line is kept as read)
            if stripped_line.lstrip(direction_mark).startswith("["):
                # Write the previous accumulated message to the output file
                if previous_line:
                    outfile.write(previous_line + "\n")
                previous_line = stripped_line
            elif previous_line:
                # Concatenate non-timestamp lines with the previous line
                previous_line += ". " + stripped_line
            else:
                # Text that appears before the first timestamp starts the buffer
                previous_line = stripped_line

        # Write the last accumulated message to the output file
        if previous_line:
            outfile.write(previous_line + "\n")

# Run the cleaning step and report where the result was written
process_chat(input_path=input_file_path, output_path=cleaned_file_path)
print(f"Processed chat saved to {cleaned_file_path}")

In [None]:
# Step 2: Create one-line conversations for CSV training
#

#import os
import csv
from datetime import datetime, timedelta
#from dotenv import load_dotenv
#load_dotenv()

# Cleaned chat (input of this step) and conversation CSV (output of this step)
cleaned_file_path = os.environ.get("CLEANED_CHAT_FILE")
conversation_file_path = os.environ.get("CONVERSATION_CHAT_FILE")

# A message containing any of these substrings is skipped
ignore_phrases = [
    "audio omitted",
    "image omitted",
    "video note omitted",
    "<This message was edited>",
    "sticker omitted",
    "video omitted",
    "GIF omitted",
    "You deleted this message",
    "document omitted",
]

# Sender whose messages are collected as the "Output" side
source_name = os.environ.get("CLON_NAME")

# Gap, in hours, after which a new conversation starts
conversation_duration = int(os.environ.get("CONVERSATION_DURATION"))

def parse_datetime(timestamp):
    """
    Convert a bracketed chat timestamp into a datetime.

    Surrounding whitespace is stripped, U+200E and U+202F characters are
    removed, a space is inserted in front of the AM/PM marker, and the result
    is parsed as "[day/month/year, 12-hour time AM|PM]".
    """
    # Characters deleted from the timestamp before parsing
    dropped_chars = {ord('\u200e'): None, ord('\u202f'): None}
    text = timestamp.strip().translate(dropped_chars)

    # Add space before AM/PM
    for meridiem in ('PM', 'AM'):
        text = text.replace(meridiem, ' ' + meridiem)

    return datetime.strptime(text, "[%d/%m/%Y, %I:%M:%S %p]")

def remove_quotes(text):
    """
    Return *text* without any leading or trailing double-quote characters.
    """
    quote = '"'
    return text.lstrip(quote).rstrip(quote)

def process_chat(cleaned_file_path, ignore_phrases, source_name, conversation_duration):
    """
    Process the chat log and create one-line conversations for CSV training.

    Each line is expected to look like "[timestamp] name: message". Messages
    from anyone other than source_name are accumulated as the input; messages
    from source_name are accumulated as the output. A pair is emitted when
    another sender speaks after source_name has answered, when the gap between
    two consecutive kept messages exceeds conversation_duration hours, and at
    the end of the file. Pairs are only emitted when both sides are non-empty.

    Args:
        cleaned_file_path: Path of the UTF-8 file with one message per line.
        ignore_phrases: Substrings; a message containing any of them is skipped.
        source_name: Sender name whose messages form the output side.
        conversation_duration: Maximum gap in hours inside one conversation.

    Returns:
        A list of (input_text, output_text) tuples.

    Raises:
        ValueError: If a timestamp cannot be parsed by parse_datetime.
    """
    inputs_outputs = []
    current_input = ""
    current_output = ""
    last_timestamp = None
    max_gap = timedelta(hours=conversation_duration)

    with open(cleaned_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            timestamp, bracket, rest = line.partition("]")
            if not bracket:
                continue  # No timestamp delimiter on this line

            name, separator, message = rest.partition(":")
            if not separator:
                # A bracketed line without "name: message" cannot be attributed
                # to a sender; skip it instead of failing on the unpacking.
                continue

            name = name.strip()
            message = message.strip()

            # Check if the message contains any ignore phrases
            if any(phrase in message for phrase in ignore_phrases):
                continue  # Skip this message

            current_time = parse_datetime(timestamp + "]")

            # Check if more than conversation_duration hours have passed
            gap_exceeded = last_timestamp is not None and current_time - last_timestamp > max_gap
            last_timestamp = current_time

            # Another sender speaking after source_name has answered closes the exchange
            new_exchange = name != source_name and bool(current_output)

            if gap_exceeded or new_exchange:
                if current_input and current_output:  # Only save if both are filled
                    inputs_outputs.append((current_input.strip(), current_output.strip()))
                current_input = ""
                current_output = ""

            if name == source_name:
                # Accumulate source_name's messages
                current_output += message + ". "
            else:
                # Accumulate input
                current_input += message + ". "

        # Save any remaining input/output if not empty
        if current_input and current_output:  # Only save if both are filled
            inputs_outputs.append((current_input.strip(), current_output.strip()))

    return inputs_outputs

def write_to_csv(conversation_file_path, inputs_outputs):
    """
    Save the (input, output) pairs as a two-column CSV file.

    The file starts with an "Input,Output" header. Pairs with an empty side
    are left out, and each written value first goes through remove_quotes.
    """
    header = ['Input', 'Output']

    with open(conversation_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        out = csv.writer(csvfile)
        out.writerow(header)
        out.writerows(
            [remove_quotes(question), remove_quotes(answer)]
            for question, answer in inputs_outputs
            if question and answer  # Ensure both input and output have content
        )

# Build the conversation pairs from the cleaned chat and store them as CSV
inputs_outputs = process_chat(
    cleaned_file_path=cleaned_file_path,
    ignore_phrases=ignore_phrases,
    source_name=source_name,
    conversation_duration=conversation_duration,
)
write_to_csv(conversation_file_path=conversation_file_path, inputs_outputs=inputs_outputs)
print(f"Training data saved to {conversation_file_path}")

In [None]:
# Step 3: Load the dataset
#

import os
import numpy as np
import pandas as pd
from datasets import Dataset
from dotenv import load_dotenv
load_dotenv()

# Location of the conversation CSV
conversation_file_path = os.environ.get("CONVERSATION_CHAT_FILE")

# Read the CSV with pandas and force the expected column names
df = pd.read_csv(conversation_file_path)
df.columns = ['Input', 'Output']

# Turn +/-inf into NaN and drop every row that contains a NaN,
# then drop rows whose Input or Output is an empty string
df = df.replace([np.inf, -np.inf], np.nan).dropna()
df = df[(df['Input'] != '') & (df['Output'] != '')]

# Optionally limit to 1000 records for testing
#df = df[:1000]

# Wrap the DataFrame in a Hugging Face dataset and split it 80% train / 20% test
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2)
print(f"Total records after processing: {len(df)}")

In [None]:
# Step 4: Configure training
#

#import os
import numpy as np
import torch
from torch.optim import AdamW
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForSeq2Seq,
    get_polynomial_decay_schedule_with_warmup
)
#from dotenv import load_dotenv
#load_dotenv()   

# Settings read from the environment
HF_TOKEN = os.environ.get("HF_TOKEN")
OUTPUT_MODEL_DIR = os.environ.get("OUTPUT_MODEL_DIR")

# Base checkpoint and tokenized sequence length
MODEL_NAME = "vgaraujov/t5-base-spanish"
MAX_LENGTH = 128

# Optimization hyperparameters
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.001
WARMUP_STEPS = 500
BATCH_SIZE = 2
GRADIENT_ACCUMULATION_STEPS = 4
NUM_EPOCHS = 6
FP16 = False

# Logging and checkpointing
LOGGING_STEPS = 100
SAVE_STEPS = 1000
SAVE_TOTAL_LIMIT = 2
REPORT_TO = "tensorboard"

# Fetch the pretrained tokenizer and model, authenticating with HF_TOKEN
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_TOKEN)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, use_auth_token=HF_TOKEN)

# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """
    Tokenize the input and output text.

    Both 'Input' and 'Output' are truncated and padded to MAX_LENGTH. In the
    labels, every padded position is replaced by -100 so that padding does not
    contribute to the training loss. Real tokens (including the final
    end-of-sequence token) are kept even though the padding token is set to
    the EOS token, because padding is identified through the attention mask
    rather than by token id.

    Args:
        examples: Mapping with 'Input' and 'Output' entries, either single
            strings or lists of strings (batched mode).

    Returns:
        The tokenizer output for the inputs with an added 'labels' entry.
    """
    model_inputs = tokenizer(
        examples['Input'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH
    )

    # Tokenize the targets on their own to also obtain their attention mask
    targets = tokenizer(
        text_target=examples['Output'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH
    )

    def mask_padding(token_ids, attention_mask):
        # -100 is the label value ignored by the loss
        return [token if keep else -100 for token, keep in zip(token_ids, attention_mask)]

    if isinstance(examples['Output'], str):
        # Single example: flat lists of ids
        model_inputs['labels'] = mask_padding(targets['input_ids'], targets['attention_mask'])
    else:
        # Batched call: one list of ids per example
        model_inputs['labels'] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(targets['input_ids'], targets['attention_mask'])
        ]

    return model_inputs

def log_filter_results(dataset, dataset_name):
    """
    Remove examples whose 'input_ids' or 'labels' contain NaN or inf values
    and print how many examples were removed.

    Returns the filtered dataset.
    """
    def is_clean(example):
        # Same check order as a chained "no NaN ... and no inf ..." expression
        for has_bad_value in (np.isnan, np.isinf):
            for column in ('input_ids', 'labels'):
                if np.any(has_bad_value(example[column])):
                    return False
        return True

    size_before = len(dataset)
    filtered_dataset = dataset.filter(is_clean)
    size_after = len(filtered_dataset)

    print(f"{dataset_name} - Filtered out {size_before - size_after} elements (from {size_before} total).")
    return filtered_dataset

# Tokenize both splits in batched mode (train first, then test)
train_dataset = dataset['train'].map(tokenize_function, batched=True)
eval_dataset = dataset['test'].map(tokenize_function, batched=True)

# Filter each split and print how many examples were dropped
train_dataset = log_filter_results(train_dataset, "Training Dataset")
eval_dataset = log_filter_results(eval_dataset, "Evaluation Dataset")

# Return only the model columns, as PyTorch tensors
torch_columns = ['input_ids', 'attention_mask', 'labels']
for split_dataset in (train_dataset, eval_dataset):
    split_dataset.set_format(type='torch', columns=torch_columns)

# Seq2seq collator used to assemble the batches
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training configuration built from the constants defined in this step
training_args = TrainingArguments(
    # Output, logging and checkpoints
    output_dir=OUTPUT_MODEL_DIR,
    logging_dir=os.path.join(OUTPUT_MODEL_DIR, 'logs'),
    logging_steps=LOGGING_STEPS,
    report_to=REPORT_TO,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    # Evaluation
    evaluation_strategy="steps",
    per_device_eval_batch_size=BATCH_SIZE,
    # Optimization
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=WARMUP_STEPS,
    lr_scheduler_type="polynomial",  # Use polynomial scheduler
    fp16=FP16,
)

print("Train preparation completed.")



In [None]:
# Step 5: Setup and train the model
#

#import os
from torch.optim import AdamW

#from dotenv import load_dotenv
#load_dotenv()  

def setup_optimizer_and_scheduler(model, training_args, train_dataset):
    """
    Setup the optimizer and polynomial decay scheduler for training.

    The number of scheduler steps is estimated as the number of optimizer
    updates: batches per epoch divided by gradient_accumulation_steps, times
    the number of epochs. Counting raw batches instead would stretch the decay
    over more steps than are actually taken, so the learning rate would never
    approach lr_end.

    Args:
        model: Model whose parameters are optimized.
        training_args: Object providing learning_rate, weight_decay,
            per_device_train_batch_size, gradient_accumulation_steps,
            num_train_epochs and warmup_steps.
        train_dataset: Sized training dataset.

    Returns:
        A tuple (optimizer, scheduler).
    """
    import math  # Local import: math is not imported at the top of this file

    optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)

    # Batches produced per epoch (the last, possibly smaller, batch included)
    batches_per_epoch = math.ceil(len(train_dataset) / training_args.per_device_train_batch_size)
    # One optimizer/scheduler update happens every gradient_accumulation_steps batches
    updates_per_epoch = max(math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps), 1)
    total_training_steps = max(math.ceil(updates_per_epoch * training_args.num_train_epochs), 1)

    # Use polynomial decay instead of linear decay
    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=total_training_steps,
        lr_end=1e-7,
        power=2.0
    )

    return optimizer, scheduler

def setup_trainer(model, training_args, train_dataset, eval_dataset, data_collator, optimizer, scheduler):
    """
    Build a Trainer from the given model, arguments, datasets and collator,
    handing it the optimizer and scheduler as a pair.
    """
    trainer_kwargs = {
        "model": model,
        "args": training_args,
        "train_dataset": train_dataset,
        "eval_dataset": eval_dataset,
        "data_collator": data_collator,
        "optimizers": (optimizer, scheduler),
    }
    return Trainer(**trainer_kwargs)

def train_and_evaluate(trainer):
    """
    Run training followed by evaluation on the given trainer.

    Each phase is wrapped separately: an exception is printed and the function
    carries on, so evaluation is attempted even when training failed.
    """
    def run_training():
        try:
            print("Starting training...")
            trainer.train()
            print("Training completed.")
        except Exception as e:
            print(f"Error during training: {e}")

    def run_evaluation():
        try:
            print("Starting evaluation...")
            eval_results = trainer.evaluate()
            print(f"Evaluation results: {eval_results}")
        except Exception as e:
            print(f"Error during evaluation: {e}")

    # Put the model on the GPU when one is available
    gpu_available = torch.cuda.is_available()
    if gpu_available:
        trainer.model.to('cuda')

    run_training()
    run_evaluation()

def save_model_and_tokenizer(trainer, tokenizer, OUTPUT_MODEL_DIR):
    """
    Write the trained model and then the tokenizer to OUTPUT_MODEL_DIR.

    Any exception is printed instead of propagated; a failure while saving the
    model means the tokenizer is not saved either.
    """
    save_steps = (
        (trainer, "save_model", "Model saved."),
        (tokenizer, "save_pretrained", "Tokenizer saved."),
    )
    try:
        for owner, method_name, confirmation in save_steps:
            getattr(owner, method_name)(OUTPUT_MODEL_DIR)
            print(confirmation)
    except Exception as e:
        print(f"Error during saving model/tokenizer: {e}")

# Build the optimizer/scheduler pair
optimizer, scheduler = setup_optimizer_and_scheduler(
    model=model,
    training_args=training_args,
    train_dataset=train_dataset,
)
# Wire everything into a Trainer
trainer = setup_trainer(
    model=model,
    training_args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    optimizer=optimizer,
    scheduler=scheduler,
)
# Run training and evaluation
train_and_evaluate(trainer=trainer)
# Persist the model and tokenizer
save_model_and_tokenizer(trainer=trainer, tokenizer=tokenizer, OUTPUT_MODEL_DIR=OUTPUT_MODEL_DIR)


In [None]:
# Step 6: Test text generation
#

import os
#from dotenv import load_dotenv
#load_dotenv()

# Directory that holds the trained model, taken from the environment
model_dir = os.environ.get("OUTPUT_MODEL_DIR")  # Adjust as needed

# Restore the tokenizer and the model from that directory
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# Fall back to the EOS token when the tokenizer defines no padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Run on the GPU when one is available
gpu_available = torch.cuda.is_available()
if gpu_available:
    model.to('cuda')

# Function to generate text
def generate_text(prompt, max_length=200, num_return_sequences=1, temperature=0.9, top_k=50, top_p=0.95):
    """
    Sample text from the loaded model for the given prompt.

    Args:
        prompt: Non-empty string used as the model input.
        max_length: Maximum length passed to model.generate.
        num_return_sequences: Number of sequences to sample.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus sampling cutoff.

    Returns:
        A list with the decoded, stripped generated texts. An empty list is
        returned when the prompt is invalid or a RuntimeError/ValueError is
        raised during generation (the error is printed).
    """
    try:
        if not prompt or not isinstance(prompt, str):
            raise ValueError("The input prompt is not valid.")

        # Tokenize the input prompt with appropriate padding and place it on
        # the model's own device: the model is only moved to CUDA when it is
        # available, so a hard-coded 'cuda' target fails on CPU-only hosts.
        input_ids = tokenizer.encode(prompt, return_tensors='pt', padding='longest').to(model.device)

        # Generate text
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True  # Enable sampling for more diverse outputs
        )

        # Decode the generated text, ensuring special tokens like <pad> are skipped
        generated_texts = [
            tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True).strip()
            for output in outputs
        ]

        return generated_texts
    except (RuntimeError, ValueError) as e:
        print(f"Error during text generation: {e}")
        return []

# Test the text generation with a prompt
prompt = "Test Prompt"
generated_texts = generate_text(prompt, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.95)

# Read the speaker name once, outside the f-string: reusing double quotes
# inside a double-quoted f-string is a syntax error before Python 3.12.
clon_name = os.getenv("CLON_NAME")
for i, text in enumerate(generated_texts, start=1):
    print(f"{clon_name}: {i}: {text}")
