In [None]:
! pip install transformers torch sentencepiece accelerate protobuf

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the quantized coordinate table and attach a text column that holds each
# row as "<y_quant> <x_quant>" (y first, then x, separated by one space).
file_path = 'quantized_coordinates.csv'
coordinates_df = pd.read_csv(file_path).assign(
    sequence=lambda frame: frame['y_quant'].astype(str) + ' ' + frame['x_quant'].astype(str)
)

# Prepare input-output pairs for training
def prepare_data(df, input_len=5):
    """Build sliding-window (input, target) text pairs from ``df['sequence']``.

    For every start row ``i`` the input is rows ``i .. i+input_len-1`` joined
    with single spaces, and the target is the same window extended by one row
    (``i .. i+input_len``). The target therefore repeats the input and appends
    the next entry.

    Args:
        df: DataFrame with a string column named ``'sequence'``.
        input_len: Number of consecutive rows in each input window.

    Returns:
        A tuple ``(input_sequences, output_sequences)`` of two equally long
        lists of strings. Both are empty when ``len(df) <= input_len``.
    """
    tokens = df['sequence']
    # One window per start offset that still leaves a following row for the target.
    starts = range(len(df) - input_len)
    input_sequences = [' '.join(tokens.iloc[start:start + input_len]) for start in starts]
    output_sequences = [' '.join(tokens.iloc[start:start + input_len + 1]) for start in starts]
    return input_sequences, output_sequences

input_seqs, output_seqs = prepare_data(coordinates_df)

# Split the data into train and validation sets (80% train, 20% validation).
# prepare_data emits stride-1 windows, so neighbouring examples share all but one
# row. A shuffled split would place near-duplicates of every validation example
# in the training set and make the eval loss look better than it is. Keeping the
# original order holds out the final 20% of windows instead.
train_inputs, val_inputs, train_outputs, val_outputs = train_test_split(
    input_seqs, output_seqs, test_size=0.2, shuffle=False
)

# Custom Dataset Class
class CoordinateDataset(Dataset):
    """Tokenized (input, target) text pairs for seq2seq fine-tuning.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    Every value is a flattened tensor produced by tokenizing one string with
    ``padding='max_length'`` and ``truncation=True`` at ``max_len`` tokens.

    Padding positions in ``labels`` are replaced with ``IGNORE_INDEX`` so the
    loss is computed only on real target tokens, not on padding.

    Args:
        inputs: Sequence of source strings.
        outputs: Sequence of target strings, aligned with ``inputs`` by index.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and returning an
            object with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Pad/truncate length applied to both source and target.
        device: Optional device to move the item tensors to. ``None`` leaves
            them where the tokenizer created them.
    """

    # Label value that PyTorch's cross-entropy loss skips by default.
    IGNORE_INDEX = -100

    def __init__(self, inputs, outputs, tokenizer, max_len, device=None):
        self.inputs = inputs
        self.outputs = outputs
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.device = device

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one string, padded/truncated to ``max_len``, as "pt" tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors."""
        source = self._encode(self.inputs[idx])
        target = self._encode(self.outputs[idx])

        labels = target.input_ids.flatten()
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            # Without this the model is also trained to predict the padding tokens.
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        item = {
            'input_ids': source.input_ids.flatten(),
            'attention_mask': source.attention_mask.flatten(),
            'labels': labels,
        }
        if self.device is not None:
            item = {key: tensor.to(self.device) for key, tensor in item.items()}
        return item

# --- Model and tokenizer ---
model_name = "t5-base"  # 't5-small' is the faster alternative
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# --- Device: use MPS when PyTorch reports it as available, otherwise CPU ---
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

# --- Dataset parameters ---
MAX_LEN = 50    # short pad/truncate length keeps training fast
BATCH_SIZE = 2  # small per-device batch so it fits in memory

# --- Train / eval datasets (arguments: inputs, outputs, tokenizer, max_len, device) ---
train_dataset = CoordinateDataset(train_inputs, train_outputs, tokenizer, MAX_LEN, device)
eval_dataset = CoordinateDataset(val_inputs, val_outputs, tokenizer, MAX_LEN, device)

# --- Training arguments (no FP16 option is set) ---
# NOTE(review): recent transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # schedule: few epochs to start with
    num_train_epochs=5,
    # batching: 2 per step x 4 accumulated steps simulates a batch of 8
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    # cadence: log every 10 steps, evaluate every 50, checkpoint every 500
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=500,
)

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# --- Run training ---
trainer.train()




In [1]:

# Inference: Generate a sequence given the first few points
def generate_sequence(model, tokenizer, input_sequence, max_length=150):
    """Generate a continuation for ``input_sequence`` and return it as text.

    Args:
        model: Seq2seq model exposing ``device``, ``training``, ``eval()``,
            ``train(mode)`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        input_sequence: Prompt string, e.g. space-separated coordinates.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens removed.
    """
    # Place the inputs on the model's own device instead of a notebook-global
    # `device`, so this also works after the model is reloaded elsewhere.
    inputs = tokenizer(input_sequence, return_tensors="pt").to(model.device)

    # Decode in eval mode so dropout cannot perturb the result (the model may
    # still be in train mode right after training), then restore the mode.
    was_training = model.training
    model.eval()
    try:
        # Unpack the whole encoding so the attention mask is passed with the ids.
        output = model.generate(**inputs, max_length=max_length)
    finally:
        model.train(was_training)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [2]:
# Example prediction: feed a short prompt through generate_sequence and show both sides
input_sequence = "80 118 79 123 88 127"
predicted_sequence = generate_sequence(model, tokenizer, input_sequence)
print(
    f"Input: {input_sequence}",
    f"Predicted Sequence: {predicted_sequence}",
    sep="\n",
)

NameError: name 'model' is not defined

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Restore the tokenizer and the model from the same saved checkpoint directory
checkpoint_dir = 'results/checkpoint-730'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

# Put the model in evaluation mode before generating
model.eval()

# Example prediction
input_sequence = '80 118 79 123 88 127'
inputs = tokenizer(input_sequence, return_tensors='pt')

# Decoding options: up to 200 tokens, beam search over 5 beams, and a 1.2
# repetition penalty
generation_options = {
    'max_length': 200,
    'num_beams': 5,
    'repetition_penalty': 1.2,
}
outputs = model.generate(**inputs, **generation_options)

predicted_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(
    f"Input: {input_sequence}",
    f"Predicted Sequence: {predicted_sequence}",
    sep="\n",
)

Input: 80 118 79 123 88 127
Predicted Sequence: 80 118 79 123 88 127 88 127


In [2]:
import torch
# Report MPS status, one value per line:
#   1) is_available() — should be True on an M1 Mac
#   2) is_built()     — should be True if PyTorch was built with MPS support
print(
    torch.backends.mps.is_available(),
    torch.backends.mps.is_built(),
    sep="\n",
)


True
True


In [3]:
# NOTE(review): prints the literal "hello"; looks like a leftover kernel smoke
# test — consider removing it from the finished notebook.
print("hello")

hello
