In [1]:
import math
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from accelerate import Accelerator

from preprocessor import load_and_preprocess, decoding, process_data
from qwen import load_qwen

import numpy as np

import matplotlib.pyplot as plt

from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR
from transformers import get_cosine_schedule_with_warmup
from preprocessor import get_dataset

import wandb
import joblib

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

import gc

  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [3]:
import torch
torch.cuda.empty_cache()

In [4]:
#for matplotlib plots
SMALL_SIZE = 15+5
MEDIUM_SIZE = 20+5
BIGGER_SIZE = 25+5

plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=SMALL_SIZE)
plt.rc('axes', labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=SMALL_SIZE)
plt.rc('ytick', labelsize=SMALL_SIZE)
plt.rc('legend', fontsize=SMALL_SIZE)
plt.rc('figure', titlesize=BIGGER_SIZE)

In [5]:
best_overall_params = joblib.load("best_overall_params.joblib")

lora_rank = best_overall_params['lora_rank']
lora_alpha = 2*lora_rank
batch_size = 4
learning_rate = best_overall_params['learning_rate']
test_size = 0.2
max_steps = 200
max_ctx_length = best_overall_params['max_ctx_length']
points = 80

schedulers = ['StepLR', 'CosineAnnealingLR', 'CosineScheduleWithWarmup']

schedule_choice = schedulers[2]

if schedule_choice == 'StepLR':
    step_size = 100
    gamma = 0.1
elif schedule_choice == 'CosineAnnealingLR':
    T_max = max_steps
elif schedule_choice == 'CosineScheduleWithWarmup':
    warmup_steps = int(0.075*max_steps)
    num_training_steps = max_steps

In [6]:
class LoRALinear(nn.Module):
    def __init__(self, original_linear: nn.Linear, r: int, alpha: int = None):
        super().__init__()
        assert isinstance(original_linear, nn.Linear)
        self.original_linear = original_linear
        self.original_linear.weight.requires_grad = False
        if self.original_linear.bias is not None:
            self.original_linear.bias.requires_grad = False
        in_dim = original_linear.in_features
        out_dim = original_linear.out_features
        self.r = r
        self.alpha = alpha if alpha else r

        device = original_linear.weight.device
        self.A = nn.Parameter(torch.empty(r, in_dim, device=device))
        self.B = nn.Parameter(torch.zeros(out_dim, r, device=device))
        
        # Initialise A with He initialization
        nn.init.kaiming_normal_(self.A, nonlinearity="linear")

    def forward(self, x):
        base_out = self.original_linear(x)
        lora_out = (x @ self.A.T) @ self.B.T
        return base_out + lora_out * (self.alpha / self.r)

In [7]:
model, tokenizer = load_qwen()

In [8]:
# Process the data into sequences of text
train_texts, val_texts, test_texts = load_and_preprocess("lotka_volterra_data.h5", test_size=test_size)

# ^Each of these is a `list[str]` representing contiguous parts of the time series,
#  in text form (using the LLMTIME scheme).

# Modified tokenization with chunking
def process_sequences(texts, tokenizer, max_length=512, stride=256):
    all_input_ids = []
    for text in texts:
        # Apply Qwen's tokenization scheme to the text:
        encoding = tokenizer(text, return_tensors="pt", add_special_tokens=False, padding_side='left')
        seq_ids = encoding.input_ids[0]

        # Create sliding windows to further divide the data into chunks:
        for i in range(0, len(seq_ids), stride):
            chunk = seq_ids[i : i + max_length]
            if len(chunk) < max_length:
                chunk = torch.cat(
                    [
                        torch.full((max_length - len(chunk),), tokenizer.pad_token_id),
                        chunk,
                    ]
                )
            all_input_ids.append(chunk)
    return torch.stack(all_input_ids)


def process_data(texts, tokenizer, points=80):
    given_input_ids = []
    for text in texts:
        given_text = ';'.join([chunk for i, chunk in enumerate(text.split(';')) if i < points])
        encoding_given = tokenizer(given_text, return_tensors="pt", padding='max_length', padding_side='left', max_length=1200)
        given_input_ids.append(encoding_given.input_ids[0])
    return np.stack([text for text in texts]), torch.stack(given_input_ids)

def running_mse(prediction, actual):
    mse = []
    for i in range(len(prediction)):
        mse.append(mean_squared_error(prediction[:i+1], actual[:i+1]))
    return mse

def evaluate_model(model, val_loader, step, max_batches=None):
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch_idx, (batch,) in enumerate(tqdm(val_loader, desc="val set")):
            # Exit loop after processing max_batches
            if max_batches is not None and batch_idx >= max_batches:
                break
            outputs = model(batch, labels=batch)
            loss = outputs.loss
            total_loss += loss.item()
            del outputs
            del loss
    
    # Calculate metrics - divide by actual number of batches processed
    num_batches = min(len(val_loader), max_batches) if max_batches is not None else len(val_loader)
    avg_loss = total_loss / num_batches

    print(f'Loss on validation subset ({num_batches}/{len(val_loader)} batches) at step {step}: {avg_loss:.4f}')
    return avg_loss

# Defines the maximum context length for the model
train_input_ids = process_sequences(
    train_texts, tokenizer, max_ctx_length, stride=max_ctx_length // 2
)
val_input_ids = process_sequences(
    val_texts, tokenizer, max_ctx_length, stride=max_ctx_length
)
test_texts_all, test_input_ids_some = process_data(
    test_texts, tokenizer, points=points
)

In [9]:

train_dataset = TensorDataset(train_input_ids)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

val_dataset = TensorDataset(val_input_ids)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)

test_dataset = TensorDataset(test_input_ids_some)
test_loader = DataLoader(test_dataset, shuffle=False, pin_memory=True)

In [10]:
# Dictionary to store results
grid_results = {}

print(f"\n{'='*50}")
print(f"Training with max_ctx_length={max_ctx_length}")
print(f"{'='*50}\n")

# Apply LoRA with current rank
for layer in model.model.layers:
    layer.self_attn.q_proj = LoRALinear(layer.self_attn.q_proj, r=lora_rank, alpha=lora_alpha)
    layer.self_attn.v_proj = LoRALinear(layer.self_attn.v_proj, r=lora_rank, alpha=2*lora_alpha)

# Create optimizer with current learning rate
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), 
    lr=learning_rate,
)

# Create scheduler
if schedule_choice == 'StepLR':
    scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)
elif schedule_choice == 'CosineAnnealingLR':
    scheduler = CosineAnnealingLR(optimizer, T_max=T_max)
elif schedule_choice == 'CosineScheduleWithWarmup':
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

#Prepare with accelerator
accelerator = Accelerator(mixed_precision='fp16')
model, optimizer, scheduler, train_loader_local, val_loader_local = accelerator.prepare(
    model, optimizer, scheduler, train_loader, val_loader
)


Training with max_ctx_length=768



In [11]:

# Train the model (shortened training for grid search)
steps = 0
train_losses = []
val_losses = []
early_stop_steps = min(max_steps, 500)  # Reduce training for grid search

while steps < early_stop_steps:
    progress_bar = tqdm(train_loader_local, desc=f"Steps {steps}")
    for (batch,) in progress_bar:
        model.train()
        optimizer.zero_grad()
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        train_losses.append([loss.item(), steps])
        accelerator.backward(loss)
        optimizer.step()
        
        if steps % 50 == 0:
            avg_loss = evaluate_model(model, val_loader_local, steps)
            val_losses.append([avg_loss, steps])
            model.train()
            
        steps += 1
        progress_bar.set_postfix(loss=loss.item())

        del loss
        del outputs
        del batch
        
        if steps >= early_stop_steps:
            break

# Final evaluation
final_val_loss = evaluate_model(model, val_loader_local, steps)

# Store results
grid_results[(schedule_choice)] = {
    "final_val_loss": final_val_loss,
    "train_losses": train_losses,
    "val_losses": val_losses,
}

if schedule_choice == 'StepLR':
    grid_results[(schedule_choice)]["step_size"] = step_size
    grid_results[(schedule_choice)]["gamma"] = gamma
elif schedule_choice == 'CosineAnnealingLR':
    grid_results[(schedule_choice)]["T_max"] = T_max
elif schedule_choice == 'CosineScheduleWithWarmup':
    grid_results[(schedule_choice)]["warmup_steps"] = warmup_steps
    grid_results[(schedule_choice)]["num_training_steps"] = num_training_steps


del model
del tokenizer
del optimizer
del train_loader_local
del val_loader_local
del accelerator
del train_losses
del val_losses
torch.cuda.empty_cache()

val set: 100%|██████████| 50/50 [02:03<00:00,  2.47s/it]
Steps 0:   0%|          | 1/800 [02:26<32:27:00, 146.21s/it, loss=3.93]

Loss on validation subset (50/50 batches) at step 0: 3.8824


val set: 100%|██████████| 50/50 [01:37<00:00,  1.94s/it], loss=3.84]   
Steps 0:   6%|▋         | 51/800 [07:52<6:59:53, 33.64s/it, loss=2.43]

Loss on validation subset (50/50 batches) at step 50: 3.8824


val set: 100%|██████████| 50/50 [03:03<00:00,  3.66s/it]t, loss=2.38] 
Steps 0:  13%|█▎        | 101/800 [14:54<11:33:28, 59.53s/it, loss=6.71]

Loss on validation subset (50/50 batches) at step 100: 3.8824


val set: 100%|██████████| 50/50 [01:41<00:00,  2.04s/it]/it, loss=5.32] 
Steps 0:  19%|█▉        | 151/800 [27:16<7:50:46, 43.52s/it, loss=1.1] 

Loss on validation subset (50/50 batches) at step 150: 3.8824


Steps 0:  25%|██▍       | 199/800 [33:59<1:42:39, 10.25s/it, loss=5.19]
val set: 100%|██████████| 50/50 [01:14<00:00,  1.48s/it]


Loss on validation subset (50/50 batches) at step 200: 3.8824


In [12]:
joblib.dump(grid_results, f"../results/grid_results_{schedule_choice}.joblib")

['../results/grid_results_CosineScheduleWithWarmup.joblib']