In [None]:
import os

os.chdir('/kaggle/working')

REPO_NAME = "Project-PlotArmor"
REPO_PATH = f"/kaggle/working/{REPO_NAME}"
REPO_URL = "https://github.com/nileshcharan24/Project-PlotArmor.git"

if os.path.exists(REPO_PATH):
    print("ðŸ”„ Repo exists. Pulling latest changes...")
    os.chdir(REPO_PATH)
    !git pull origin main
    os.chdir('/kaggle/working')
else:
    print("ðŸ“¥ Repo not found. Cloning fresh...")
    !git clone {REPO_URL}

os.chdir(REPO_PATH)
print(f"âœ… Current Working Directory: {os.getcwd()}")
print("DONE")


In [None]:
!pip install -r requirements.txt
print("DONE")


In [None]:
# Download TinyStories dataset for real training
!python tools/download_data.py
print("Data downloaded")


In [None]:
# Pre-tokenize to memmap to avoid re-encoding on each run
!python tools/pre_tokenize.py --input research/data/tinystories_train.txt --output research/data/tinystories_train.bin --chunk_lines 10000
print("Pretokenization complete")


In [None]:
import sys

# Make the repo root (the current working directory) importable.
# Guarded so that re-running this cell does not keep appending duplicates.
if '.' not in sys.path:
    sys.path.append('.')

# Use long training config
from research.config.kaggle_long_train import KAGGLE_LONG_CONFIGS

# Override defaults
import argparse

# Single switch for the model: the hyperparameters below are looked up under the
# same key, so changing the model no longer silently reuses the 'bdh' settings.
# NOTE(review): assumes KAGGLE_LONG_CONFIGS has an entry for every model name
# used here (e.g. 'gpt2') -- confirm before switching.
MODEL = 'bdh'  # or 'gpt2'
model_config = KAGGLE_LONG_CONFIGS[MODEL]

args = argparse.Namespace(
    model=MODEL,
    data_path='research/data/tinystories_train.txt',
    pretokenized_path='research/data/tinystories_train.bin',
    max_steps=model_config['max_steps'],
    batch_size=model_config['batch_size'],
    val_interval=500,
    gen_interval=500
)

# The training entry point reads its options from the command line, so present
# the namespace above as `--key=value` arguments via sys.argv. The kernel's own
# argv is saved first and restored afterwards, even if training fails.
original_argv = sys.argv
sys.argv = ['train.py'] + [f'--{k}={v}' for k, v in vars(args).items() if not k.startswith('_')]
print("DEBUG: sys.argv set")

try:
    # Imported only after sys.argv is in place, matching the original ordering.
    from research.utils.train import main
    print("DEBUG: main imported")
    print("DEBUG: Starting training...")
    main()
finally:
    sys.argv = original_argv
print("DONE")


In [None]:
# Per the training workflow, output files are copied to /kaggle/working/ once
# training finishes; fetch them from Kaggle's output panel.
# This cell lists the CSV logs and .pt checkpoints found there.
import os

print("Available downloads:")
downloadable = [
    entry
    for entry in os.listdir('/kaggle/working')
    if entry.endswith(('.csv', '.pt'))
]
for entry in downloadable:
    print(f"- {entry}")


In [None]:
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd

# Gather every training log written to /kaggle/working/.
csv_files = glob.glob('/kaggle/working/training_log_*.csv')

if not csv_files:
    print("No CSV files found in /kaggle/working/")
else:
    # The log with the greatest ctime is treated as the latest run.
    latest_csv = max(csv_files, key=os.path.getctime)
    print(f"Plotting from: {latest_csv}")

    df = pd.read_csv(latest_csv)
    steps = df['step']

    # Two panels side by side: losses on the left, perplexity on the right.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: train and validation loss on shared axes.
    for column, label in (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss')):
        ax1.plot(steps, df[column], label=label)
    ax1.set(xlabel='Step', ylabel='Loss', title='Training and Validation Loss')
    ax1.legend()
    ax1.grid(True)

    # Right panel: perplexity against training step.
    ax2.plot(steps, df['perplexity'], color='orange')
    ax2.set(xlabel='Step', ylabel='Perplexity', title='Perplexity Over Time')
    ax2.grid(True)

    # Save the figure next to the logs so it appears in Kaggle's output panel.
    plt.tight_layout()
    plt.savefig('/kaggle/working/training_curve.png')
    plt.show()

    print("Plot saved to /kaggle/working/training_curve.png - download it!")
