<a href="https://colab.research.google.com/github/mohhomadfarman/PyCodeAI/blob/main/pycodeai_colab_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyCodeAI - Google Colab Training (GitHub Version)

This notebook trains your PyCodeAI model using code from GitHub and saves the results to Google Drive.

## Instructions

1.  **Configure**: Set your GitHub Repository URL in the first code cell.
2.  **Mount Drive**: Run the cell to connect Google Drive (for saving the trained model).
3.  **Run All**: Run all cells to clone the repository, install dependencies, train, and back up the results to Google Drive.

In [None]:
# CONFIGURATION
# Google Drive folder that receives the trained model and tokenizer files
DRIVE_SAVE_PATH = "/content/drive/MyDrive/PyCodeAI_Models"

# Git source for the training code: point GITHUB_REPO at your own repository
GITHUB_REPO = "https://github.com/mohhomadfarman/PyCodeAI.git"
BRANCH = "main"  # or "master"

In [None]:
# 1. Mount Google Drive
import os

from google.colab import drive

# Attach Google Drive at /content/drive so DRIVE_SAVE_PATH becomes reachable.
drive.mount('/content/drive')

# Ensure the destination folder is present; an already-existing folder is fine.
os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)
print(f"Models will be saved to: {DRIVE_SAVE_PATH}")

In [None]:
# 2. Clone Repository & Install Dependencies
!git clone {GITHUB_REPO} PyCodeAI_Repo
%cd PyCodeAI_Repo
!git checkout {BRANCH}
!git pull origin {BRANCH}  # Ensure we have the latest

# Install cupy for GPU
!pip install cupy-cuda12x

In [None]:
# 3. Check for Existing Model
import os
import shutil

# Optional resume-from-Drive step: if a 'best_model.npz' is stored in your
# Drive folder, uncomment the lines below to copy it into the workspace first.
# DRIVE_MODEL = os.path.join(DRIVE_SAVE_PATH, 'best_model.npz')
# if os.path.exists(DRIVE_MODEL):
#     print("Found model in Drive, copying to local workspace...")
#     shutil.copy(DRIVE_MODEL, 'best_model.npz')

# Report whether a local checkpoint is present in the current directory.
print(
    "Starting training from existing 'best_model.npz'..."
    if os.path.exists('best_model.npz')
    else "No 'best_model.npz' found. Starting fresh training (or finding it in repo)."
)

In [None]:
# 4. Run Training
# Runs the repository's `cli.py train` command with `--device gpu`.
# - Passes best_model.npz as --load-model (intended to resume if it exists;
#   what cli.py does when the file is missing is not visible here — TODO confirm)
# - Saves to best_model_new.npz (--output-model)
# - Writes the tokenizer to tokenizer_new.json (--output-tokenizer)
# NOTE: the relative paths below assume the current directory is the cloned repo.

# Protect original tokenizer: seed tokenizer_new.json with a copy of tokenizer.json.
# cp's error output is discarded; if the copy fails, the echo message is shown instead.
!cp tokenizer.json tokenizer_new.json 2>/dev/null || echo "No tokenizer.json found, will build new one."

# Hyperparameters are hard-coded here: 5 epochs, batch size 32, log interval 10
# (the unit of --log-interval is defined by cli.py — presumably steps; verify).
!python cli.py train \
    --device gpu \
    --load-model best_model.npz \
    --output-model best_model_new.npz \
    --output-tokenizer tokenizer_new.json \
    --epochs 5 \
    --batch-size 32 \
    --log-interval 10

In [None]:
# 5. Save Results to Drive
import datetime
import os  # imported here so the cell does not rely on an import from an earlier cell
import shutil


def backup_to_drive(local_file, dest_dir, stamped_name, latest_name):
    """Copy ``local_file`` into ``dest_dir`` under two names.

    Args:
        local_file: Path of the file produced by training.
        dest_dir: Destination directory (the Drive save folder).
        stamped_name: Timestamped file name kept as a permanent snapshot.
        latest_name: Fixed file name that is overwritten on every run.

    Returns:
        True if ``local_file`` existed and both copies were made,
        False if ``local_file`` was missing (nothing is copied).
    """
    if not os.path.exists(local_file):
        return False
    for target_name in (stamped_name, latest_name):
        shutil.copy(local_file, os.path.join(dest_dir, target_name))
    return True


timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
new_model_name = f"best_model_{timestamp}.npz"
new_token_name = f"tokenizer_{timestamp}.json"

print(f"Backing up to Drive as {new_model_name}...")

# Copy model (timestamped snapshot + rolling 'latest' copy)
if backup_to_drive("best_model_new.npz", DRIVE_SAVE_PATH, new_model_name, "best_model_latest.npz"):
    print("Model saved.")
else:
    print("ERROR: best_model_new.npz not found!")

# Copy tokenizer (timestamped snapshot + rolling 'latest' copy)
if backup_to_drive("tokenizer_new.json", DRIVE_SAVE_PATH, new_token_name, "tokenizer_latest.json"):
    print("Tokenizer saved.")
else:
    print("WARNING: tokenizer_new.json not found!")