<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/codecompletion_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Modify torch and transformers which requires manual notebook restart
# Could not redirect to /dev/null in the standard Colab notebook (maybe no output for a particular time?)
# Currently deepspeed from GTP-NeoX is not compatible with logging in torch >= 2.4
%pip install --use-feature=fast-deps -q torch==2.3 &
%pip install --use-feature=fast-deps -q torchaudio==2.3.0 &
%pip install --use-feature=fast-deps -q torchvision==0.18.0 &
%pip install --use-feature=fast-deps -q transformers==4.38.0 &
%pip install --use-feature=fast-deps -q sentence-transformers==2.2.2 &

In [None]:
# Install sysbench and run its CPU benchmark to gauge the speed of this VM.
!apt install -y sysbench
# The Colab vCPU should give around 200 events per second. The high RAM instance type is around 350 (at twice the price).
# Given the price of a T4 GPU and the percentage of time we spend in setup, is it worth using a high RAM instance?
# The high RAM instance has 8 threads instead of 2 and can reach around 1600 events per second with all 8 threads.
!sysbench cpu --cpu-max-prime=20000 run

In [None]:
# Directory layout used by the rest of the notebook. Pointing these at other
# locations lets the notebook be "stubbed" for test/dev runs.
workspaceDir = "/content"
GPTNeoXDirName, GPTNeoXColabDirName = "gpt-neox", "GPT-NeoX-Colab"
# Full paths are simply <workspace>/<directory name>
GPTNeoXDir = "/".join((workspaceDir, GPTNeoXDirName))
GPTNeoXColabDir = "/".join((workspaceDir, GPTNeoXColabDirName))

# Clone CodeXGLUE Repo

In [None]:
# Clone CodeXGLUE with a trailing `&` (background job) so the following setup
# cells do not have to wait for it. The clone lands in the current working
# directory; later cells read it from /content/CodeXGLUE, so this assumes the
# notebook starts in /content — TODO confirm outside Colab.
!git clone https://github.com/microsoft/CodeXGLUE.git &

In [None]:
#@title Clone GPT-NeoX-Colab
%%time
%cd {workspaceDir}
# Don't use --depth 1 because that does not play nice with git-annex
!git clone https://github.com/markNZed/GPT-NeoX-Colab.git
%cd {GPTNeoXColabDir}
#%pip install --use-feature=fast-deps -q -r requirements_colab.txt
!cat requirements_colab.txt | xargs -n 1 -P 8 pip install --use-feature=fast-deps -q
%pip install --use-feature=fast-deps -q .
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir}/.env")
import GPTNeoXColab
GPTNeoXColab.utils.colab.fetch_data("data/codecompletion/processed_data")

In [None]:
# Clone the GPT-NeoX fork into the workspace directory.
%cd {workspaceDir}
# Upstream repository, kept for reference:
#!git clone https://github.com/EleutherAI/gpt-neox.git
# Shallow clone (--depth 1) of the fork's `pipe_parallel_size_1` branch
!git clone -b pipe_parallel_size_1 --depth 1 https://github.com/markNZed/gpt-neox.git

In [None]:
!mkdir -p /content/gpt-neox/processed_data
!cp {GPTNeoXColabDir}/data/codecompletion/processed_data/* /content/gpt-neox/processed_data
processed_data_path = "/content/gpt-neox/processed_data"

# Cloning GPT-NeoX Repo

# Downloading and Preprocessing Dataset

In [None]:
import os

# Check if the file exists
if not os.path.exists(processed_data_path):
    # Change directory
    %cd /content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150
    # Run the shell script to download and extract
    !bash /content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150/download_and_extract.sh
    # Run the preprocessing Python script
    !python preprocess.py --base_dir=py150_files --output_dir=token_completion
else:
    print("File already exists, skipping download and preprocessing.")


# Install Dependencies

In [None]:
%%time
# Install GPT-NeoX's Python requirements from the cloned repository.
%cd /content/gpt-neox
%pip install --use-feature=fast-deps -q -r ./requirements/requirements.txt
# Alternative (disabled): one pip process per requirement, up to 8 in parallel
#!cat ./requirements/requirements.txt | xargs -n 1 -P 8 pip install --use-feature=fast-deps -q

# Preparing Custom Dataset


In [None]:
# Create the `data` directory inside the GPT-NeoX checkout; later cells write
# the JSONL training file and the tokenizer vocab/merges files into it.
%cd /content/gpt-neox
!mkdir -p data

In [None]:
import json
import os

# Convert the py150 training text into JSON Lines (one {"text": ...} object per
# line), but only when no processed dataset is available yet.
if not os.path.exists(processed_data_path):
    train_txt = "/content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150/token_completion/train.txt"
    with open(train_txt, encoding="utf8") as src:
        raw_lines = src.read().splitlines()

    # One JSON string per non-empty source line
    records = [json.dumps({"text": text}) for text in raw_lines if text]

    # Records are newline-separated, with no trailing newline
    with open('/content/gpt-neox/data/py95K_train.jsonl', 'w') as dst:
        dst.write('\n'.join(records))

# Using Byte-Pair Encoding Tokenizer

In [None]:
%cd data
!wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json &
!wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt

In [None]:
%%time
import os

# Tokenise the JSONL training data with GPT-NeoX's preprocessing tool, unless
# something already exists at processed_data_path.
# Check if the file exists
if not os.path.exists(processed_data_path):
  %cd /content/gpt-neox
  !mkdir -p processed_data
  # Reads ./data/py95K_train.jsonl and the GPT-2 vocab/merges files from ./data;
  # output goes under ./processed_data with the `py150` prefix.
  !python tools/datasets/preprocess_data.py \
    --input ./data/py95K_train.jsonl \
    --vocab ./data/gpt2-vocab.json \
    --merge-file ./data/gpt2-merges.txt \
    --output-prefix ./processed_data/py150 \
    --tokenizer-type GPT2BPETokenizer \
    --dataset-impl mmap \
    --append-eod
else:
    print("File already exists, skipping download and preprocessing.")


# Tokens count in Dataset

In [None]:
import os

from transformers import GPT2Tokenizer

# Counting means encoding the whole training file, so it is opt-in.
# Set to True to actually run the count.
COUNT_TOKENS = False

# Initialize the GPT-2 tokenizer (BPE-based)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Path to the py150 training text
file_path = "/content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150/token_completion/train.txt"

# Initialize a token counter
total_token_count = 0

if not COUNT_TOKENS:
    # Say so explicitly instead of printing a misleading count of 0
    print("Token counting is disabled (set COUNT_TOKENS = True to enable it).")
elif not os.path.exists(file_path):
    print(f"Cannot count tokens: {file_path} does not exist.")
else:
    # Read line by line so the whole file is never held in memory
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            total_token_count += len(tokenizer.encode(line))
    print(f"Total token count: {total_token_count}")


# Training

In [None]:
# Load the TensorBoard notebook extension, which provides the `%tensorboard` magic used below.
%load_ext tensorboard

In [None]:
%%time
# Start training as a background job (`nohup ... &`). Because the command is
# backgrounded, the time reported for this cell presumably does not cover the
# training run itself — TODO confirm.
%cd /content/gpt-neox
!nohup python ./deepy.py train.py --conf_dir /content/GPT-NeoX-Colab/configs codecompletion codecompletion_train &

In [None]:
# Show TensorBoard inline, reading the `tensorboard` directory inside the GPT-NeoX checkout.
%cd {GPTNeoXDir}
%tensorboard --logdir tensorboard

In [None]:
!pip show datasets
!pip install datasets==1.18.0
!pip install hf-transfer
!pip install lm-eval --upgrade

# Inference

In [None]:
%cd /content/gpt-neox

import tensorflow as tf
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import os
import numpy as np
# Path to the latest log file
log_dir = "tensorboard"
log_files = [os.path.join(log_dir, d) for d in os.listdir(log_dir)]
latest_log_dir = max(log_files, key=os.path.getmtime)

# Initialize EventAccumulator to load scalar data
ea = event_accumulator.EventAccumulator(latest_log_dir)
ea.Reload()  # Load all logs

# List all scalar keys available in the logs
scalar_keys = ea.Tags()['scalars']
print("Available scalar keys:", scalar_keys)

# Extract training and validation losses
train_loss = ea.Scalars('train/lm_loss')  # Adjust for actual name if necessary
val_loss = ea.Scalars('validation/lm_loss')  # Adjust for actual name if necessary

# Convert to lists for plotting
train_loss_values = [x.value for x in train_loss]
val_loss_values = [x.value for x in val_loss]

# Find the lengths of both arrays
len_train = len(train_loss_values)
len_val = len(val_loss_values)

iterations = None
# Interpolate the shorter array
if len_train != len_val:
    if len_train > len_val:
        # Interpolate validation loss to match the training loss length
        iterations = np.linspace(1, len_train, len_train)
        val_iterations = np.linspace(1, len_train, len_val)
        val_loss_values = np.interp(iterations, val_iterations, val_loss_values)
    else:
        # Interpolate training loss to match the validation loss length
        iterations = np.linspace(1, len_val, len_val)
        train_iterations = np.linspace(1, len_val, len_train)
        train_loss_values = np.interp(iterations, train_iterations, train_loss_values)
else:
    iterations = range(1, len_train + 1)

# Plot training and validation loss
plt.figure(figsize=(10, 5))
plt.plot(iterations, train_loss_values, label='Training Loss')
plt.plot(iterations, val_loss_values, label='Validation Loss')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss')
plt.show()


# HuggingFace Inference

# Convert Our Model to HuggingFace Format

In [None]:
import os

# Root directory holding the training checkpoints
checkpoints_dir = "/content/gpt-neox/checkpoints"

# The `latest` file inside it holds the name of the latest checkpoint;
# surrounding whitespace is stripped.
latest_marker = os.path.join(checkpoints_dir, "latest")
with open(latest_marker, "r") as marker:
    latest_checkpoint_name = marker.read().strip()

# Full path of that checkpoint's directory
latest_checkpoint_path = os.path.join(checkpoints_dir, latest_checkpoint_name)
print("Path to the latest checkpoint:", latest_checkpoint_path)

In [None]:
# Convert the latest NeoX checkpoint to HuggingFace format. The script path and
# the `hf_model/save/location` output directory are relative, so this relies on
# the working directory being /content/gpt-neox (the model is later loaded from
# /content/gpt-neox/hf_model/save/location).
!python ./tools/ckpts/convert_neox_to_hf.py --input_dir {latest_checkpoint_path} --config_file /content/GPT-NeoX-Colab/configs/codecompletion.yml --output_dir hf_model/save/location --precision auto --architecture neox

# Code Completion

In [None]:
from transformers import GPTNeoXForCausalLM
import torch

# Move to model directory
%cd /content/gpt-neox

# Assuming CharLevelTokenizer is properly imported and instantiated
from megatron.tokenizer.tokenizer import _GPT2BPETokenizer
tokenizer = _GPT2BPETokenizer(vocab_file="data/gpt2-vocab.json", merge_file="data/gpt2-merges.txt")

# Load your model
model_path = "/content/gpt-neox/hf_model/save/location"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Define a simple char-level tokenizer if not provided
def token_level_tokenize(text):
    return tokenizer.tokenize(text)

def token_level_detokenize(tokens):
    return tokenizer.detokenize(tokens)

# Set the model to evaluation mode
model.eval()

# Prompt the user for input
input_text = """<s> import sys , os <EOL> import imp <EOL> from optparse import make_option <EOL> from django . conf import settings <EOL> from django . utils . importlib import import_module <EOL> from django . core . management import call_command <EOL> from django . core . management import BaseCommand <EOL> from django . db import connections <EOL> def import_app ( app_label , verbosity ) : <EOL> try : <EOL> app_path = __import__ ( app_label , { } , { } , [ app_label . split ( '<STR_LIT:.>' ) [ - <NUM_LIT:1> ] ] ) . __path__ <EOL>"""

# Tokenize and prepare input
input_ids = torch.tensor([token_level_tokenize(input_text)], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)  # Create an attention mask for non-padded input

# Generate text with specified pad_token_id and attention_mask
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,          # Adjust this for desired output length
        temperature=0.7,        # Controls creativity
        top_k=50,               # Controls diversity
        top_p=0.9,              # Nucleus sampling
        num_return_sequences=1, # Number of sequences to return
        pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
        do_sample=True           # Enable sampling mode to use temperature and top_p
    )

# Decode the generated text
generated_text = token_level_detokenize(output[0].tolist())

# Function to replace special tokens with original representations
def replace_special_tokens(text):
    """Turn py150-style placeholder tokens in `text` back into readable text.

    - Literal placeholders that carry a value, such as ``<STR_LIT:.>`` or
      ``<NUM_LIT:1>``, are replaced by that value (``.`` and ``1``).
    - Bare ``<STR_LIT>`` / ``<NUM_LIT>`` become ``STR_LITERAL`` / ``NUM_LITERAL``.
    - ``<EOL>`` becomes a newline; ``<s>`` and ``</s>`` are removed.

    Args:
        text: Text containing placeholder tokens.

    Returns:
        The converted text with leading/trailing whitespace stripped.
    """
    import re  # local import: the cell's import lines are outside this block

    # Valued literals first. The value is everything up to the first `>`, so a
    # value that itself contains `>` would be cut short.
    text = re.sub(r"<(?:STR|NUM)_LIT:(.*?)>", lambda match: match.group(1), text)

    replacements = {
        "<EOL>": "\n",  # Replace with actual newline
        "<s>": "",
        "</s>": "",     # Remove end token
        "<STR_LIT>": "STR_LITERAL",  # Example replacement, adjust as necessary
        "<NUM_LIT>": "NUM_LITERAL",   # Example replacement, adjust as necessary
    }

    for token, replacement in replacements.items():
        text = text.replace(token, replacement)

    return text.strip()  # Strip leading/trailing whitespace

# Make the decoded output readable by converting the dataset's placeholder
# tokens (see replace_special_tokens)
final_text = replace_special_tokens(generated_text)

# Print the final output
print("Generated text:", final_text)


In [None]:
import os

# Check if the file exists
if not os.path.exists("/content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150/token_completion"):
    # Change directory
    %cd /content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150
    # Run the shell script to download and extract
    !bash /content/CodeXGLUE/Code-Code/CodeCompletion-token/dataset/py150/download_and_extract.sh
    # Run the preprocessing Python script
    !python preprocess.py --base_dir=py150_files --output_dir=token_completion
else:
    print("File already exists, skipping download and preprocessing.")

In [None]:
!cp /content/gpt-neox/data/gpt2-vocab.json /content/gpt-neox/hf_model/save/location/vocab.json
!cp /content/gpt-neox/data/gpt2-merges.txt /content/gpt-neox/hf_model/save/location/merges.txt
%cd /content/CodeXGLUE/Code-Code/CodeCompletion-token/code
!python -u run_lm.py \
        --data_dir=../dataset/py150/token_completion \
        --lit_file=../dataset/py150/literals.json \
        --langs=$LANG \
        --output_dir=../dataset/py150 \
        --pretrain_dir=/content/gpt-neox/hf_model/save/location \
        --log_file=../completion_python_eval.log \
        --model_type=gpt2 \
        --block_size=2048 \
        --do_eval \
        --per_gpu_eval_batch_size=4 \
        --logging_steps=100 \
        --seed=42

# Using git on VM
- Create a personal access token (PAT) on GitHub with Contents permission for the repo
- To store the details on first entry: `git config --global credential.helper store`
- To enter the password once and have it stored: `git push --dry-run`
# Using DVC
- `export AWS_SECRET_ACCESS_KEY=xxxx`
- `export AWS_ACCESS_KEY_ID=xxx`
- `mkdir -p /content/GPT-NeoX-Colab/models/codecompletion`
- `cd /content/GPT-NeoX-Colab/models/codecompletion`
- `tar -cf global_step7000.tar -C /content/gpt-neox/checkpoints global_step7000`
- `gzip global_step7000.tar`
- `dvc add global_step7000.tar.gz`
- `cd ../..`
- `git add .`
- `git commit -m "add to dvc"`
- `git push`
- `dvc push`

