In [1]:
import os
from pathlib import Path
from pprint import pprint

import nltk
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# to prevent CUDA OOM due to memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
# load tokenizer and model
# (if running for the first time, automatically downloaded from huggingface cloud)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct")

Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}


In [3]:
# move model to GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)
model.to(device)

cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm)

# Inference

In [4]:
# run model with some demo input
# the model is pre-trained for the masked language modeling task: predict the tokens that are masked with <extra_id_0>
src = """You are a helpful AI assistant for migrating code between two programming languages.
Today, you are tasked with translating a code snippet in Python into Java. The code snippet is given below.
```python
context = []
for _ in range(100):
    token = ngram.generate_next(context)
    if token == ngram.eos:
        break
    context.append(token)
    print(token, end=" ")
```
Please output only the code in the target programming language and nothing else.
"""
print("--- src ---\n", src)
src_ids = tokenizer(src, return_tensors="pt").to(device)
print("--- src_ids ---\n", src_ids.input_ids)

tgt_ids = model.generate(**src_ids, max_new_tokens=512)
print("--- tgt_ids ---\n", tgt_ids)
tgt = tokenizer.decode(tgt_ids[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- tgt ---\n", tgt)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- src ---
 You are a helpful AI assistant for migrating code between two programming languages.
Today, you are tasked with translating a code snippet in Python into Java. The code snippet is given below.
```python
context = []
for _ in range(100):
    token = ngram.generate_next(context)
    if token == ngram.eos:
        break
    context.append(token)
    print(token, end=" ")
```
Please output only the code in the target programming language and nothing else.

--- src_ids ---
 tensor([[32013,  2042,   417,   245,  9396, 20926, 20391,   327,  8290, 17278,
          2974,  1433,   979, 14244, 13867,    13,   185, 16197,    11,   340,
           417,  5256,   271,   365,  7700,  1128,   245,  2974,  4494,   515,
          6479,   279, 13003,   878,  9840,    13,   428,  2974,  4494,   515,
          6479,   317,  2017,  2867,    13,   185, 10252, 11364,   185,  6349,
           405,  9635,   185,  1459,  1070,   279,  3160,     7,    16,    15,
            15,  1772,   185,   315, 10

## Zero-shot vs. few-shot

In [5]:
src_zero = """You are a helpful AI assistant for migrating code between two programming languages.
Today, you are tasked with translating a code snippet in Python into Java. The code snippet is given below.
```python
context = []
for _ in range(100):
    token = ngram.generate_next(context)
    if token == ngram.eos:
        break
    context.append(token)
    print(token, end=" ")
```
Please output only the code in the target programming language and nothing else.
"""
src_zero_ids = tokenizer(src_zero, return_tensors="pt").to(device)
tgt_zero = tokenizer.decode(model.generate(**src_zero_ids, max_new_tokens=512)[0][len(src_zero_ids.input_ids[0]):], skip_special_tokens=True)
print("--- tgt_zero ---\n", tgt_zero)

src_few = """You are a helpful AI assistant for migrating code between two programming languages.
Today, you are tasked with translating a code snippet in Python into Java. Please output only the code in the target programming language and nothing else. 
For example:
<INPUT>
l = [1, 2, 3]
</INPUT>

<OUTPUT>
List<Integer> l = List.of(1, 2, 3);
</OUTPUT>

Now it is your turn
<INPUT>
context = []
for _ in range(100):
    token = ngram.generate_next(context)
    if token == ngram.eos:
        break
    context.append(token)
    print(token, end=" ")
</INPUT>

"""
src_few_ids = tokenizer(src_few, return_tensors="pt").to(device)
tgt_few = tokenizer.decode(model.generate(**src_few_ids, max_new_tokens=512)[0][len(src_few_ids.input_ids[0]):], skip_special_tokens=True)
print("--- tgt_few ---\n", tgt_few)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- tgt_zero ---
 
The target programming language is Java.

Here is the Java equivalent of the Python code:
```java
import java.util.ArrayList;
import java.util.List;

public class Main {
    public static void main(String[] args) {
        List<Integer> context = new ArrayList<>();
        for (int i = 0; i < 100; i++) {
            int token = ngram.generateNext(context);
            if (token == ngram.eos) {
                break;
            }
            context.add(token);
            System.out.print(token + " ");
        }
    }
}
```
Please note that the `ngram` and `eos` are not defined in the provided Python code. You need to define them in the Java code.

--- tgt_few ---
 <OUTPUT>
List<Integer> l = new ArrayList<>();
for (int i = 0; i < 100; i++) {
    int token = ngram.generate_next(context);
    if (token == ngram.eos) {
        break;
    }
    l.add(token);
    System.out.print(token + " ");
}
</OUTPUT>

Please note that the code snippet provided is in Python and the o

## Greedy, beam search, sampling

In [6]:
src = """You are a helpful AI assistant for migrating code between two programming languages.
Today, you are tasked with translating a code snippet in Python into Java. The code snippet is given below.
```python
context = []
for _ in range(100):
    token = ngram.generate_next(context)
    if token == ngram.eos:
        break
    context.append(token)
    print(token, end=" ")
```
Please output only the code in the target programming language and nothing else.
"""
src_ids = tokenizer(src, return_tensors="pt").to(device)

In [7]:
tgt_greedy = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- greedy ---\n", tgt_greedy)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- greedy ---
 
The target programming language is Java.

Here is the Java equivalent of the Python code:
```java
import java.util.ArrayList;
import java.util.List;

public class Main {
    public static void main(String[] args) {
        List<Integer> context = new ArrayList<>();
        for (int i = 0; i < 100; i++) {
            int token = ngram.generateNext(context);
            if (token == ngram.eos) {
                break;
            }
            context.add(token);
            System.out.print(token + " ");
        }
    }
}
```
Please note that the `ngram` and `eos` are not defined in the provided Python code. You need to define them in the Java code.



In [8]:
tgt_beam = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, num_beams=5, num_return_sequences=1)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- beam search (num_beams = 5) ---\n", tgt_beam)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- beam search (num_beams = 5) ---
 
In Java, the code would look like this:
```java
ArrayList<Integer> context = new ArrayList<>();
for (int i = 0; i < 100; i++) {
    int token = ngram.generateNext(context);
    if (token == ngram.eos) {
        break;
    }
    context.add(token);
    System.out.print(token + " ");
}
```
Please note that the `ngram.eos` and `ngram.generateNext(context)` are not defined in the provided Python code snippet. You would need to define these in your Java code.



In [9]:
tgt_sample = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling ---\n", tgt_sample)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling ---
 
Here is the Python code translated to Java:
```java
List<String> context = new ArrayList<>();
for(int i=0; i<100; i++){
    String token = ngram.generateNext(context);
    if(token.equals(ngram.eos)){
        break;
    }
    context.add(token);
    System.out.print(token + " ");
}
```
Please note, this code is assuming that your Python code uses a similar setup for generating tokens. In Java, you might need to create similar classes and methods to match the Python code.



In [10]:
tgt_sample_topp = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1, top_p=0.9)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling (top-p = 0.9) ---\n", tgt_sample_topp)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling (top-p = 0.9) ---
 
(Note: I'm assuming `ngram` is a class with an attribute `eos` and a method `generate_next`. It's worth mentioning that Python, Java and JavaScript have different syntaxes and semantics)

I'm also assuming that the `ngram.eos` attribute corresponds to a character in the Python code you've provided that signifies the end of a sequence, which in Java is indicated by a `null`, and the `generate_next` method is used to generate the next token in the sequence. If these assumptions are not true, please provide the correct ones so I can provide a more accurate translation.)



In [11]:
tgt_sample_topk = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1, top_k=10)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling (top-k = 10) ---\n", tgt_sample_topk)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling (top-k = 10) ---
 
Here's the equivalent code in Java:
```java
int[] context = new int[100];
for(int i=0; i<100; i++){
    int token = ngram.generateNext(context);
    if(token == ngram.eos){ 
        break;
    }
    context[i] = token; 
    System.out.print(token + " ");
}
```

Please note that the Java code has been adjusted to fit into the context and range of the Python code. The loop variable 'i' is also used instead of '_' in the for loop in the Python code. Java's array index starts from 0 and to print the tokens you can use System.out.print() in Java.



In [12]:
tgt_sample_topk_topp = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1, top_k=10, top_p=0.9)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling (top-k = 10, top-p = 0.9) ---\n", tgt_sample_topk_topp)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling (top-k = 10, top-p = 0.9) ---
 


In [13]:
tgt_sample_lowt = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1, top_k=10, top_p=0.9, temperature=0.5)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling (top-k = 10, top-p = 0.9, temperature = 0.5) ---\n", tgt_sample_lowt)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling (top-k = 10, top-p = 0.9, temperature = 0.5) ---
 
Here is the equivalent Java code:
```java
List<Integer> context = new ArrayList<>();
for (int i = 0; i < 100; i++) {
    int token = ngram.generateNext(context);
    if (token == ngram.eos) {
        break;
    }
    context.add(token);
    System.out.print(token + " ");
}
```
Please note that the Java code uses `List` instead of Python's `list`, and `ArrayList` instead of Python's `append`. Also, Java uses `System.out.print` instead of Python's `print`.



In [14]:
tgt_sample_hight = tokenizer.decode(model.generate(**src_ids, max_new_tokens=512, do_sample=True, num_return_sequences=1, top_k=10, top_p=0.9, temperature=2.0)[0][len(src_ids.input_ids[0]):], skip_special_tokens=True)
print("--- sampling (top-k = 10, top-p = 0.9, temperature = 2.0) ---\n", tgt_sample_hight)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


--- sampling (top-k = 10, top-p = 0.9, temperature = 2.0) ---
 


# Training (Supervised Finetuning)

## Prepare the dataset

In [4]:
# get a dataset
# a collection of datasets for various code<->text tasks: https://github.com/microsoft/CodeXGLUE
# also available on huggingface: https://huggingface.co/datasets?search=code_x_glue
dataset = load_dataset("google/code_x_glue_cc_code_to_code_trans")


In [5]:
print("Train: ", len(dataset["train"]))
print("Validation: ", len(dataset["validation"]))
print("Test: ", len(dataset["test"]))
print("Example data: ")
pprint(dataset["validation"][0])

Train:  10300
Validation:  500
Test:  1000
Example data: 
{'cs': 'public DVRecord(RecordInputStream in1){_option_flags = '
       'in1.ReadInt();_promptTitle = ReadUnicodeString(in1);_errorTitle = '
       'ReadUnicodeString(in1);_promptText = ReadUnicodeString(in1);_errorText '
       '= ReadUnicodeString(in1);int field_size_first_formula = '
       'in1.ReadUShort();_not_used_1 = in1.ReadShort();_formula1 = '
       'NPOI.SS.Formula.Formula.Read(field_size_first_formula, in1);int '
       'field_size_sec_formula = in1.ReadUShort();_not_used_2 = '
       'in1.ReadShort();_formula2 = '
       'NPOI.SS.Formula.Formula.Read(field_size_sec_formula, in1);_regions = '
       'new CellRangeAddressList(in1);}\n',
 'id': 0,
 'java': 'public DVRecord(RecordInputStream in) {_option_flags = '
         'in.readInt();_promptTitle = readUnicodeString(in);_errorTitle = '
         'readUnicodeString(in);_promptText = readUnicodeString(in);_errorText '
         '= readUnicodeString(in);int field_size_f

In [6]:
# preprocessing the dataset
def preprocess(elements, context_length=model.config.max_position_embeddings):
    seq_list = []
    for i in range(len(elements["id"])):
        seq_list.append(f"""You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
### Instruction:
Translate the following Java code into C#.
```java
{elements['java'][i].strip()}
```

### Response:
```cs
{elements['cs'][i].strip()}
```
""")
    return tokenizer(
        seq_list,
        truncation=True,
        padding="longest",
        max_length=context_length,
        return_tensors="pt",
        return_length=True,
    )


In [7]:
# only use a subset of the dataset
dataset_train = dataset["train"].select(range(800))
dataset_val = dataset["validation"].select(range(100))
dataset_test = dataset["test"].select(range(100))

# preprocess
dataset_train = dataset_train.map(preprocess, remove_columns=dataset_train.column_names, batched=True)
dataset_val = dataset_val.map(preprocess, remove_columns=dataset_val.column_names, batched=True)
dataset_test = dataset_test.map(preprocess, remove_columns=dataset_test.column_names, batched=True)

print(dataset_val[0])
print(tokenizer.decode(dataset_val[0]['input_ids']))

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'input_ids': [32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014, 32014

## Setting up evaluation metrics

In [8]:
HINT = "### Response:"
def postprocess(output):
    if HINT in output:
        return output.split(HINT)[1].strip()
    else:
        return output.strip()

def bleu(gold, pred) -> float:
    """
    Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.

    :param gold: list of gold tokens
    :param pred: list of predicted tokens
    :return: BLEU score
    """
    if len(pred) == 0 or len(gold) == 0:
        return 0.0
    return 100.0 * nltk.translate.bleu_score.sentence_bleu(
        [gold],
        pred,
        smoothing_function=nltk.translate.bleu_score.SmoothingFunction().method2,
        auto_reweigh=True,
    )

def compute_metrics(eval_preds, ignore_pad_token_for_loss: bool = True):
    preds, labels = eval_preds
    preds = preds.argmax(-1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print(decoded_labels)

    # Some simple post-processing
    decoded_preds = [postprocess(pred) for pred in decoded_preds]
    decoded_labels = [postprocess(label) for label in decoded_labels]

    result = {}
    result["xmatch"] = np.mean([100 if pred == label else 0 for pred, label in zip(decoded_preds, decoded_labels)])
    result["bleu"] = np.mean([bleu(label, pred) for pred, label in zip(decoded_preds, decoded_labels)])
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [9]:
training_args = TrainingArguments(
    output_dir=Path.cwd() / "_work" / "model" / "04-llm4code",
    learning_rate=2e-5,
    eval_accumulation_steps=10,
    num_train_epochs=1,
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=20,
    # working on an Ada6000 48GB
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    fp16=True,
)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


## Before finetuning

In [10]:
# run pre-trained model on test set
res = trainer.predict(dataset_test)
res.metrics

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)




[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpengyunie[0m ([33mpengyunie-uwaterloo[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'test_loss': 2.2803239822387695,
 'test_model_preparation_time': 0.0041,
 'test_xmatch': 0.0,
 'test_bleu': 13.668417404005783,
 'test_gen_len': 585.67,
 'test_runtime': 7.3461,
 'test_samples_per_second': 13.613,
 'test_steps_per_second': 0.953}

## After finetuning

In [11]:
# finetune the model
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Xmatch,Bleu,Gen Len
20,0.628,0.35959,0.0041,41.0,94.24402,693.0
40,0.3091,0.338351,0.0041,40.0,94.572475,693.0
60,0.2979,0.324405,0.0041,45.0,95.46134,693.0
80,0.3132,0.314987,0.0041,49.0,95.588964,693.0
100,0.3111,0.311079,0.0041,51.0,96.090948,693.0


['You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:\nTranslate the following Java code into C#.\n```java\npublic DVRecord(RecordInputStream in) {_option_flags = in.readInt();_promptTitle = readUnicodeString(in);_errorTitle = readUnicodeString(in);_promptText = readUnicodeString(in);_errorText = readUnicodeString(in);int field_size_first_formula = in.readUShort();_not_used_1 = in.readShort();_formula1 = Formula.read(field_size_first_formula, in);int field_size_sec_formula = in.readUShort();_not_used_2 = in.readShort();_formula2 = Formula.read(field_size_sec_formula, in);_regions = new CellRangeAddressList(in);}\n```\n\n### Response:\n```cs\npublic DVRecord(RecordInputStream in1){_option_flags = in1.ReadInt();_promptTitle = Re

TrainOutput(global_step=100, training_loss=0.3718496131896973, metrics={'train_runtime': 170.0501, 'train_samples_per_second': 4.704, 'train_steps_per_second': 0.588, 'total_flos': 5629713933926400.0, 'train_loss': 0.3718496131896973, 'epoch': 1.0})

In [12]:
# run fine-tuned model on test set
# we expect better performance than directly using the pre-trained model (not much better performance as we're only fine-tuning on a small dataset for a small number of steps)
res = trainer.predict(dataset_test)
res.metrics



{'test_loss': 0.3106990158557892,
 'test_model_preparation_time': 0.0041,
 'test_xmatch': 48.0,
 'test_bleu': 94.02824981912185,
 'test_gen_len': 588.0,
 'test_runtime': 6.2342,
 'test_samples_per_second': 16.041,
 'test_steps_per_second': 1.123}

In cases of GPU out of memory: try running the following two cells (the first cell is expected to raise an exception) to free up things that are withholded by the ipython notebook
see https://stackoverflow.com/questions/57858433/how-to-clear-gpu-memory-after-pytorch-model-training-without-restarting-kernel

In [31]:
1/0

ZeroDivisionError: division by zero

In [32]:
import gc

gc.collect()
torch.cuda.empty_cache()