In [1]:
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    DataCollatorForSeq2Seq,
    TextStreamer,
)
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from accelerate import infer_auto_device_map
import random
import numpy as np
import torch
import json
from tqdm import tqdm

In [2]:
import os

# Restrict this process to GPU 0.
# NOTE(review): torch is already imported by the first cell; this setting is
# presumably only honoured if CUDA has not been initialised yet - confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

## Qwen模型数据生成 & 预处理

In [3]:
def truncate_before(text):
    """Return the part of ``text`` that precedes the first '```python' marker.

    When the marker is present, the leading part is returned with surrounding
    whitespace stripped. When it is absent, ``text`` is returned unchanged
    (not stripped).
    """
    head, marker, _tail = text.partition('```python')
    if not marker:
        # No code fence: hand the input back exactly as received.
        return text
    return head.strip()

# Example input for truncate_before: prose followed by an opening code fence.
input_text = "\n".join([
    "",
    "Some leading text that we want to keep.",
    "```python",
    "This is some code that we do not want to include.",
    "",
])

# To try it out:
# result = truncate_before(input_text)
# print(result)


def get_code_block(text):
    """Extract the first ``` fenced block from ``text``, fences included.

    Args:
        text: Raw model output that may contain a Markdown code fence.

    Returns:
        * ``text`` unchanged when it contains no ``` fence at all;
        * everything from the opening fence to the end of ``text`` when the
          fence is never closed (e.g. the generation was cut off);
        * otherwise the substring from the opening fence through the closing
          fence, both inclusive.
    """
    fence = '```'

    start_index = text.find(fence)
    if start_index == -1:
        return text

    # Resume the search after the *whole* opening fence. Starting only one
    # character later lets the match overlap the opening fence whenever it is
    # directly followed by another backtick (a ```` fence, or an empty block),
    # which would return just four backticks.
    end_index = text.find(fence, start_index + len(fence))
    if end_index == -1:
        return text[start_index:]

    return text[start_index:end_index + len(fence)]

# Demo: only the fenced block is printed; text around the fences is dropped.
input_text = "\n".join([
    "",
    "Some leading text that we want to keep.",
    "```python",
    "This is some code that we do not want to include.",
    "```  aaaabbbbbbcccccdddd",
    "",
])
result = get_code_block(input_text)
print(result)


```python
This is some code that we do not want to include.
```


In [7]:
def generate_code_instruction(prompt, max_new_tokens=50):
    """Continue ``prompt`` with the notebook-global ``generator`` and return only the new text.

    Args:
        prompt: Text handed to the text-generation pipeline.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults
            to 50 (the value previously hard-coded here), so existing callers
            are unaffected; other models can pass a larger budget instead of
            redefining the function.

    Returns:
        The generated text with its first ``len(prompt)`` characters removed
        and surrounding whitespace stripped. The slice relies on the returned
        ``generated_text`` starting with the prompt itself.
    """
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated_text = response[0]['generated_text']
    return generated_text[len(prompt):].strip()

def generate_code_response(prompt, max_new_tokens=100):
    """Generate a code answer for ``prompt`` with the notebook-global ``generator``.

    Args:
        prompt: Text handed to the text-generation pipeline.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults
            to 100 (the value previously hard-coded here), so existing callers
            are unaffected.

    Returns:
        The generated text with its first ``len(prompt)`` characters removed
        and surrounding whitespace stripped. The slice relies on the returned
        ``generated_text`` starting with the prompt itself.
    """
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated_text = response[0]['generated_text']
    return generated_text[len(prompt):].strip()

def syn_dataset(num_synthetic_samples):
    """Build ``num_synthetic_samples`` synthetic instruction/code records.

    For every index ``i`` in ``range(num_synthetic_samples)``:
      1. ``generate_code_instruction`` continues a prompt that embeds ``i``;
         the result is trimmed with ``truncate_before``.
      2. ``generate_code_response`` is prompted with that instruction and the
         answer is reduced with ``get_code_block``.

    Returns:
        A list of dicts with the keys "instruction", "input" (always the empty
        string) and "output", in that order.
    """
    records = []
    for sample_idx in tqdm(range(num_synthetic_samples)):
        raw_instruction = generate_code_instruction(
            f"Write a code instruction snippet that {sample_idx}."
        )
        instruction_text = truncate_before(raw_instruction)

        # Generate the code for this instruction; it becomes the 'output' field.
        raw_code = generate_code_response(
            f"Generate Python Code for: {instruction_text}"
        )

        records.append({
            "instruction": instruction_text,
            "input": "",
            "output": get_code_block(raw_code),
        })

    return records

def pretty(synthetic_data):
    """Print every record's instruction and output between banner lines.

    Records are numbered from 1 in the banner. Each record must provide the
    keys 'instruction' and 'output'.
    """
    for position, record in enumerate(synthetic_data, start=1):
        print(f"---------------{position}-th Generation -----------------")
        print("^^^^^^^^^^^instruction:^^^^^^^^^^^")
        print(record['instruction'])
        print()
        print("^^^^^^^^^^^output:^^^^^^^^^^^")
        print(record['output'])
        print("-------------------------------------------------")
        print()

def convert_Json(synthetic_data, json_name):
    """Write ``synthetic_data`` to ``json_name`` as JSON indented by 4 spaces.

    Args:
        synthetic_data: Any JSON-serializable object (the notebook passes the
            list of dicts returned by ``syn_dataset``).
        json_name: Destination file path. Missing parent directories are
            created first, so a long generation run is not lost to a
            FileNotFoundError on a folder that does not exist yet.

    Raises:
        TypeError: If ``synthetic_data`` is not JSON-serializable. This is
            raised before the destination file is opened.
    """
    import os  # local import so the function does not depend on another cell

    parent_dir = os.path.dirname(json_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    json_data = json.dumps(synthetic_data, indent=4)
    # Pin the encoding so the file does not depend on the platform default.
    with open(json_name, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

In [61]:
# Local Qwen2.5-0.5B checkpoint; (re)binds the notebook-global `generator`
# that generate_code_instruction / generate_code_response call.
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/Qwen2.5-0.5B"
generator = pipeline(task="text-generation", model=base_model, device_map="auto")

In [62]:
# Smoke test: generate a handful of samples and eyeball them before the full run.
synthetic_data = syn_dataset(num_synthetic_samples=10)
pretty(synthetic_data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.83s/it]

---------------1-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
5.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.

^^^^^^^^^^^output:^^^^^^^^^^^
1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.1.
-------------------------------------------------

---------------2-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
takes a string as input and 2. converts it into a list of words. 3. Sorts the words in alphabetical order. 4. Removes any duplicate words from the list. 5. Returns the sorted list of words. Here

^^^^^^^^^^^output:^^^^^^^^^^^
```python
def string_to_list_and_sort(input_string):
    # Step 1: Convert the string into a list of words
    words = input_string.split()

    # Step 2: Sort the words in alphabetical order
    sorted_words = sorted(words)

    # Step 3: Remove any duplicate words from the list
    unique_words = list(set(sorted_words))

    # Step 4: Return the sorted list of words
-




In [63]:
# Full run (about an hour in the recorded output); the result is only written
# to disk once all samples have been generated.
synthetic_data = syn_dataset(num_synthetic_samples=1000)
convert_Json(synthetic_data, json_name="./synthetic/Qwen2.5-0.5B.json")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [1:06:18<00:00,  3.98s/it]


In [50]:
# Switch the notebook-global `generator` to the local
# Qwen2.5-Coder-0.5B-Instruct checkpoint.
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/Qwen2.5-Coder-0.5B-Instruct"
generator = pipeline(task="text-generation", model=base_model, device_map="auto")

In [52]:
# Smoke test with the currently bound `generator`: 10 samples, printed for review.
synthetic_data = syn_dataset(num_synthetic_samples=10)
pretty(synthetic_data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.93s/it]

---------------1-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
0001 seconds will take to execute in Python. This can be achieved using the `time.sleep()` function from the `time` module.

^^^^^^^^^^^output:^^^^^^^^^^^
```python
import time

# Define the block of code that you want to delay
def delay_code():
    print("Code is being executed...")
    # Simulate some work
    time.sleep(0.5)
-------------------------------------------------

---------------2-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
Reads the input file.
2. Iterates through each line in the file.
3. For each line, it extracts the first word using the `split()` method and checks if it matches the pattern of "A" followed by any number of digits

^^^^^^^^^^^output:^^^^^^^^^^^
```python
# Open the input file
with open('input.txt', 'r') as file:
    # Initialize variables to store indices and words
    index = 0
    words = []
    
    # Iterate through each line in the file
    f




In [53]:
# Full run with the currently bound `generator`; saved as one JSON file.
synthetic_data = syn_dataset(num_synthetic_samples=1000)
convert_Json(synthetic_data, json_name="./synthetic/Qwen2.5-Coder-0.5B-Instruct.json")
# pretty(synthetic_data)  # uncomment to print all generated records

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [49:46<00:00,  2.99s/it]


In [56]:
# Switch the notebook-global `generator` to the local Coder-1.5B-Instruct checkpoint.
# NOTE(review): this directory name starts with lowercase "qwen2.5", unlike the
# sibling checkpoint paths - confirm it matches the folder on disk.
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/qwen2.5-Coder-1.5B-Instruct"
generator = pipeline(task="text-generation", model=base_model, device_map="auto")

In [55]:
# Smoke test: 10 samples, printed for review.
# NOTE(review): this cell's execution count (55) is lower than that of the
# model-loading cell above (56), so the recorded output may come from a
# previously bound `generator` - re-run in order to confirm.
synthetic_data = syn_dataset(num_synthetic_samples=10)
pretty(synthetic_data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:31<00:00,  3.16s/it]

---------------1-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
5 seconds after the execution of a given command, it should print "Hello, World!" to the console. The given command is "echo 'Hello, World!'". Here's an example of how you can accomplish this using Python:

^^^^^^^^^^^output:^^^^^^^^^^^
```python
import subprocess
import time

# Execute the given command
subprocess.run(['echo', "'Hello, World!'"])

# Wait for 5 seconds
time.sleep(5)

# Print "Hello, World!"
print("Hello, World!")
```
-------------------------------------------------

---------------2-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
takes in a string, 2. checks if the string is empty, and 3. if not empty, it returns the length of the string.

Here's an example of how you could write this code:

```
def get_string_length(input

^^^^^^^^^^^output:^^^^^^^^^^^
```

In this code, we define a function called `get_string_length` that takes one argument: `input_string`. 

First




In [57]:
# Full run with the currently bound `generator`; saved as one JSON file.
synthetic_data = syn_dataset(num_synthetic_samples=1000)
convert_Json(synthetic_data, json_name="./synthetic/Qwen2.5-Coder-1.5B-Instruct.json")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [1:19:24<00:00,  4.76s/it]


In [58]:
# Switch the notebook-global `generator` to the local Coder-7B-Instruct checkpoint.
# The recorded run warns that some parameters were offloaded to the CPU, and
# the following cell shows roughly 288 s per sample.
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/Qwen2.5-Coder-7B-Instruct"
generator = pipeline(task="text-generation", model=base_model, device_map="auto")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [59]:
# Only 3 samples here: generation is very slow with this model (see the timing below).
pretty(syn_dataset(num_synthetic_samples=3))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [14:25<00:00, 288.61s/it]

---------------1-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
1 seconds delay for Python using time module.

Assistant: Here is the code instruction:

^^^^^^^^^^^output:^^^^^^^^^^^
```python
import time

time.sleep(1)
```
-------------------------------------------------

---------------2-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
reads the contents of a file named "data.txt" and stores it in a variable called `file_content`. 2. writes the contents of `file_content` to a new file named "output.txt". 3. counts the number of words

^^^^^^^^^^^output:^^^^^^^^^^^
in `file_content` and prints it.

:
-------------------------------------------------

---------------3-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
prints the string "Hello, world!" to the console.

:

^^^^^^^^^^^output:^^^^^^^^^^^
print("Hello, world!")
-------------------------------------------------






In [None]:
# Reduced run (100 samples) for the 7B model; this cell has no recorded execution.
synthetic_data = syn_dataset(num_synthetic_samples=100)
convert_Json(synthetic_data, json_name="./synthetic/Qwen2.5-Coder-7B-Instruct.json")

## Phi模型数据生成 & 预处理

In [18]:
def truncate_before(text):
    """Return the prose that precedes the first ``from ... import`` line.

    This variant of ``truncate_before`` cuts at the first import statement
    rather than at a Markdown fence. The cut point is a line that starts with
    ``from `` (ignoring indentation) and contains `` import ``. Matching the
    bare substring 'from' would also cut ordinary sentences such as
    "removes duplicate words from the list".

    NOTE: this redefinition shadows the fence-based ``truncate_before``
    defined earlier in the notebook.

    Args:
        text: Raw model output (prose optionally followed by code).

    Returns:
        The text before the first import line, stripped of surrounding
        whitespace; ``text`` unchanged (not stripped) when no such line exists.
    """
    offset = 0
    for line in text.splitlines(keepends=True):
        candidate = line.lstrip()
        if candidate.startswith('from ') and ' import ' in candidate:
            return text[:offset].strip()
        offset += len(line)
    return text

# Example input kept from the fence-based helper: prose followed by an opening code fence.
input_text = "\n".join([
    "",
    "Some leading text that we want to keep.",
    "```python",
    "This is some code that we do not want to include.",
    "",
])

# To try it out:
# result = truncate_before(input_text)
# print(result)


def get_code_block(text):
    """Extract the snippet running from the first 'from' to the end of the first 'return' line.

    NOTE: this redefinition shadows the fence-based ``get_code_block`` defined
    earlier in the notebook.

    Args:
        text: Raw model output that may contain code without Markdown fences.

    Returns:
        * ``text`` unchanged when it contains no 'from';
        * everything from 'from' to the end of ``text`` when no 'return'
          follows it;
        * otherwise the substring from 'from' through the end of the first
          line that contains 'return' (the line break itself is excluded).
    """
    idx1 = text.find('from')
    if idx1 == -1:
        return text

    # Look for 'return' only *after* the snippet start: a 'return' in the
    # leading prose would put the end before the start and yield an empty
    # result.
    return_idx = text.find('return', idx1)
    if return_idx == -1:
        return text[idx1:]

    # The snippet ends where the 'return' line ends. When that is the last line
    # and has no trailing newline, keep it whole instead of dropping its final
    # character.
    line_end = text.find('\n', return_idx)
    if line_end == -1:
        return text[idx1:]
    return text[idx1:line_end]

# Demo: the filler lines before 'from' and after the 'return' line are dropped.
input_text = "\n".join([
    "",
    "aaaaabbbbbcccccdddd",
    "from some_module import something",
    "def example_function():",
    '    print("This is an example")',
    "    return something",
    "aaaabbbbccccdddd",
    "",
])

result = get_code_block(input_text)
print(result)


from some_module import something
def example_function():
    print("This is an example")
    return something


In [15]:
def generate_code_instruction(prompt, max_new_tokens=200):
    """Continue ``prompt`` with the notebook-global ``generator`` and return only the new text.

    Args:
        prompt: Text handed to the text-generation pipeline.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults
            to 200 (the value previously hard-coded in this cell), so existing
            callers are unaffected.

    Returns:
        The generated text with its first ``len(prompt)`` characters removed
        and surrounding whitespace stripped. The slice relies on the returned
        ``generated_text`` starting with the prompt itself.
    """
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated_text = response[0]['generated_text']
    return generated_text[len(prompt):].strip()

def generate_code_response(prompt, max_new_tokens=200):
    """Generate a code answer for ``prompt`` with the notebook-global ``generator``.

    Args:
        prompt: Text handed to the text-generation pipeline.
        max_new_tokens: Generation budget forwarded to the pipeline. Defaults
            to 200 (the value previously hard-coded in this cell), so existing
            callers are unaffected.

    Returns:
        The generated text with its first ``len(prompt)`` characters removed
        and surrounding whitespace stripped. The slice relies on the returned
        ``generated_text`` starting with the prompt itself.
    """
    response = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated_text = response[0]['generated_text']
    return generated_text[len(prompt):].strip()


In [16]:
# Switch the notebook-global `generator` to the local phi-1 checkpoint.
base_model = "/workspace/LLM-finetune/codeLLM/huggingface/unsloth/phi-1"
generator = pipeline(task="text-generation", model=base_model, device_map="auto")

In [20]:
# Smoke test with the currently bound `generator`: 10 samples, printed for review.
pretty(syn_dataset(num_synthetic_samples=10))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:28<00:00,  8.87s/it]

---------------1-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
5 times the input string.

    Args:
    - s: a string to be multiplied by 0.5

    Returns:
    - A string that is the input string multiplied by 0.5 times.
    """
    return s * 0.5

^^^^^^^^^^^output:^^^^^^^^^^^
from typing import List

def count_same_adjacent_pairs(li: List[int]) -> int:
    """
    Returns the number of adjacent pairs of elements in the input list that are the same.

    Args:
    - li: a list of integers

    Returns:
    - an integer representing the number of adjacent pairs of elements in the input list that are the same
    """
    count = 0
    for i in range(len(li)-1):
        if li[i] == li[i+1]:
            count += 1
    return count
-------------------------------------------------

---------------2-th Generation -----------------
^^^^^^^^^^^instruction:^^^^^^^^^^^
creates a new list with the elements of li that are greater than n, 2. multiplies each element of the new 




In [21]:
# Full run with the currently bound `generator` (well over two hours in the
# recorded output); saved as one JSON file.
synthetic_data = syn_dataset(num_synthetic_samples=1000)
convert_Json(synthetic_data, json_name="./synthetic/phi-1.json")

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:23:48<00:00,  8.63s/it]


## 数据增强

In [None]:
# Number of samples requested from augment_dataset below.
data_sample = 1000

# Prompt template with three positional slots, filled in order with a record's
# instruction, input and output.
generate_prompt = "\n".join([
    "You're an expert Python coder. Generate a code instruction similar to the following.",
    "",
    "### Instruction:",
    "{}",
    "",
    "### Input:",
    "{}",
    "",
    "### Response:",
    "{}",
    "",
])

def augment_dataset(dataset, num_synthetic_samples):
    """Generate one synthetic text per seed record using the global ``generator``.

    Each of the first records of ``dataset`` is formatted into the module-level
    ``generate_prompt`` template and passed to the pipeline with a budget of
    100 new tokens.

    Args:
        dataset: Indexable, sized collection of records that provide the keys
            "instruction", "input" and "output".
        num_synthetic_samples: How many records to augment. Previously this
            argument was ignored and exactly 5 records were always used, which
            also failed on datasets shorter than 5. The count is capped at
            ``len(dataset)``; a non-positive value yields an empty list.

    Returns:
        A list with the pipeline's ``generated_text`` for each processed
        record, in dataset order.
    """
    sample_count = min(num_synthetic_samples, len(dataset))

    synthetic_data = []
    for i in range(sample_count):
        record = dataset[i]
        prompt = generate_prompt.format(
            record["instruction"],
            record["input"],
            record["output"],
        )
        generated = generator(prompt, max_new_tokens=100, num_return_sequences=1)
        synthetic_data.append(generated[0]['generated_text'])
    return synthetic_data
# NOTE(review): `dataset` is not defined in any cell visible in this notebook;
# it presumably has to be loaded before this cell can run - confirm.
synthetic_train_data = augment_dataset(dataset=dataset, num_synthetic_samples=data_sample)
print(synthetic_train_data)