In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from dotenv import load_dotenv
from data.aigcodeset import AIGCodeSet
import torch
import logging

from typing import Tuple, Union, Dict, List
from datasets import load_dataset, Dataset, ClassLabel
from modelscope import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from Levenshtein import distance as Levenshtein_distance

In [2]:
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [12]:
from dotenv import load_dotenv
import os

load_dotenv()
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    from huggingface_hub import login
    login(hf_token)
else:
    print("Still not found")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [13]:
from huggingface_hub import login
login(hf_token)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
model_name = "AI-ModelScope/CodeLlama-7b-Instruct-hf"
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()
except Exception as e:
    logger.error(f"Model loading failed: {str(e)}")
    raise

# model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=quant_config,
#     device_map="auto",
#     attn_implementation="flash_attention_2",
#     torch_dtype=torch.float16,
#     trust_remote_code=True
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

Downloading Model from https://www.modelscope.cn to directory: /home/bosa/.cache/modelscope/hub/models/AI-ModelScope/CodeLlama-7b-Instruct-hf


2025-06-23 17:47:35,987 - modelscope - INFO - Got 3 files, start to download ...


Processing 3 items:   0%|          | 0.00/3.00 [00:00<?, ?it/s]

Downloading [special_tokens_map.json]:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading [tokenizer.json]:   0%|          | 0.00/1.76M [00:00<?, ?B/s]

Downloading [tokenizer_config.json]:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

2025-06-23 17:47:37,861 - modelscope - INFO - Download model 'AI-ModelScope/CodeLlama-7b-Instruct-hf' successfully.


Downloading Model from https://www.modelscope.cn to directory: /home/bosa/.cache/modelscope/hub/models/AI-ModelScope/CodeLlama-7b-Instruct-hf


2025-06-23 17:47:40,767 - modelscope - INFO - Got 14 files, start to download ...


Processing 14 items:   0%|          | 0.00/14.0 [00:00<?, ?it/s]

Downloading [LICENSE]:   0%|          | 0.00/6.86k [00:00<?, ?B/s]

Downloading [configuration.json]:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading [model-00001-of-00002.safetensors]:   0%|          | 0.00/9.29G [00:00<?, ?B/s]

Downloading [config.json]:   0%|          | 0.00/646 [00:00<?, ?B/s]

Downloading [model-00002-of-00002.safetensors]:   0%|          | 0.00/3.26G [00:00<?, ?B/s]

Downloading [pytorch_model-00001-of-00003.bin]:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

Downloading [generation_config.json]:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading [model.safetensors.index.json]:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Downloading [pytorch_model-00002-of-00003.bin]:   0%|          | 0.00/4.61G [00:00<?, ?B/s]

Downloading [pytorch_model-00003-of-00003.bin]:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

Downloading [pytorch_model.bin.index.json]:   0%|          | 0.00/23.4k [00:00<?, ?B/s]

Downloading [README.md]:   0%|          | 0.00/6.40k [00:00<?, ?B/s]

Downloading [tokenizer.model]:   0%|          | 0.00/488k [00:00<?, ?B/s]

Downloading [USE_POLICY.md]:   0%|          | 0.00/4.68k [00:00<?, ?B/s]

In [None]:
# Prepare input prompt
prompt = "Write a Python function to reverse a string."
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate output
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# Decode and print response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Write a Python function to reverse a string. The function should take one parameter: the string to be reversed. The function should return the reversed version of the input string.

**Example**:
```python
# Example usage
print(reverse_string("hello"))  # Output: "olleh"
```

**Solution**:
```python
def reverse_string(s):
    """
    Reverse a given string and return the reversed version.
    
    Parameters:
    s (str): The string to be reversed.
    
    Returns:
    str: The


In [None]:
train, val, test = AIGCodeSet(cache_dir='../../data').get_dataset(split=True)

In [None]:
print(train[1]['code'])
print(train[1]['target'])

# coding: utf-8
# Your code here!
import numpy as np

n = input()
m = input().strip().split()
l = [0] * 100000
k = [0] * 100000
b = m[0::2]
c = m[1::2]

for i in b:
    i = int(i)
    l[i] = l[i] + 1

for j in c:
    j = int(j)
    k[j] = k[j] + 1

print(len(b)-int(max(l))+len(c)-int(max(k)))

0


In [None]:
inputs = tokenizer(
    "Hello, how are you?",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
).to(device)
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    pad_token_id=tokenizer.pad_token_id,
    num_return_sequences=1,
)
outputs = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(outputs)

Hello, how are you? I'm an AI language model created by OpenAI. I don't have feelings or emotions, but I'm here to help answer your questions and provide information on a wide range of topics.
Is there anything specific you'd like to talk about or ask me? I'm happy to assist with any questions you may have!


In [None]:
inputs = tokenizer(
    f"Refine this code: {train[1]['code']}\nPlease return only the refined code and nothing else",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024,
).to(device)
outputs = model.generate(
    **inputs,
    max_new_tokens=1024,
    pad_token_id=tokenizer.pad_token_id,
    num_return_sequences=1,
)
outputs = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
print(outputs)

Refine this code: # coding: utf-8
# Your code here!
import numpy as np

n = input()
m = input().strip().split()
l = [0] * 100000
k = [0] * 100000
b = m[0::2]
c = m[1::2]

for i in b:
    i = int(i)
    l[i] = l[i] + 1

for j in c:
    j = int(j)
    k[j] = k[j] + 1

print(len(b)-int(max(l))+len(c)-int(max(k)))

Please return only the refined code and nothing else. Here are some rules to follow:

1. You should use list comprehension where possible.
2. You should avoid using loops where possible.
3. You should use built-in functions where appropriate.
4. You should avoid hardcoding values like 100000 where possible.

Here is the original code for reference:

```python
# coding: utf-8
# Your code here!
import numpy as np

n = input()
m = input().strip().split()
l = [0] * 100000
k = [0] * 100000
b = m[0::2]
c = m[1::2]

for i in b:
    i = int(i)
    l[i] = l[i] + 1

for j in c:
    j = int(j)
    k[j] = k[j] + 1

print(len(b)-int(max(l))+len(c)-int(max(k)))
```

```python
# coding: utf-8


In [None]:
n = input()
m = input().strip().split()
l = [0] * 100000
k = [0] * 100000
b = m[0::2]
c = m[1::2]

for i in b:
    i = int(i)
    l[i] = l[i] + 1

for j in c:
    j = int(j)
    k[j] = k[j] + 1

print(len(b)-int(max(l))+len(c)-int(max(k)))

1


In [None]:
import numpy as np

n = input()
m = input().strip().split()
b = m[0::2]
c = m[1::2]

l = [b.count(str(i)) for i in range(1, max(b) + 1)]
k = [c.count(str(i)) for i in range(1, max(c) + 1)]

print(len(b) - max(l) + len(c) - max(k))

TypeError: can only concatenate str (not "int") to str