In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "/home/lyz/hf-models/Qwen2.5-3B-Instruct/",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("/home/lyz/hf-models/Qwen2.5-3B-Instruct/")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'A large language model (LLM) is a type of artificial intelligence designed to understand and generate human-like text based on vast amounts of training data. These models are trained using advanced machine learning techniques, often involving deep neural networks, to recognize patterns in language. \n\nKey characteristics of LLMs include their ability to handle complex tasks such as translation, summarization, question-answering, and even creative writing. They can process and produce text in multiple languages, making them valuable tools for language translation and content generation. LLMs continue to evolve with advancements in natural language processing, aiming to improve their accuracy, efficiency, and adaptability to diverse linguistic contexts.'

In [2]:
import pandas as pd
import string
import numpy as np



In [4]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
bge_model = SentenceTransformer('/home/lyz/hf-models/bge-small-zh-v1.5/')
embeddings_1 = bge_model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = bge_model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.8692142  0.8665447 ]
 [0.89948714 0.87843263]]


In [6]:
data = pd.read_csv('test/test_cn2en.txt', sep='\t', header=None)
for row in data.iterrows():
    if row[1][0][0].lower() in string.ascii_letters:
        s1 = row[1][0]
        s2 = row[1][1]
    else:
        s1 = row[1][1]
        s2 = row[1][0]

    messages = [
        {"role": "user", "content": f"将中文翻译为英文，不要有其他输出：{s1}"},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    embeddings_1 = bge_model.encode([response, s2], normalize_embeddings=True)
    score = np.dot(embeddings_1[0], embeddings_1[1])
    score = int(score * 100)

    with open('submit/cn2en.txt', 'a') as up:
        up.write(f'{score}\n')

In [7]:
data = pd.read_csv('test/test_en2cn.txt', sep='\t', header=None)
for row in data.iterrows():
    if row[1][0][0].lower() in string.ascii_letters:
        s1 = row[1][0]
        s2 = row[1][1]
    else:
        s1 = row[1][1]
        s2 = row[1][0]

    messages = [
        {"role": "user", "content": f"将英文翻译为中文，不要有其他输出：{s1}"},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    
    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    embeddings_1 = bge_model.encode([response, s2], normalize_embeddings=True)
    score = np.dot(embeddings_1[0], embeddings_1[1])
    score = int(score * 100)

    with open('submit/en2cn.txt', 'a') as up:
        up.write(f'{score}\n')

In [9]:
!mkdir submit
!\rm submit/.ipynb_checkpoints/ -rf
!zip -r submit.zip submit

mkdir: 无法创建目录 "submit": 文件已存在


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: submit/ (stored 0%)
  adding: submit/cn2en.txt (deflated 65%)
  adding: submit/en2cn.txt (deflated 63%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
