In [None]:
!pip install datasets evaluate transformers[sentencepiece]  accelerate

In [1]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")
imdb_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [2]:
from datasets import load_dataset, DatasetDict
raw_datasets = DatasetDict(
    {
        "train": imdb_dataset["train"].shuffle().select(range(25000)),  # .shuffle().select(range(50000)),
        "valid": imdb_dataset["test"].shuffle().select(range(1000)),  # .shuffle().select(range(500))
    }
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

In [3]:
# 初始化tokenizer
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
from transformers import GPT2Tokenizer,GPT2Model,AutoModel
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from tokenizers import Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2') #'gpt2-medium
tokenizer.pad_token = tokenizer.eos_token  # GPT2 does not have a pad token, so we use the eos_token as pad_token



In [4]:
#构建提示词
def format_input(example):

  instruction = "Determine whether the sentiment of following text is positive or negative"

  instruction_text = (
      f"Below is an instruction that describes a task. "
      f"Write a response that appropriately completes the request."
      f"\n\n### Instruction:\n{instruction}"
  )

  input = example["text"]
  input_text = f"\n\n### Input:\n{input}"

  prompt =  instruction_text + input_text + "\n\n### Response:\n"

  return prompt




def build_prompt(example):
  input_data = format_input(example)

  if example['label'] == 1:
    label = 'positive'
  else:
    label = 'negative'

  output = label
  desired_response = output

  prompt =  input_data + desired_response

  return prompt

In [5]:
example = raw_datasets["valid"][2]
example

{'text': 'This epic brings together a superbly-gifted cast and crew, a narrative depth superior to most novels, wonderful music, philosophy and a connection to LIFE that I find difficult to explain. To immerse oneself in Die Zweite Heimat is for me akin to a spiritual experience, similar to the awe one gets when looking at the stars in a clear night sky. The language, and use of both colour and monochrome segments adds to the dramatic impact. The film inspired me to go to Munich and visit some of the locations, including the Edgar Reitz office. From then on, I vowed to improve my German skills - after Die Zweite Heimat I feel almost German, as if I am in the head of the characters. I also try to match the piano playing of Henry Arnold (Hermann), but this is the one thing that will always elude me ! This drama is unparalleled and I have been fortunate to see it on BBC2 in the UK and SBS in Australia. The sequel, Heimat 3, is currently being filmed in Germany.',
 'label': 1}

In [6]:
prompt = build_prompt(example)
print(prompt)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Determine whether the sentiment of following text is positive or negative

### Input:
This epic brings together a superbly-gifted cast and crew, a narrative depth superior to most novels, wonderful music, philosophy and a connection to LIFE that I find difficult to explain. To immerse oneself in Die Zweite Heimat is for me akin to a spiritual experience, similar to the awe one gets when looking at the stars in a clear night sky. The language, and use of both colour and monochrome segments adds to the dramatic impact. The film inspired me to go to Munich and visit some of the locations, including the Edgar Reitz office. From then on, I vowed to improve my German skills - after Die Zweite Heimat I feel almost German, as if I am in the head of the characters. I also try to match the piano playing of Henry Arnold (Hermann), but this is the one thing that will always e

In [7]:
def tokenize_function(example):
    prompt =  build_prompt(example)
    result = tokenizer(prompt, padding='max_length', truncation=True,max_length=1024) # max_length=1024
    return result

# Use batched=false for easy
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=False, remove_columns=["text", "label"]
)
tokenized_datasets

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:25<00:00, 961.85 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 751.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 25000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [8]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [9]:
prompt

'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDetermine whether the sentiment of following text is positive or negative\n\n### Input:\nThis epic brings together a superbly-gifted cast and crew, a narrative depth superior to most novels, wonderful music, philosophy and a connection to LIFE that I find difficult to explain. To immerse oneself in Die Zweite Heimat is for me akin to a spiritual experience, similar to the awe one gets when looking at the stars in a clear night sky. The language, and use of both colour and monochrome segments adds to the dramatic impact. The film inspired me to go to Munich and visit some of the locations, including the Edgar Reitz office. From then on, I vowed to improve my German skills - after Die Zweite Heimat I feel almost German, as if I am in the head of the characters. I also try to match the piano playing of Henry Arnold (Hermann), but this is the one thing that will a

In [10]:
len(tokenizer.encode(prompt))

272

In [11]:
tokenizer(prompt)

{'input_ids': [21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 257, 2882, 326, 20431, 32543, 262, 2581, 13, 198, 198, 21017, 46486, 25, 198, 35, 2357, 3810, 1771, 262, 15598, 286, 1708, 2420, 318, 3967, 393, 4633, 198, 198, 21017, 23412, 25, 198, 1212, 12191, 6774, 1978, 257, 21840, 306, 12, 70, 21715, 3350, 290, 5462, 11, 257, 8689, 6795, 9098, 284, 749, 16122, 11, 7932, 2647, 11, 8876, 290, 257, 4637, 284, 36821, 326, 314, 1064, 2408, 284, 4727, 13, 1675, 545, 647, 325, 27186, 287, 6733, 1168, 732, 578, 679, 320, 265, 318, 329, 502, 22107, 284, 257, 8557, 1998, 11, 2092, 284, 262, 25030, 530, 3011, 618, 2045, 379, 262, 5788, 287, 257, 1598, 1755, 6766, 13, 383, 3303, 11, 290, 779, 286, 1111, 9568, 290, 937, 5374, 5998, 17894, 6673, 284, 262, 10092, 2928, 13, 383, 2646, 7867, 502, 284, 467, 284, 22418, 290, 3187, 617, 286, 262, 7064, 11, 1390, 262, 29166, 797, 4224, 2607, 13, 3574, 788, 319, 11, 314, 19982, 284, 2987, 616, 2679, 4678, 532, 706, 6733, 1168, 732, 578, 679, 320,

In [12]:
# 创建DataCollator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # 因为GPT2是自回归模型，不需要MLM
)

In [13]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
#model.config.pad_token_id = 50256

In [14]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=1000):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
          # return_attention_mask=True,
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    #max_length=max_output_tokens,
    max_new_tokens=5,
  )

    
  generated_text_with_prompt = tokenizer.decode(generated_tokens_with_prompt[0], skip_special_tokens=True)
  generated_text_answer = generated_text_with_prompt[len(text):]


  return generated_text_answer

# 如果需要进一步清理
def clean_generated_text(text):
    # 去除 'Ġ' 符号并替换为空格
    text = text.replace('Ġ', ' ')
    # 去除多余的空格
    text = ' '.join(text.split())
    return text

In [15]:
label_dict = {1:"positive",
             0:"negative"}

      
input_text = format_input(raw_datasets["valid"][10])

print("input (test):", input_text)
label_id = raw_datasets["valid"][10]["label"]
label = label_dict[label_id]
print("label (test):", label_id, label)

print("--------------------------\n")

print("model's answer: \n")
print(inference(input_text, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input (test): Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Determine whether the sentiment of following text is positive or negative

### Input:
A bare-faced rip-off of Se7en and not fit to clean its shoes. The word 'predictable' must have invented for just such an occasion as this. Lambert is wooden, as always (his moments of 'emotion' are laughable, as is his accent). The 'climax' is not that at all as we've had so many signals, and by the end we're simply immune to flesh, rotting and otherwise. Altogether a real mess.

### Response:

label (test): 0 negative
--------------------------

model's answer: 


A simple, but


In [16]:
training_args = TrainingArguments(
        output_dir='./results',
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_steps=2000,
        save_total_limit=2,
        prediction_loss_only=True,
        fp16=True, #v100没法用
    )


In [17]:
# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator
)

  self.scaler = torch.cuda.amp.GradScaler()


In [18]:
# 开始训练
trainer.train()

  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss
500,3.1498
1000,3.0796
1500,3.0611
2000,3.0505
2500,3.0288
3000,3.0224
3500,2.9617
4000,2.9454
4500,2.9483
5000,2.9311


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


TrainOutput(global_step=9375, training_loss=2.9615108984375, metrics={'train_runtime': 3314.8196, 'train_samples_per_second': 22.626, 'train_steps_per_second': 2.828, 'total_flos': 3.91938048e+16, 'train_loss': 2.9615108984375, 'epoch': 3.0})

In [22]:
save_dir = 'gpt_ft/final'
trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: gpt_ft/final


In [23]:
save_dir = 'gpt_ft/final'
finetuned_model = GPT2LMHeadModel.from_pretrained(save_dir, local_files_only=True)

  return torch.load(checkpoint_file, map_location="cpu")


In [26]:
print("input (test):", input_text)

print("--------------------------\n")

print("model's answer: \n")
print(inference(input_text, finetuned_model, tokenizer))

print("--------------------------\n")
print("real answer: \n")
print(  label_dict[ raw_datasets["valid"][0]["label"] ] )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


input (test): Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Determine whether the sentiment of following text is positive or negative

### Input:
A bare-faced rip-off of Se7en and not fit to clean its shoes. The word 'predictable' must have invented for just such an occasion as this. Lambert is wooden, as always (his moments of 'emotion' are laughable, as is his accent). The 'climax' is not that at all as we've had so many signals, and by the end we're simply immune to flesh, rotting and otherwise. Altogether a real mess.

### Response:

--------------------------

model's answer: 

negative

### Response
--------------------------

real answer: 

negative


In [29]:
test_data = raw_datasets["valid"].shuffle(seed=190).select(range(100))

data_list = []

for entry in test_data:
    input_text = format_input(entry)
    #print(input_text)
    response_text = inference(input_text, finetuned_model, tokenizer)
    #print(response_text)
    data = {
        "instruction": "Determine whether the sentiment of following text is positive or negative",
         "input":entry["text"],
         "output":label_dict[entry["label"]],
        "model_response":response_text
    }

    data_list.append(data)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [30]:
import json

# 定义输出文件路径
output_file = 'gpt2-small3-1024.json'

# 将 Dataset 对象导出为 JSON 文件
# test_data.to_json(output_file)
with open(output_file, "w") as file:
    json.dump(data_list, file, indent=4)  # "indent" for pretty-printing

In [31]:
import json



with open(output_file, "r") as file:
    test_data = json.load(file)

all_num = len(test_data)
right_sum = 0
same_sum = 0
for item in test_data:
    output = item["output"] 
    #output = " ".join(tokenizer.tokenize(output))
    model_response = item["model_response"]
    if model_response == output: #same it
        same_sum = same_sum + 1
        
    if model_response.find(output)!=-1: #find it
        right_sum = right_sum + 1


print("presicion", right_sum/all_num, "same", same_sum/all_num)


presicion 0.86 same 0.0
