In [1]:
!pip install transformers



In [25]:

import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'str'>


In [None]:
question = '''What is Machine Learning?'''

paragraph = ''' Machine learning (ML) is the scientific study of algorithms and statistical models that computer systems use to progressively improve their performance 
                on a specific task. Machine learning algorithms build a mathematical model of sample data, known as "training data", in order to make predictions or 
                decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of email filtering, detection 
                of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning 
                is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, 
                theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory 
                data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics. '''
            
encoding = tokenizer.encode_plus(text=question,text_pair=paragraph)

inputs = encoding['input_ids']  #Token embeddings
sentence_embedding = encoding['token_type_ids']  #Segment embeddings
tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens
start_scores, end_scores = model(torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
print(type(start_scores))

In [19]:
start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
print(type(start_scores))

# 如果它不是 torch.Tensor，嘗試將其轉換為 torch.Tensor
# if not isinstance(start_scores, torch.Tensor):
#     start_scores = torch.tensor(start_scores)

# # 再次檢查 start_scores 的類型
# print(type(start_scores))

# # 最後檢查數據類型
# print(start_scores.dtype)

start_logits
<class 'str'>


In [26]:
start_index = torch.argmax(start_scores)

end_index = torch.argmax(end_scores)

answer = ' '.join(tokens[start_index:end_index+1])

TypeError: argmax(): argument 'input' (position 1) must be Tensor, not str

In [None]:

corrected_answer = ''

for word in answer.split():
    
    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word

print(corrected_answer)

#重新做一個新的訓練模型

In [30]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

# 定義一個簡單的資料集
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {'text': self.texts[idx], 'label': self.labels[idx]}

# 創建一個小型的訓練資料集
texts = ["這是一個正面的例子", "這是一個負面的例子", "這是另一個正面的例子", "這是另一個負面的例子"]
labels = [1, 0, 1, 0]

train_dataset = CustomDataset(texts, labels)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# 初始化BertTokenizer和BertForSequenceClassification模型
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# 定義訓練參數
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

# 訓練模型
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs = tokenizer(batch['text'], return_tensors='pt', padding=True, truncation=True)
        labels = torch.tensor(batch['label']).unsqueeze(0).to(device)

        inputs.to(device)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# 保存訓練好的模型
model.save_pretrained('your_model_directory')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(batch['label']).to(device)  # 直接移動到 GPU 上


用GPT-2嘗試finetuned

In [13]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='./gpt2-finetuned')
set_seed(42)
generator("你好嗎?", max_length=30, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': '你好嗎?       11/7/2014 17:36:48 T:3486 DEBUG:'},
 {'generated_text': '你好嗎?                   None  \n'},
 {'generated_text': '你好嗎?\n                     '},
 {'generated_text': '你好嗎?\n                     '},
 {'generated_text': '你好嗎? 你好嗎?             '}]

In [7]:
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
text = "How are you"
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# 載入預訓練模型和分詞器
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 載入並分詞您的自定義數據集
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="pythonTrainingData_w3schools.txt",  # 替換為您的訓練數據集的路徑
    block_size=128  # 根據您的數據集調整塊大小
)

# 設置訓練參數
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# 初始化Trainer並fine-tuning模型
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
)

# 開始訓練
trainer.train()

model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

100%|██████████| 406/406 [29:01<00:00,  4.29s/it]


{'train_runtime': 1741.9966, 'train_samples_per_second': 0.932, 'train_steps_per_second': 0.233, 'train_loss': 2.3571622500865916, 'epoch': 1.0}


('./gpt2-finetuned\\tokenizer_config.json',
 './gpt2-finetuned\\special_tokens_map.json',
 './gpt2-finetuned\\vocab.json',
 './gpt2-finetuned\\merges.txt',
 './gpt2-finetuned\\added_tokens.json')

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, set_seed

# 載入預訓練模型和分詞器
model_name = './gpt2-finetuned'  # 替換為你的微調模型的路徑
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 設定隨機種子
set_seed(42)

# 生成問題
question_prompt = "How can I print Hello World in Python?"
question_input_ids = tokenizer.encode(question_prompt, return_tensors='tf')

# 生成答案
answer_output = model.generate(
    question_input_ids,
    max_length=50,  # 調整生成答案的最大長度
    num_return_sequences=5,
    no_repeat_ngram_size=2,
    top_k=50,
    temperature=0.7,
    do_sample=True
)

# 解碼並打印生成的答案
for i, answer_ids in enumerate(answer_output):
    generated_answer = tokenizer.decode(answer_ids, skip_special_tokens=True)
    print(f"Generated Answer {i+1}: {generated_answer}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()