In [None]:
import torch

# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise,
# and report which one was chosen.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")
if not gpu_found:
    print("GPU is not available. Using CPU instead.")
else:
    print("GPU is available. Device:", torch.cuda.get_device_name(0))


GPU is available. Device: Tesla T4


In [None]:
# Install pinned versions of the PyTorch stack and the Hugging Face libraries
# (transformers, datasets) that the rest of the notebook imports.
!pip install torch==2.3.1 torchvision==0.18.1+cu121 torchaudio==2.3.1+cu121
!pip install transformers==4.31.0 datasets==2.13.1

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.1)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.3.1)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch==2.3.1)


In [None]:
# Second install pass: pulls torch from the PyTorch wheel index and pins
# transformers/datasets again.
# NOTE(review): these pins (transformers 4.24.0, datasets 2.3.0) differ from the
# previous install cell (4.31.0 / 2.13.1); whichever install runs last wins.
# NOTE(review): Dataset.from_list, used further down, may not exist in
# datasets 2.3.0 — confirm which pair of versions is intended and keep one cell.
!pip install torch==2.3.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install transformers==4.24.0 datasets==2.3.0

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting transformers==4.24.0
  Downloading transformers-4.24.0-py3-none-any.whl.metadata (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.5/90.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.3.0
  Downloading datasets-2.3.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.6 (from datasets==2.3.0)
  Downloading dill-0.3.5.1-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting responses<0.19 (from datasets==2.3.0)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.3.0)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
  Using cached multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.13-py310-none-an

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Mount Google Drive under /content/drive; every data path and the model
# output directory used below live beneath /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import json

In [None]:
# Locations of the source document and the five Q/A files on the mounted Drive.
# Names on the left line up one-to-one with the paths on the right.
(
    paragraph_path,
    paragraph_qa_path,
    spring_qa_path,
    fall_qa_path,
    spring_extras_qa_path,
    fall_extras_qa_path,
) = (
    '/content/drive/MyDrive/DATA/Paragraph.txt',
    '/content/drive/MyDrive/DATA/Paragraph_QA_Pairs.txt',
    '/content/drive/MyDrive/DATA/Spring_QA_Pairs.txt',
    '/content/drive/MyDrive/DATA/Fall_QA_Pairs.txt',
    '/content/drive/MyDrive/DATA/Spring_Extras.txt',
    '/content/drive/MyDrive/DATA/Fall_Extras.txt',
)

In [None]:
def load_qa_pairs(file_path):
    """Load question/answer records from a JSON file.

    Args:
        file_path: Path to a file whose content is a JSON document (the
            notebook's ``*.txt`` inputs are parsed as JSON).

    Returns:
        The decoded JSON value, exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default: the data
    # contains non-ASCII characters (e.g. typographic quotes), which would be
    # mis-decoded or rejected under a non-UTF-8 locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Parse every Q/A file; each name on the left receives the content of the
# path in the same position on the right.
(
    paragraph_qa_pairs,
    spring_qa_pairs,
    fall_qa_pairs,
    spring_extras_qa_pairs,
    fall_extras_qa_pairs,
) = [
    load_qa_pairs(qa_path)
    for qa_path in (
        paragraph_qa_path,
        spring_qa_path,
        fall_qa_path,
        spring_extras_qa_path,
        fall_extras_qa_path,
    )
]

In [None]:
# Read the whole source document into memory. Decode explicitly as UTF-8:
# the text contains typographic quotes, which depend on the platform default
# encoding when none is given.
with open(paragraph_path, 'r', encoding='utf-8') as f:
    paragraph_text = f.read()

In [None]:
# Merge the five Q/A sources into one list; record order follows the operands.
all_qa_pairs = paragraph_qa_pairs + spring_qa_pairs + fall_qa_pairs + spring_extras_qa_pairs + fall_extras_qa_pairs

In [None]:
# Preview up to three records. Slicing (rather than indexing 0..2) avoids an
# IndexError when fewer than three pairs were loaded.
for pair in all_qa_pairs[:3]:
    print(pair)

{'question': 'What is the purpose of Table 1 in the document?', 'answer': "Table 1 gives a general comparative rating of 'burndown' herbicides used in no-till corn and/or soybean production."}
{'question': 'What factors can influence the performance of burndown herbicides?', 'answer': 'Herbicide rate, weed size and stage of growth, and environmental conditions interact to influence herbicide performance.'}
{'question': 'What is the weed control rating for a score of 9?', 'answer': 'A weed control rating of 9 means 90% to 100% control.'}


In [None]:
# Peek at the first 500 characters of the source document.
print(paragraph_text[:500])

Table 1. Weed Response to “Burndown” Herbicides

This table gives a general comparative rating of “burndown” herbicides used in no-till corn and/or soybean production. Under unfavorable conditions, some herbicides may not perform as well as indicated below. Under very favorable conditions, control may be better than indicated. Herbicide rate, weed size and stage of growth, and environmental conditions interact to influence herbicide performance. 

Weed control rating: 
> 9 = 90% to 100% 
> 8 = 8


In [None]:
# Turn each Q/A record into one training string of the form
# "Q: <question>\nA: <answer>", stored under the "prompt" key.
train_data = []
for pair in all_qa_pairs:
    train_data.append({"prompt": f"Q: {pair['question']}\nA: {pair['answer']}"})

In [None]:
# One extra example that pairs the full source text with a generic
# "what is this document about" question.
# NOTE(review): tokenize_function truncates every example to 512 tokens, so if
# paragraph_text is longer than that the trailing Q/A part is cut off and only
# a context fragment is trained on — confirm this is intended.
context_example = {
    "prompt": f"Context: {paragraph_text}\n\nQ: What is this document about?\nA: This document provides information about herbicides and weed control methods."
}

# Guard the append so that re-running this cell does not add a duplicate
# copy of the example to train_data.
if context_example not in train_data:
    train_data.append(context_example)


In [None]:
# Preview up to three training examples. Slicing avoids an IndexError when
# fewer than three examples exist.
for example in train_data[:3]:
    print(example)

{'prompt': "Q: What is the purpose of Table 1 in the document?\nA: Table 1 gives a general comparative rating of 'burndown' herbicides used in no-till corn and/or soybean production."}
{'prompt': 'Q: What factors can influence the performance of burndown herbicides?\nA: Herbicide rate, weed size and stage of growth, and environmental conditions interact to influence herbicide performance.'}
{'prompt': 'Q: What is the weed control rating for a score of 9?\nA: A weed control rating of 9 means 90% to 100% control.'}


In [None]:
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

In [None]:
# Load the tokenizer for the same "gpt2" checkpoint that the model uses below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")



In [None]:
# Reuse the end-of-sequence token as the padding token so that
# padding="max_length" in tokenize_function has a pad token to use
# (presumably this tokenizer defines none by default — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Wrap the list of {"prompt": ...} dicts in a Dataset so it can be tokenized
# with .map() and handed to the Trainer.
dataset = Dataset.from_list(train_data)

In [None]:
def tokenize_function(examples):
    """Tokenize prompts for causal-LM fine-tuning and build loss labels.

    Each prompt gets the tokenizer's EOS token appended, is padded/truncated
    to 512 tokens, and receives a ``labels`` sequence equal to its
    ``input_ids`` with every padding position replaced by ``-100`` so that
    padding is excluded from the loss.

    Args:
        examples: Mapping with a ``"prompt"`` entry that is either a list of
            strings (``Dataset.map(..., batched=True)``) or a single string
            (non-batched ``map``).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For batched
        input every entry is a list of per-example lists; for a single string
        every entry is one flat list.
    """
    prompts = examples["prompt"]
    is_single = isinstance(prompts, str)
    texts = [prompts] if is_single else list(prompts)

    # Append EOS so the model sees an explicit end-of-answer token. Padding
    # reuses the same token id, so without a real (attended) EOS in the labels
    # the model would never be trained to stop.
    texts = [text + tokenizer.eos_token for text in texts]

    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    # Mask per token using the attention mask. In batched mode input_ids is a
    # list of lists, so comparing each element to pad_token_id (as a flat loop
    # would) compares a list with an int and never masks anything. Using the
    # attention mask also keeps the appended EOS, which shares the pad id.
    labels = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(inputs["input_ids"], inputs["attention_mask"])
    ]

    if is_single:
        # Non-batched map expects one flat sequence per column.
        inputs = {key: value[0] for key, value in inputs.items()}
        labels = labels[0]

    inputs["labels"] = labels
    return inputs

In [None]:
# Tokenize the whole dataset. With batched=True, tokenize_function receives
# many prompts per call (examples["prompt"] is a list of strings).
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

In [None]:
# Show the resulting columns and row count as a sanity check.
print(tokenized_datasets)

Dataset({
    features: ['prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 559
})


In [None]:
# Fine-tuning configuration consumed by the Trainer.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    # Checkpoints are written once per epoch; a save_steps value would be
    # ignored under this strategy, so none is set.
    save_strategy="epoch",
    # Log once per epoch so the "Training Loss" column is populated; the run is
    # only a few dozen optimizer steps, fewer than the default step-based
    # logging interval, which is why it otherwise shows "No log".
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # Mixed precision requires a CUDA device; fall back to full precision on
    # the CPU path that the device-selection cell explicitly allows.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    save_total_limit=2,
)

In [None]:
# Load the pretrained GPT-2 language-model head checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Size the embedding matrix to the tokenizer's vocabulary. No new tokens were
# added above (the pad token reuses EOS), so this is presumably a no-op here;
# the returned embedding module is displayed as the cell result.
model.resize_token_embeddings(len(tokenizer))



Embedding(50257, 768)

In [None]:
# Hold out a slice of the data for evaluation. Evaluating on the very examples
# used for training makes "Validation Loss" a measure of memorisation rather
# than generalisation. The fixed seed keeps the split reproducible.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

In [None]:
# Run fine-tuning with the settings in training_args; the returned
# TrainOutput (step count, loss, runtime metrics) is shown as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.286196
2,No log,0.203852


TrainOutput(global_step=70, training_loss=1.5420188903808594, metrics={'train_runtime': 114.1098, 'train_samples_per_second': 9.798, 'train_steps_per_second': 0.613, 'total_flos': 292124491776000.0, 'train_loss': 1.5420188903808594, 'epoch': 2.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same Drive
# directory so both can be found together later.
model.save_pretrained("/content/drive/MyDrive/DATA/fine-tuned-gpt2")
tokenizer.save_pretrained("/content/drive/MyDrive/DATA/fine-tuned-gpt2")

('/content/drive/MyDrive/DATA/fine-tuned-gpt2/tokenizer_config.json',
 '/content/drive/MyDrive/DATA/fine-tuned-gpt2/special_tokens_map.json',
 '/content/drive/MyDrive/DATA/fine-tuned-gpt2/vocab.json',
 '/content/drive/MyDrive/DATA/fine-tuned-gpt2/merges.txt',
 '/content/drive/MyDrive/DATA/fine-tuned-gpt2/added_tokens.json')

In [None]:
def generate_response(prompt):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to condition generation on.

    Returns:
        The decoded output sequence as a string with special tokens removed.
        The sequence starts with the prompt tokens, so the returned text
        includes the prompt followed by the generated continuation.
    """
    # Use the tokenizer call API to obtain the attention mask as well. The pad
    # token is the EOS token in this notebook, so generate() cannot infer the
    # mask from the ids alone. Send the tensors to wherever the model lives.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=512,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # top_k / top_p / temperature only take effect when sampling is
        # enabled; without do_sample=True decoding is greedy and they are
        # ignored.
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
from IPython.display import display, HTML
import ipywidgets as widgets

In [None]:
# Editable box where the user types a question.
_input_box_options = {
    'placeholder': 'Type your question here...',
    'description': 'question:',
    'disabled': False,
}
input_box = widgets.Textarea(**_input_box_options)

# Read-only box that shows the model's reply.
_output_box_options = {
    'placeholder': 'Bot response will appear here...',
    'description': 'answer:',
    'disabled': True,
}
output_box = widgets.Textarea(**_output_box_options)

In [None]:
# Button labelled "Ask"; its click handler is registered in a later cell.
button = widgets.Button(description="Ask")

In [None]:
def on_button_click(b):
    """Answer the question in ``input_box`` and show it in ``output_box``.

    Args:
        b: The object passed by the click callback; unused.

    The question is wrapped in the same ``Q: ...\\nA:`` template the training
    examples use, and the echoed prompt is removed from the generated text so
    only the answer is displayed.
    """
    question = input_box.value.strip()
    if not question:
        # Nothing to generate from; avoid sending an empty prompt to the model.
        output_box.value = "Please enter a question."
        return

    # Match the training format ("Q: <question>\nA: <answer>") so the model is
    # prompted the way it was fine-tuned.
    prompt = f"Q: {question}\nA:"
    response = generate_response(prompt)

    # generate_response returns prompt + continuation; show the answer only.
    if response.startswith(prompt):
        response = response[len(prompt):]
    output_box.value = response.strip()

In [None]:
# Register the handler so that clicking "Ask" runs on_button_click.
button.on_click(on_button_click)

In [None]:
# Render the question box, the Ask button and the answer box, in that order.
display(input_box, button, output_box)

Textarea(value='', description='question:', placeholder='Type your question here...')

Button(description='Ask', style=ButtonStyle())

Textarea(value='', description='answer:', disabled=True, placeholder='Bot response will appear here...')