In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
!pip install transformers datasets torch
!pip install accelerate -U

import accelerate
import os

os.environ["ACCELERATE_INITIALIZED"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# Load the JSON file
with open('colabdataset.json') as f:
    data = json.load(f)

# Convert data into a Dataset object
questions = [item['question'] for item in data]
answers = [item['answer'] for item in data]
dataset = Dataset.from_dict({'question': questions, 'answer': answers})

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# Tokenize the dataset
def preprocess(examples):
    inputs = [q + tokenizer.eos_token + a for q, a in zip(examples['question'], examples['answer'])]
    model_inputs = tokenizer(inputs, max_length=32, truncation=True, padding='max_length')
    model_inputs['labels'] = model_inputs['input_ids'].copy()
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    save_total_limit=2
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/1164 [00:00<?, ? examples/s]

Step,Training Loss
10,5.1336
20,4.7611
30,3.9022
40,3.164
50,3.0509
60,2.7194
70,2.656
80,2.552
90,2.4732
100,2.4475


TrainOutput(global_step=438, training_loss=2.179734145125298, metrics={'train_runtime': 20.3194, 'train_samples_per_second': 171.855, 'train_steps_per_second': 21.556, 'total_flos': 28514007908352.0, 'train_loss': 2.179734145125298, 'epoch': 3.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# Load the JSON file
with open('colabdataset.json') as f:
    data = json.load(f)

# Convert data into a Dataset object
questions = [item['question'] for item in data]
answers = [item['answer'] for item in data]
dataset = Dataset.from_dict({'question': questions, 'answer': answers})

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# Tokenize the dataset
def preprocess(examples):
    inputs = [q + tokenizer.eos_token + a for q, a in zip(examples['question'], examples['answer'])]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    model_inputs['labels'] = model_inputs['input_ids'].copy()
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./resultados',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    save_total_limit=2
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()




Map:   0%|          | 0/1164 [00:00<?, ? examples/s]

Step,Training Loss
10,10.2787
20,9.8256
30,8.9824
40,7.6739
50,5.4263
60,2.9265
70,1.243
80,0.992
90,0.9448
100,0.8581


TrainOutput(global_step=438, training_loss=1.624899183778458, metrics={'train_runtime': 28.8136, 'train_samples_per_second': 121.193, 'train_steps_per_second': 15.201, 'total_flos': 114056031633408.0, 'train_loss': 1.624899183778458, 'epoch': 3.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer from the original model
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# Define the prompt
prompt = "Q: What is the focal point of a lens?\nA:"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
output = model.generate(
    **inputs,
    max_length=50,  # Set the max length for the generated text
    num_return_sequences=1,  # Number of sequences to generate
    no_repeat_ngram_size=2,  # Prevent repetition of phrases
    temperature=0.7,  # Control the randomness of the text
    top_p=0.9,  # Probability threshold for nucleus sampling
    top_k=50  # Limit the vocabulary
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the focal point of a lens?
A: The focal length of the lens is about 1/2mm. The lens has a focal distance of about 2.5mm, which is a very small amount. It is very


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer from the original model
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('./resultados/checkpoint-400')

# Define the prompt
prompt = "Q: What is the focal point of a lens?\nA:"

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text
output = model.generate(
    **inputs,
    max_length=100,  # Set the max length for the generated text
    num_return_sequences=1,  # Number of sequences to generate
    no_repeat_ngram_size=2,  # Prevent repetition of phrases
    temperature=0.7,  # Control the randomness of the text
    top_p=0.9,  # Probability threshold for nucleus sampling
    top_k=50  # Limit the vocabulary
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the focal point of a lens?
A: It is a focal length of the lens that measures the distance between the two focal points.


In [None]:
!zip -r resultados.zip ./resultados

In [None]:
!zip -r resultados.zip ./resultados/checkpoint-400

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r ./resultados /content/drive/MyDrive/
