In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets
!pip install transformers
!pip install torch



In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

model_name = 'microsoft/DialoGPT-medium'

# Cargar el conjunto de datos desde Hugging Face
dataset = load_dataset('ArtifactAI/arxiv-physics-instruct-tune-30k')

# Cargar el tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Añadir un token de padding si no existe
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

print(dataset['train'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

{'question': 'Which integral prime represents the least weight?', 'answer': 'The least weight of an integral prime is 2. This is because 2 is the smallest prime number and also happens to be the only even prime number.'}


In [4]:
# Preprocesar los datos
def preprocess_function(examples):
    inputs = ["question: " + q + " answer: " + a for q, a in zip(examples['question'], examples['answer'])]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

In [5]:
# Definir el modelo
model = AutoModelForCausalLM.from_pretrained(model_name)

# Añadir los tokens especiales al modelo
model.resize_token_embeddings(len(tokenizer))

# Tokenizar el conjunto de datos
tokenized_datasets = dataset.map(preprocess_function, batched=True)

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/30231 [00:00<?, ? examples/s]

In [6]:
# Dividir el conjunto de datos en entrenamiento y validación
train_test_split = tokenized_datasets['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [7]:
# Añadir los tokens especiales al modelo
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 1024)

In [8]:
# Asegurarse de que se está utilizando la GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50258, bias=False)
)

In [9]:
# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/results',
    evaluation_strategy="steps",
    learning_rate=2e-4,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=1,
    weight_decay=0.01,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',  # Directorio para los logs de TensorBoard
    logging_steps=100,  # Frecuencia de los logs
    report_to="none",  # Deshabilitar reporte a WandB o TensorBoard si no es necesario
)

# Crear el entrenador
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [10]:
# Entrenar el modelo
try:
    trainer.train()
except Exception as e:
    print(f"Ocurrió un error durante el entrenamiento: {e}")

Step,Training Loss,Validation Loss
100,0.9559,0.660415
200,0.6769,0.620499
300,0.6561,0.602386
400,0.6334,0.589508
500,0.6135,0.580184
600,0.5915,0.570479
700,0.5982,0.564103
800,0.5952,0.559452
900,0.5668,0.553708
1000,0.584,0.54632


In [11]:
# Guardar el modelo
model.save_pretrained('/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model')
tokenizer.save_pretrained('/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model')

('/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/tokenizer_config.json',
 '/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/special_tokens_map.json',
 '/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/vocab.json',
 '/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/merges.txt',
 '/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/added_tokens.json',
 '/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model/tokenizer.json')

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Cargar el modelo y el tokenizer ajustados
model = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model')
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/Fine_tunning/Dialo_GPT_medium/fine-tuned-model')

# Configurar el token de padding si no está definido
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Función para generar respuestas
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}  # Mover inputs a la GPU si es necesario
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.pad_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response



In [16]:
# Probar el modelo
prompt = "que es la interaccion nuclear debil?"
response = generate_response(prompt)
print(response)

que es la interaccion nuclear debil? No se que la interaccion nuclear de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la interaccion de la
