In [None]:
!pip install transformers datasets accelerate




In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

# Load the dataset
dataset = load_dataset("csv", data_files="/content/final_cleaned_quotes.csv")  # Adjust path as needed

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# Assign eos_token as padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example["quote"], truncation=True, padding="max_length", max_length=50)

tokenized_dataset = dataset.map(tokenize_function, batched=True)



Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Map:   0%|          | 0/265 [00:00<?, ? examples/s]

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
model = GPT2LMHeadModel.from_pretrained("distilgpt2")

# Define padding token
tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

# Load your dataset
dataset = load_dataset("csv", data_files="/content/final_cleaned_quotes.csv")

# Tokenization function with labels for language modeling
def tokenize_function(example):
    # Tokenize the quote
    encodings = tokenizer(example["quote"], truncation=True, padding="max_length", max_length=50)
    # Set the labels to be the same as the input_ids for language modeling
    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",         # Directory to save model outputs
    num_train_epochs=3,             # Number of training epochs
    per_device_train_batch_size=2,  # Batch size for training
    save_steps=10,                  # Save checkpoint every 10 steps
    save_total_limit=1,             # Keep only the latest checkpoint
    logging_dir="./logs",           # Log directory for TensorBoard
    logging_steps=10,               # Log every 10 steps
)

# Setup Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],  # Use tokenized train dataset
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("/content/small-motivator")
tokenizer.save_pretrained("/content/small-motivator")


Map:   0%|          | 0/265 [00:00<?, ? examples/s]

Step,Training Loss
10,2.7333
20,1.603
30,1.4368
40,1.5415
50,1.5833
60,1.4257
70,1.5744
80,1.8863
90,1.4887
100,1.5402


('/content/small-motivator/tokenizer_config.json',
 '/content/small-motivator/special_tokens_map.json',
 '/content/small-motivator/vocab.json',
 '/content/small-motivator/merges.txt',
 '/content/small-motivator/added_tokens.json')

In [5]:
from transformers import pipeline, GPT2LMHeadModel , GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/small-motivator-model-v2")
tokenizer = GPT2Tokenizer.from_pretrained("/content/drive/MyDrive/small-motivator-model-v2")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_quote(mood):
    prompt = f"Motivational quote for someone feeling {mood}:"
    return generator(prompt, max_length=50, num_return_sequences=1)[0]["generated_text"]

print(generate_quote("happy"))
print(generate_quote("sad"))
print(generate_quote("anxious"))

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Motivational quote for someone feeling happy: "If you love your life, you should follow that path not by giving up. Failure is a weapon to keep going. You need to learn to be optimistic. Your life should be like that of your
Motivational quote for someone feeling sad: ‘Make the next step. You don’t have to go on feeling sad. You can’t let the sadness escape you. It can’t help you.
Motivational quote for someone feeling anxious: it all comes from a feeling of belonging to yourself.


In [6]:
import transformers
print(transformers.__version__)

4.46.3


In [None]:
import shutil

# Define source and destination paths
source_folder = "/content/small-motivator"  # The folder where your model is saved
destination_folder = "/content/drive/MyDrive/small-motivator-model"

# Remove the existing folder
import shutil
# Define a new folder with a unique name
destination_folder = "/content/drive/My Drive/small-motivator-model-v2"

# Copy the files to the new folder
shutil.copytree(source_folder, destination_folder)
print(f"Model files copied to {destination_folder}")




Model files copied to /content/drive/My Drive/small-motivator-model-v2


In [None]:
import os

# List files in the destination folder
files = os.listdir(destination_folder)
print(f"Files in {destination_folder}: {files}")


Files in /content/drive/My Drive/small-motivator-model-v2: ['merges.txt', 'model.safetensors', 'generation_config.json', 'vocab.json', 'special_tokens_map.json', 'config.json', 'tokenizer_config.json']


In [None]:
shutil.make_archive("/content/small-motivator", 'zip', source_folder)
from google.colab import files
files.download("/content/small-motivator.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>