In [2]:

# ✅ INSTALL DEPENDENCIES
!pip install kaggle
!pip install transformers datasets --quiet

# ✅ IMPORT LIBRARIES
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from google.colab import files

# ✅ UPLOAD YOUR KAGGLE API KEY (kaggle.json)
from google.colab import files
print("Upload your kaggle.json API file...")
files.upload()

# ✅ CONFIGURE KAGGLE API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# ✅ DOWNLOAD & UNZIP THE DATASET
!kaggle datasets download -d paultimothymooney/poetry
!unzip poetry.zip


# ✅ DOWNLOAD LYRICS DATASET FROM KAGGLE
!kaggle datasets download -d paultimothymooney/poetry
!unzip -o poetry.zip

os.listdir()

# ✅ CHOOSE A FILE FROM THE UNZIPPED FOLDERa
lyrics_file = "Kanye_West.txt"  # or replace with your own `.txt` if uploaded manually



# ✅ LOAD TOKENIZER AND MODEL
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT2 doesn't have a pad token by default
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# ✅ CREATE DATASET FOR TRAINING
def create_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = create_dataset(lyrics_file, tokenizer)


# ✅ COLLATOR TO HANDLE MASKING
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM
)

# ✅ TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir="./gpt2-lyrics",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,
)

# ✅ TRAINER SETUP
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

# ✅ TRAIN THE MODEL
trainer.train()

# ✅ SAVE THE MODEL
trainer.save_model("./gpt2-lyrics")
tokenizer.save_pretrained("./gpt2-lyrics")

# ✅ TEXT GENERATION FUNCTION
def generate_lyrics(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ✅ SAMPLE LYRIC GENERATION
prompts = ["heartless", "I Wonder", "Runaway"]
for prompt in prompts:
    print(f"\n🎵 Prompt: {prompt}")
    print(generate_lyrics(prompt))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvid

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/paultimothymooney/poetry
License(s): CC0-1.0
Archive:  poetry.zip
  inflating: Kanye_West.txt          
  inflating: Lil_Wayne.txt           
  inflating: adele.txt               
  inflating: al-green.txt            
  inflating: alicia-keys.txt         
  inflating: amy-winehouse.txt       
  inflating: beatles.txt             
  inflating: bieber.txt              
  inflating: bjork.txt               
  inflating: blink-182.txt           
  inflating: bob-dylan.txt           
  inflating: bob-marley.txt          
  inflating: britney-spears.txt      
  inflating: bruce-springsteen.txt   
  inflating: bruno-mars.txt          
  inflating: cake.txt                
  inflating: dickinson.txt           
  inflating: disney.txt              
  inflating: dj-khaled.txt           
  inflating: dolly-parton.txt        
  inflating: dr-seuss.txt            
  inflating: drake.txt               
  inflating: eminem

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmrinalreddy3691[0m ([33mmrinalreddy3691-cbit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,4.5794
200,4.4909
300,4.2509
400,4.0657
500,3.9915
600,3.8419
700,3.8178


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



🎵 Prompt: heartless
heartless
Cause she just wanna be on her niggas ass
And I know she cannt do it all right
Like a white man she cant even fly
Nothin on the white man
But she was like Ye in that movie
So she thought you look like Ye
And you wouldnt try to be Ye
But she couldnt give up everything that she has
Aint she supposed to be Ye
But she just wanted to play the game like I had it

🎵 Prompt: I Wonder
I Wonder what the deal with these babies
If the only girls with black money
They think my money is fake
I still use em when I need them
And guess what they got a problem with
So they try to buy my clothes
I dont care when they want us on them
I cant really complain
Im on top of shit like an electric car
I just got a nigga thats from New York
And they dont really know where I got them the fuck they doin

🎵 Prompt: Runaway
Runawayaway
Theres a lot of black people up in the street
Black people up in the street
And you aint heard about this black people up in the streets
Aint that some n