In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    AdamW,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from tqdm import tqdm
import logging






In [None]:
# Set up logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [None]:

# Set random seed for reproducibility
def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

set_seed()

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Hyperparameters
MODEL_NAME = "gpt2"  # Options: "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"
MAX_LENGTH = 1024    # Maximum context window size for GPT-2
BATCH_SIZE = 4       # Batch size for training
EPOCHS = 3           # Number of training epochs
LEARNING_RATE = 3e-5 # Learning rate
WARMUP_STEPS = 500   # Number of warmup steps for learning rate scheduler
GRADIENT_ACCUMULATION_STEPS = 2  # Number of update steps to accumulate before performing a backward/update pass


In [None]:


# Custom dataset class for pre-tokenized data without padding
class RawTokenDataset(Dataset):
    def __init__(self, file_path):
        self.examples = []

        logger.info(f"Loading pre-tokenized data from {file_path}")

        # Load all token sequences from file
        # Assuming each line contains a sequence of space-separated token IDs
        with open(file_path, 'r') as f:
            for line in f:
                # Convert token IDs to integers
                token_ids = [int(token) for token in line.strip().split()]
                self.examples.append(token_ids)

        logger.info(f"Loaded {len(self.examples)} sequences")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Simply return the token IDs as input_ids
        return {"input_ids": torch.tensor(self.examples[idx])}

In [None]:
# Custom collator that handles variable-length sequences without padding
class DynamicCollator:
    def __init__(self, model_name):
        # Load tokenizer just to get special token IDs
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        if not hasattr(self.tokenizer, 'pad_token') or self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.pad_token_id = self.tokenizer.pad_token_id

    def __call__(self, examples):
        # Find the max length in this batch
        batch_max_length = max(len(example["input_ids"]) for example in examples)

        # Prepare batch tensors
        input_ids_batch = []
        attention_mask_batch = []
        labels_batch = []

        for example in examples:
            input_ids = example["input_ids"]
            input_len = len(input_ids)

            # Create attention mask (1 for real tokens, 0 for padding)
            attention_mask = torch.ones(batch_max_length, dtype=torch.long)

            # Pad input_ids if needed
            if input_len < batch_max_length:
                # Create padded input_ids tensor
                padded_input_ids = torch.cat([
                    input_ids,
                    torch.full((batch_max_length - input_len,), self.pad_token_id, dtype=torch.long)
                ])

                # Update attention mask for padding
                attention_mask[input_len:] = 0
            else:
                padded_input_ids = input_ids

            # For labels, we'll use -100 for padding tokens (HuggingFace ignores these in loss calculation)
            labels = padded_input_ids.clone()
            if input_len < batch_max_length:
                labels[input_len:] = -100

            input_ids_batch.append(padded_input_ids)
            attention_mask_batch.append(attention_mask)
            labels_batch.append(labels)

        return {
            "input_ids": torch.stack(input_ids_batch),
            "attention_mask": torch.stack(attention_mask_batch),
            "labels": torch.stack(labels_batch)
        }

# Load tokenizer and model
def get_tokenizer():
    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    # We still define a pad token for the tokenizer API, but won't use it
    if not hasattr(tokenizer, 'pad_token') or tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def get_model():
    model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    model.to(device)
    return model

# Compute perplexity for evaluation
def compute_perplexity(model, eval_dataloader):
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Calculate number of tokens (all tokens are real, no padding)
            num_tokens = attention_mask.sum().item()

            # Sum up batch loss
            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    # Calculate perplexity
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()

    return perplexity

# Main training function
def train():
    tokenizer = get_tokenizer()
    model = get_model()

    # Load pre-tokenized datasets
    train_dataset = RawTokenDataset("fixed_train.txt")
    val_dataset = RawTokenDataset("fixed_test.txt")

    logger.info(f"Train dataset size: {len(train_dataset)}")
    logger.info(f"Validation dataset size: {len(val_dataset)}")

    # Create data collator
    data_collator = DynamicCollator(MODEL_NAME)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        warmup_steps=WARMUP_STEPS,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),  # Use mixed precision training if available
        dataloader_drop_last=True,      # Drop the last incomplete batch
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    # Train the model
    logger.info("Starting training")
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained("./final_model")
    tokenizer.save_pretrained("./final_model")

    # Compute final perplexity
    eval_dataloader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator
    )
    perplexity = compute_perplexity(model, eval_dataloader)
    logger.info(f"Final perplexity: {perplexity:.2f}")

    return model, tokenizer

# Generate some sample text
def generate_sample(model, tokenizer, prompt="The music", max_length=200):
    model.eval()

    # First tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text using the maximum length possible
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

if __name__ == "__main__":
    # Train the model
    model, tokenizer = train()

    # Generate some samples
    print("\nGenerated samples:")
    print("-" * 40)
    prompts = ["The music", "I feel", "Love is", "Tonight we"]

    for prompt in prompts:
        generated_text = generate_sample(model, tokenizer, prompt)
        print(f"Prompt: {prompt}")
        print(f"Generated: {generated_text}")
        print("-" * 40)



Epoch,Training Loss,Validation Loss
1,3.0107,2.474013
2,2.8766,2.40895


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
Evaluating: 100%|██████████| 211/211 [00:34<00:00,  6.04it/s]
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated samples:
----------------------------------------
Prompt: The music
Generated: The music is not loud but it's got a little bit of drama to me when you're standing here and i'm feeling like there ain't no way your eyes are gonna catch up with the words  pre my name isn 'bout what we gon' do in six months if she sayin that they can stay on this earth while everybody else think about us how could something as simple as love make sense for somebody so hard gotta see through all these walls tryna get down donít let go just wait til everything changes tell yourself nothing will ever change before one day people who'd never know better come along boy oh baby every single moment has been taken away from ya girl yeah hey now where am I supposedto start cause nobody knows more than them girls huh ehuh uhohhh hmmhmmm nope why dont be jealous please listen ima keep calling yo some bad ass bitch even though its true hes talking trash too much hate em ill find out someday soonhe'll have h

# **Creating text file**

In [None]:
import pandas as pd
import ast
import os

# Read the CSV file
print("Reading CSV file...")
df = pd.read_csv('combined_artists_tokenized.csv')

# Check if Tokenized_Lyrics column exists
if 'Tokenized_Lyrics' not in df.columns:
    print("Column names in file:", df.columns.tolist())
    raise ValueError("The column 'Tokenized_Lyrics' was not found in the CSV file.")

print(f"Found {len(df)} rows in the CSV file.")

# Create an output file to store all tokens
output_file = 'all_tokens.txt'
print(f"Converting tokens to text file: {output_file}...")

# Process each row and write to the output file
with open(output_file, 'w', encoding='utf-8') as f:
    # Counter for monitoring progress
    total_tokens = 0

    for i, row in enumerate(df.itertuples(), 1):
        # Get the tokenized lyrics - handle different formats
        try:
            # If stored as string representation of list
            if isinstance(row.Tokenized_Lyrics, str):
                if row.Tokenized_Lyrics.startswith('[') and row.Tokenized_Lyrics.endswith(']'):
                    tokens = ast.literal_eval(row.Tokenized_Lyrics)
                else:
                    # If it's just a string of space-separated tokens
                    tokens = row.Tokenized_Lyrics.split()
            # If already a list
            elif isinstance(row.Tokenized_Lyrics, list):
                tokens = row.Tokenized_Lyrics
            else:
                print(f"Skipping row {i} - unexpected format: {type(row.Tokenized_Lyrics)}")
                continue

            # Convert all tokens to strings and join with spaces
            token_strings = [str(token) for token in tokens]
            f.write(' '.join(token_strings) + '\n')

            # Update counter
            total_tokens += len(tokens)

            # Print progress every 1000 rows
            if i % 1000 == 0:
                print(f"Processed {i} rows ({total_tokens} tokens so far)...")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Print file stats
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"\nDone! Created {output_file} with {total_tokens} total tokens.")
print(f"File size: {file_size_mb:.2f} MB")

Reading CSV file...
Found 6027 rows in the CSV file.
Converting tokens to text file: all_tokens.txt...
Processed 1000 rows (465794 tokens so far)...
Processed 2000 rows (1137968 tokens so far)...
Processed 3000 rows (1591275 tokens so far)...
Processed 4000 rows (2003674 tokens so far)...
Processed 5000 rows (2317327 tokens so far)...
Processed 6000 rows (2702901 tokens so far)...

Done! Created all_tokens.txt with 2712405 total tokens.
File size: 11.63 MB


In [None]:
!zip -r final_model.zip final_model/

  adding: final_model/ (stored 0%)
  adding: final_model/model.safetensors (deflated 7%)
  adding: final_model/tokenizer_config.json (deflated 56%)
  adding: final_model/vocab.json (deflated 68%)
  adding: final_model/generation_config.json (deflated 24%)
  adding: final_model/merges.txt (deflated 53%)
  adding: final_model/config.json (deflated 51%)
  adding: final_model/special_tokens_map.json (deflated 74%)


In [None]:

def fix_token_file(input_file, output_file):
    """
    Reads the input file with space-separated tokens and reformats it properly.

    Args:
        input_file: Path to input file with space-separated tokens
        output_file: Path to save the reformatted data
    """
    logger.info(f"Processing {input_file} -> {output_file}")

    # Read the input file
    with open(input_file, 'r') as f:
        content = f.read()

    # Clean and parse tokens
    tokens = [token for token in content.split() if token.strip()]

    try:
        # Convert to integers to verify format
        token_ids = [int(token) for token in tokens]
        logger.info(f"Successfully parsed {len(token_ids)} tokens from {input_file}")

        # Create context windows of appropriate size for GPT-2
        max_seq_len = 1024  # GPT-2 context window
        contexts = []

        # Create sequences with stride
        stride = 512  # 50% overlap
        for i in range(0, len(token_ids) - max_seq_len + 1, stride):
            end_idx = min(i + max_seq_len, len(token_ids))
            contexts.append(token_ids[i:end_idx])

        # If there's a remainder and it's not too small, add it as the last context
        if len(token_ids) % stride > 100:  # Only add if it's a substantial chunk
            contexts.append(token_ids[-(len(token_ids) % stride):])

        logger.info(f"Created {len(contexts)} sequences of maximum length {max_seq_len}")

        # Write to output file - one sequence per line
        with open(output_file, 'w') as f:
            for context in contexts:
                f.write(' '.join(map(str, context)) + '\n')

        logger.info(f"Saved {len(contexts)} sequences to {output_file}")
        return True

    except ValueError as e:
        logger.error(f"Error converting tokens to integers: {e}")
        logger.error(f"First 10 tokens: {tokens[:10]}")
        return False

if __name__ == "__main__":
    # Fix both train and test files
    train_success = fix_token_file("train.txt", "fixed_train.txt")
    test_success = fix_token_file("test.txt", "fixed_test.txt")

    if train_success and test_success:
        logger.info("Successfully reformatted both train.txt and test.txt")
        logger.info("Use fixed_train.txt and fixed_test.txt with your model")
    else:
        logger.error("Failed to process one or both files")

# **GENERATING MUSIC LYRICS**

In [3]:
# Replace 'your_file.zip' with your zip file name
!unzip final_model.zip -d ./

# List the extracted files
!ls -la ./

Archive:  final_model.zip
   creating: ./final_model/
  inflating: ./final_model/model.safetensors  
  inflating: ./final_model/tokenizer_config.json  
  inflating: ./final_model/vocab.json  
  inflating: ./final_model/generation_config.json  
  inflating: ./final_model/merges.txt  
  inflating: ./final_model/config.json  
  inflating: ./final_model/special_tokens_map.json  
total 452060
drwxr-xr-x 1 root root      4096 Feb 27 19:20 .
drwxr-xr-x 1 root root      4096 Feb 27 19:15 ..
drwxr-xr-x 4 root root      4096 Feb 26 18:27 .config
drwxr-xr-x 2 root root      4096 Feb 27 08:26 final_model
-rw-r--r-- 1 root root 462881440 Feb 27 19:20 final_model.zip
drwxr-xr-x 1 root root      4096 Feb 26 18:27 sample_data


In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [7]:
model_path = "./final_model"  # Update this path to where your model is

# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)


In [9]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [13]:
def generate_text(prompt, max_length=100):
    # Encode the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.8,  # Controls randomness (higher = more random)
        top_k=50,         # Controls diversity
        top_p=0.95,       # Nucleus sampling
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True    # Use sampling instead of greedy decoding
    )

    # Decode the generated output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example usage


import time

print("Enter song lyrics and I'll continue them. Type 'quit' to exit.")
while True:
    prompt = input("\nEnter your music lines: ")

    if prompt.lower() == 'quit':
        print("Goodbye!")
        break

    generated_text = generate_text(prompt)



    # Print character by character with delay for typewriter effect
    for char in generated_text:
        print(char, end='', flush=True)  # flush=True ensures it prints immediately
        time.sleep(0.03)  # 30 milliseconds delay between characters

    print()  # Add a newline at the end
    # Print just the newly generated part (excluding the prompt)


Enter song lyrics and I'll continue them. Type 'quit' to exit.

Enter your music linesq: i got her flowers
i got her flowers we used to make it and i had my heart on the edge of town so come with me uh  pre cause you can be crazy that's what they say baby don't even try but if somebody ever decides there are no other options then oh hey yeah now get off your feet just put 'em down okay all good for ya look at these little bitches how'd yo love themselves girl this could happen right here in front ooh hello goooooooohooyuhh


KeyboardInterrupt: Interrupted by user