In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load dataset in chunks
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path
chunksize = 10000  # Adjust chunk size as needed

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Fit tokenizer on the entire dataset in chunks
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    tokenizer.fit_on_texts(chunk['text_with_artist'])

# Function to process a chunk and yield padded sequences
def process_chunk(chunk):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
    input_sequences = []
    for seq in sequences:
        for i in range(1, len(seq)):
            n_gram_sequence = seq[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

# Function to yield batches of padded sequences and targets
def data_generator(file_path, chunksize, max_sequence_len, batch_size):
    while True:
        for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
            input_sequences = process_chunk(chunk)
            input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
            targets = input_sequences[:, -1]
            input_sequences = input_sequences[:, :-1]
            for start in range(0, len(input_sequences), batch_size):
                end = min(start + batch_size, len(input_sequences))
                yield input_sequences[start:end], targets[start:end]

# Determine max sequence length by processing a small sample of the dataset
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_sequences = process_chunk(sample_chunk)
max_sequence_len = max([len(x) for x in sample_sequences])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Load the trained model
model = load_model('Yousef_trained_model.h5')

# Setup callbacks for early stopping and best model saving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Estimate steps per epoch
train_steps_per_epoch = 10000 // chunksize
val_steps_per_epoch = 10000 // chunksize

# Fine-tune the model using a generator
batch_size = 128
train_generator = data_generator(file_path, chunksize, max_sequence_len, batch_size)
validation_generator = data_generator(file_path, chunksize, max_sequence_len, batch_size)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,  # Additional epochs for fine-tuning
    validation_data=validation_generator,
    validation_steps=val_steps_per_epoch,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
test_loss, test_accuracy = model.evaluate(validation_generator, steps=val_steps_per_epoch)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)


Epoch 1/10


InvalidArgumentError: Graph execution error:

Detected at node sequential/embedding/embedding_lookup defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 685, in <lambda>

  File "/usr/local/lib/python3.10/dist-packages/tornado/ioloop.py", line 738, in _run_callback

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 825, in inner

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 786, in run

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.10/dist-packages/tornado/gen.py", line 234, in wrapper

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.10/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-2-e4b3f70ae4ac>", line 21, in <cell line: 21>

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1807, in fit

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 590, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/sequential.py", line 398, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 515, in call

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/usr/local/lib/python3.10/dist-packages/keras/src/layers/core/embedding.py", line 272, in call

indices[112,390] = 4229 is not in [0, 2834)
	 [[{{node sequential/embedding/embedding_lookup}}]] [Op:__inference_train_function_3454]

Fix 1

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset in chunks
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path
chunksize = 10000  # Adjust chunk size as needed

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Process dataset in chunks
def process_chunk(chunk):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    return chunk

# Fit tokenizer on the entire dataset in chunks
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    tokenizer.fit_on_texts(chunk['text_with_artist'])

# Save tokenizer configuration
tokenizer_json = tokenizer.to_json()
with open('/content/drive/MyDrive/Checkpoints/tokenizer.json', 'w') as f:
    f.write(tokenizer_json)

# Function to generate padded sequences and targets in chunks
def data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer):
    for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
        chunk = process_chunk(chunk)
        sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
        input_sequences = []
        for seq in sequences:
            for i in range(1, len(seq)):
                n_gram_sequence = seq[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
        targets = input_sequences[:, -1]
        input_sequences = input_sequences[:, :-1]
        for start in range(0, len(input_sequences), batch_size):
            end = min(start + batch_size, len(input_sequences))
            yield input_sequences[start:end], targets[start:end]

# Determine max sequence length by processing a small sample of the dataset
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_sequences = process_chunk(sample_chunk)
max_sequence_len = max([len(x) for x in sample_sequences])

# Split data into training and test sets incrementally
train_sequences, test_sequences = [], []
train_targets, test_targets = [], []
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    input_sequences = process_chunk(chunk)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    targets = input_sequences[:, -1]
    input_sequences = input_sequences[:, :-1]
    X_train_chunk, X_test_chunk, y_train_chunk, y_test_chunk = train_test_split(
        input_sequences, targets, test_size=0.1, random_state=42
    )
    train_sequences.extend(X_train_chunk)
    train_targets.extend(y_train_chunk)
    test_sequences.extend(X_test_chunk)
    test_targets.extend(y_test_chunk)

X_train = np.array(train_sequences)
y_train = np.array(train_targets)
X_test = np.array(test_sequences)
y_test = np.array(test_targets)



Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: invalid literal for int() with base 10: 'artist'

In [None]:
#fix2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset in chunks
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path
chunksize = 10000  # Adjust chunk size as needed

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Process dataset in chunks
def process_chunk(chunk):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    return chunk

# Fit tokenizer on the entire dataset in chunks
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    tokenizer.fit_on_texts(chunk['text_with_artist'])

# Save tokenizer configuration
tokenizer_json = tokenizer.to_json()
with open('/content/drive/MyDrive/Checkpoints/tokenizer.json', 'w') as f:
    f.write(tokenizer_json)

# Function to generate padded sequences and targets in chunks
def data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer):
    for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
        chunk = process_chunk(chunk)
        sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
        input_sequences = []
        for seq in sequences:
            for i in range(1, len(seq)):
                n_gram_sequence = seq[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
        targets = input_sequences[:, -1]
        input_sequences = input_sequences[:, :-1]
        for start in range(0, len(input_sequences), batch_size):
            end = min(start + batch_size, len(input_sequences))
            yield input_sequences[start:end], targets[start:end]

# Determine max sequence length by processing a small sample of the dataset
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_chunk = process_chunk(sample_chunk)
sample_sequences = tokenizer.texts_to_sequences(sample_chunk['text_with_artist'])
max_sequence_len = max([len(x) for x in sample_sequences])

# Split data into training and test sets incrementally
train_sequences, test_sequences = [], []
train_targets, test_targets = [], []
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
    input_sequences = []
    for seq in sequences:
        for i in range(1, len(seq)):
            n_gram_sequence = seq[:i+1]
            input_sequences.append(n_gram_sequence)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    targets = input_sequences[:, -1]
    input_sequences = input_sequences[:, :-1]
    X_train_chunk, X_test_chunk, y_train_chunk, y_test_chunk = train_test_split(
        input_sequences, targets, test_size=0.1, random_state=42
    )
    train_sequences.extend(X_train_chunk)
    train_targets.extend(y_train_chunk)
    test_sequences.extend(X_test_chunk)
    test_targets.extend(y_test_chunk)

X_train = np.array(train_sequences)
y_train = np.array(train_targets)
X_test = np.array(test_sequences)
y_test = np.array(test_targets)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#fix2

# Load the trained model
model_path = '/content/drive/MyDrive/checkpoints/Yousef_trained_model.h5'
model = load_model(model_path)

# Verify and adjust the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
model.layers[0].input_dim = vocab_size  # Adjust input_dim of the embedding layer

# Setup callbacks for early stopping and best model saving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Checkpoints/fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Estimate steps per epoch
train_steps_per_epoch = len(X_train) // 128
val_steps_per_epoch = len(X_test) // 128

# Fine-tune the model using a generator
train_generator = data_generator(file_path, chunksize, max_sequence_len, 128, tokenizer)
validation_generator = data_generator(file_path, chunksize, max_sequence_len, 128, tokenizer)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,  # Additional epochs for fine-tuning
    validation_data=validation_generator,
    validation_steps=val_steps_per_epoch,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)



In [None]:
#fix2
# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()
        predicted_index = np.argmax(predictions)
        output_word = tokenizer.index_word.get(predicted_index, '')

        if output_word and output_word.strip():
            current_text += ' ' + output_word.strip()
            words_added += 1
        else:
            continue

    return current_text

# Example usage
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(seed_text, 10, model, tokenizer, max_sequence_len)
print(generated_text)

In [None]:
# Load the trained model
model_path = '/content/drive/MyDrive/checkpoints/Yousef_trained_model.h5'
model = load_model(model_path)

# Verify and adjust the vocabulary size
vocab_size = len(tokenizer.word_index) + 1
model.layers[0].input_dim = vocab_size  # Adjust input_dim of the embedding layer

# Setup callbacks for early stopping and best model saving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Checkpoints/fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Estimate steps per epoch
train_steps_per_epoch = len(X_train) // batch_size
val_steps_per_epoch = len(X_test) // batch_size

# Fine-tune the model using a generator
train_generator = data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer)
validation_generator = data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer)

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,  # Additional epochs for fine-tuning
    validation_data=validation_generator,
    validation_steps=val_steps_per_epoch,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

In [None]:
#fix3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset in chunks
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path
chunksize = 10000  # Adjust chunk size as needed

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Process dataset in chunks
def process_chunk(chunk):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    return chunk

# Fit tokenizer on the entire dataset in chunks
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    tokenizer.fit_on_texts(chunk['text_with_artist'])

# Save tokenizer configuration
tokenizer_json = tokenizer.to_json()
with open('/content/drive/MyDrive/Checkpoints/tokenizer.json', 'w') as f:
    f.write(tokenizer_json)

# Function to generate padded sequences and targets in chunks
def data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer):
    for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
        chunk = process_chunk(chunk)
        sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
        input_sequences = []
        for seq in sequences:
            for i in range(1, len(seq)):
                n_gram_sequence = seq[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
        targets = input_sequences[:, -1]
        input_sequences = input_sequences[:, :-1]
        for start in range(0, len(input_sequences), batch_size):
            end = min(start + batch_size, len(input_sequences))
            yield input_sequences[start:end], targets[start:end]

# Determine max sequence length by processing a small sample of the dataset
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_chunk = process_chunk(sample_chunk)
sample_sequences = tokenizer.texts_to_sequences(sample_chunk['text_with_artist'])
max_sequence_len = max([len(x) for x in sample_sequences])

# Split data into training and test sets incrementally
train_sequences, test_sequences = [], []
train_targets, test_targets = [], []
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
    input_sequences = []
    for seq in sequences:
        for i in range(1, len(seq)):
            n_gram_sequence = seq[:i+1]
            input_sequences.append(n_gram_sequence)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    targets = input_sequences[:, -1]
    input_sequences = input_sequences[:, :-1]
    X_train_chunk, X_test_chunk, y_train_chunk, y_test_chunk = train_test_split(
        input_sequences, targets, test_size=0.1, random_state=42
    )
    train_sequences.extend(X_train_chunk)
    train_targets.extend(y_train_chunk)
    test_sequences.extend(X_test_chunk)
    test_targets.extend(y_test_chunk)

X_train = np.array(train_sequences)
y_train = np.array(train_targets)
X_test = np.array(test_sequences)
y_test = np.array(test_targets)

# Load the trained model
model_path = '/content/drive/MyDrive/checkpoints/Yousef_trained_model.h5'
model = load_model(model_path)

# Verify and adjust the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# If the model's embedding layer does not match the vocab_size, create a new model with the correct vocab_size
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define the model
new_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=256, input_length=max_sequence_len-1),
    LSTM(256, return_sequences=True),
    Dropout(0.5),
    LSTM(256),
    Dense(vocab_size, activation='softmax')
])

new_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Transfer weights from the old model to the new model
for layer in model.layers:
    try:
        new_model.get_layer(name=layer.name).set_weights(layer.get_weights())
    except:
        print(f"Layer {layer.name} not found or not compatible")

# Save the new model
new_model.save(model_path)

# Setup callbacks for early stopping and best model saving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Checkpoints/fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Estimate steps per epoch
train_steps_per_epoch = len(X_train) // 128
val_steps_per_epoch = len(X_test) // 128

# Fine-tune the model using a generator
train_generator = data_generator(file_path, chunksize, max_sequence_len, 128, tokenizer)
validation_generator = data_generator(file_path, chunksize, max_sequence_len, 128, tokenizer)

history = new_model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,  # Additional epochs for fine-tuning
    validation_data=validation_generator,
    validation_steps=val_steps_per_epoch,
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
test_loss, test_accuracy = new_model.evaluate(X_test, y_test)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()
        predicted_index = np.argmax(predictions)
        output_word = tokenizer.index_word.get(predicted_index, '')

        if output_word and output_word.strip():
            current_text += ' ' + output_word.strip()
            words_added += 1
        else:
            continue

    return current_text

# Example usage
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(seed_text, 10, new_model, tokenizer, max_sequence_len)
print(generated_text)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gc
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load dataset in chunks
file_path = '/content/Spotify Million Song Dataset_exported.csv'  # Replace with the actual file path
chunksize = 10000  # Adjust chunk size as needed

# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')

# Process dataset in chunks
def process_chunk(chunk):
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)
    chunk['cleaned_lyrics'] = chunk['text'].astype(str).apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(lambda x: ' '.join(x))
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    return chunk

# Fit tokenizer on the entire dataset in chunks
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    chunk = process_chunk(chunk)
    tokenizer.fit_on_texts(chunk['text_with_artist'])
    del chunk
    gc.collect()

# Save tokenizer configuration
tokenizer_json = tokenizer.to_json()
with open('/content/drive/MyDrive/Checkpoints/tokenizer.json', 'w') as f:
    f.write(tokenizer_json)

# Determine max sequence length by processing a small sample of the dataset
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_chunk = process_chunk(sample_chunk)
sample_sequences = tokenizer.texts_to_sequences(sample_chunk['text_with_artist'])
max_sequence_len = max([len(x) for x in sample_sequences])
del sample_chunk, sample_sequences
gc.collect()

# Function to generate padded sequences and targets in chunks
def data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer):
    for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
        chunk = process_chunk(chunk)
        sequences = tokenizer.texts_to_sequences(chunk['text_with_artist'])
        input_sequences = []
        for seq in sequences:
            for i in range(1, len(seq)):
                n_gram_sequence = seq[:i+1]
                input_sequences.append(n_gram_sequence)
        input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
        targets = input_sequences[:, -1]
        input_sequences = input_sequences[:, :-1]
        for start in range(0, len(input_sequences), batch_size):
            end = min(start + batch_size, len(input_sequences))
            yield input_sequences[start:end], targets[start:end]
        del chunk, sequences, input_sequences, targets
        gc.collect()

# Create a TensorFlow dataset from the generator
def create_tf_dataset(file_path, chunksize, max_sequence_len, batch_size, tokenizer):
    generator = lambda: data_generator(file_path, chunksize, max_sequence_len, batch_size, tokenizer)
    dataset = tf.data.Dataset.from_generator(generator,
                                             output_types=(tf.int32, tf.int32),
                                             output_shapes=((None, max_sequence_len-1), (None,)))
    return dataset

# Load the trained model
model_path = '/content/drive/MyDrive/checkpoints/Yousef_trained_model.h5'
model = load_model(model_path)

# Verify and adjust the vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# If the model's embedding layer does not match the vocab_size, create a new model with the correct vocab_size
# Define the model
new_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=256, input_length=max_sequence_len-1),
    LSTM(256, return_sequences=True),
    Dropout(0.5),
    LSTM(256),
    Dense(vocab_size, activation='softmax')
])

new_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Transfer weights from the old model to the new model
for layer in model.layers:
    try:
        new_model.get_layer(name=layer.name).set_weights(layer.get_weights())
    except:
        print(f"Layer {layer.name} not found or not compatible")

# Save the new model
new_model.save(model_path)

# Setup callbacks for early stopping and best model saving
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Checkpoints/fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Create TensorFlow datasets
batch_size = 128
train_dataset = create_tf_dataset(file_path, chunksize, max_sequence_len, batch_size, tokenizer)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Estimate steps per epoch
steps_per_epoch = 500  # Set this to a reasonable number

history = new_model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=5,  # Additional epochs for fine-tuning
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Load the best model
new_model.load_weights('/content/drive/MyDrive/Checkpoints/fine_tuned_model.h5')

# Evaluate the fine-tuned model on the test set
test_dataset = create_tf_dataset(file_path, chunksize, max_sequence_len, batch_size, tokenizer)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)
test_loss, test_accuracy = new_model.evaluate(test_dataset, steps=steps_per_epoch // 10)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()
        predicted_index = np.argmax(predictions)
        output_word = tokenizer.index_word.get(predicted_index, '')

        if output_word and output_word.strip():
            current_text += ' ' + output_word.strip()
            words_added += 1
        else:
            continue

    return current_text

# Example usage
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(seed_text, 10, new_model, tokenizer, max_sequence_len)
print(generated_text)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Layer embedding not found or not compatible
Layer dense not found or not compatible


  saving_api.save_model(


Epoch 1/10
  9/500 [..............................] - ETA: 1:40:09 - loss: 11.1715 - accuracy: 0.0000e+00

KeyboardInterrupt: 