In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Read the lyrics CSV (replace the path with the actual file location)
file_path = '/content/Spotify Million Song Dataset_exported.csv'
spotify_df = pd.read_csv(file_path, encoding='utf-8')

# Remove rows with any missing value, then exact duplicate rows (both in place)
spotify_df.dropna(inplace=True)
spotify_df.drop_duplicates(inplace=True)

# Download the NLTK resources used by this notebook and build the stop-word set
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize ``text`` and return its lower-cased alphabetic non-stop-words.

    A token is kept only when it is purely alphabetic and its lower-cased
    form is not in the module-level ``stop_words`` set.

    Returns:
        list of lower-cased tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept

# Clean every lyric into a token list and keep a space-joined string version
spotify_df['cleaned_lyrics'] = spotify_df['text'].astype(str).apply(clean_text)
spotify_df['cleaned_lyrics_str'] = spotify_df['cleaned_lyrics'].apply(' '.join)

# Prefix each song's cleaned lyrics with its artist
spotify_df['text_with_artist'] = spotify_df['artist'] + ' ' + spotify_df['cleaned_lyrics_str']

# Fit the word-index vocabulary on the combined text and encode every song
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
combined_texts = spotify_df['text_with_artist']
tokenizer.fit_on_texts(combined_texts)
sequences = tokenizer.texts_to_sequences(combined_texts)

# Every prefix of length >= 2 is one training example; its last token is the target
input_sequences = [
    token_ids[:stop]
    for token_ids in sequences
    for stop in range(2, len(token_ids) + 1)
]

# Pad all prefixes on the left up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Last column -> targets, remaining columns -> model inputs
input_sequences, targets = input_sequences[:, :-1], input_sequences[:, -1]

# Hold out 10% of the examples, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences,
    targets,
    test_size=0.1,
    random_state=42,
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Load the trained model
model = load_model('best_model.h5')

# Callbacks, all driven by validation loss:
# - EarlyStopping stops after 5 epochs without improvement. restore_best_weights=True
#   makes it put the best-epoch weights back when it stops, so the evaluation
#   below does not silently report a later, worse epoch.
# - ModelCheckpoint writes the best model seen so far to fine_tuned_model.h5.
# - ReduceLROnPlateau multiplies the learning rate by 0.2 after 3 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Fine-tune the model
# NOTE(review): X_test/y_test are used both as validation data here and as the
# final test set below, so the reported test metrics are not from unseen data.
history = model.fit(
    X_train, y_train,
    epochs=10,  # Additional epochs for fine-tuning
    batch_size=128,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
# (assumes the model was compiled with exactly one metric — TODO confirm)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Extend ``seed_text`` with up to ``next_words`` words predicted by ``model``.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        model: Object whose ``predict`` returns one score per word index for a
            padded index sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_len: Padded length of the n-gram sequences *including*
            the target column; the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Fewer than ``next_words`` words are appended if a prediction step
        offers no index that maps to a non-empty word.
    """
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        # The training inputs had the target column removed, so pad to
        # max_sequence_len - 1 to match the width the model was fitted on.
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()

        # Walk the candidates from most to least likely. The stable sort keeps
        # the lowest index first among ties, like np.argmax. Falling back to
        # the next candidate avoids repeating the same prediction forever when
        # the best index (e.g. the padding index 0) has no word.
        output_word = ''
        for candidate_index in np.argsort(-predictions, kind='stable'):
            output_word = tokenizer.index_word.get(int(candidate_index), '').strip()
            if output_word:
                break

        if not output_word:
            # No index maps to a usable word: stop instead of spinning.
            break

        current_text += ' ' + output_word
        words_added += 1

    return current_text

# Demo: extend a seed string by ten generated words and show the result
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(
    seed_text=seed_text,
    next_words=10,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
)
print(generated_text)

# **Fix - 1**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# CSV location (replace with the actual file path) and rows read per chunk
file_path = '/content/Spotify Million Song Dataset_exported.csv'
chunksize = 10_000  # Adjust chunk size as needed

# Download the NLTK resources used by this notebook and build the stop-word set
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize ``text`` and return its lower-cased alphabetic non-stop-words.

    A token is kept only when it is purely alphabetic and its lower-cased
    form is not in the module-level ``stop_words`` set.

    Returns:
        list of lower-cased tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept

# Word-index tokenizer; out-of-vocabulary words map to the '<UNK>' token.
# NOTE(review): num_words is 1000 here but 10000 in the other tokenizer cells
# of this notebook — confirm which value the loaded model expects.
tokenizer = Tokenizer(
    oov_token='<UNK>',
    num_words=1000,
)

# Clean one CSV chunk and add the derived text columns
def process_chunk(chunk):
    """Clean ``chunk`` in place and return it with three added columns.

    Rows with any missing value and duplicate rows are dropped. Then:
    ``cleaned_lyrics`` holds the token list from ``clean_text``,
    ``cleaned_lyrics_str`` the tokens joined by spaces, and
    ``text_with_artist`` the artist followed by the joined lyrics.

    Note: the DataFrame passed in is modified; the same object is returned.
    """
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)

    raw_lyrics = chunk['text'].astype(str)
    chunk['cleaned_lyrics'] = raw_lyrics.apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(' '.join)
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']
    return chunk

# Clean the dataset once, chunk by chunk, keeping only the combined
# artist + lyrics strings. The previous version read and NLTK-cleaned the
# whole CSV twice (once to fit the tokenizer, once to build sequences).
# The cleaned strings are small next to the n-gram list built below, so
# keeping them in memory removes the second pass over the file.
texts_with_artist = []
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    texts_with_artist.extend(process_chunk(chunk)['text_with_artist'])

# Fit tokenizer on the entire dataset
tokenizer.fit_on_texts(texts_with_artist)

# Create input sequences: every prefix of length >= 2 of each encoded song
# (the last token of each prefix later becomes the prediction target)
sequences = tokenizer.texts_to_sequences(texts_with_artist)
input_sequences = []
for seq in sequences:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i+1])









[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Pad all n-gram sequences on the left up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Last column -> targets, remaining columns -> model inputs
input_sequences, targets = input_sequences[:, :-1], input_sequences[:, -1]

# Hold out 10% of the examples, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    input_sequences,
    targets,
    test_size=0.1,
    random_state=42,
)

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Load the trained model
model = load_model('Yousef_trained_model.h5')

# Callbacks, all driven by validation loss:
# - EarlyStopping stops after 5 epochs without improvement. restore_best_weights=True
#   makes it put the best-epoch weights back when it stops, so later evaluation
#   and generation do not silently use a later, worse epoch.
# - ModelCheckpoint writes the best model seen so far to fine_tuned_model.h5.
# - ReduceLROnPlateau multiplies the learning rate by 0.2 after 3 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Fine-tune the model
# NOTE(review): X_test/y_test serve as validation data here; if they are also
# reported as the final test set, those metrics are not from unseen data.
history = model.fit(
    X_train, y_train,
    epochs=10,  # Additional epochs for fine-tuning
    batch_size=128,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

In [None]:
# Score the fine-tuned model on the held-out arrays and report both numbers
evaluation_result = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation_result
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

In [None]:


# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Extend ``seed_text`` with up to ``next_words`` words predicted by ``model``.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        model: Object whose ``predict`` returns one score per word index for a
            padded index sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_len: Padded length of the n-gram sequences *including*
            the target column; the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Fewer than ``next_words`` words are appended if a prediction step
        offers no index that maps to a non-empty word.
    """
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()

        # Walk the candidates from most to least likely. The stable sort keeps
        # the lowest index first among ties, like np.argmax. Falling back to
        # the next candidate avoids repeating the same prediction forever when
        # the best index (e.g. the padding index 0) has no word.
        output_word = ''
        for candidate_index in np.argsort(-predictions, kind='stable'):
            output_word = tokenizer.index_word.get(int(candidate_index), '').strip()
            if output_word:
                break

        if not output_word:
            # No index maps to a usable word: stop instead of spinning.
            break

        current_text += ' ' + output_word
        words_added += 1

    return current_text


In [None]:
# Demo: extend a seed string by ten generated words and show the result
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(
    seed_text=seed_text,
    next_words=10,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
)
print(generated_text)

# **Fix - 2**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# CSV location (replace with the actual file path) and rows read per chunk
file_path = '/content/Spotify Million Song Dataset_exported.csv'
chunksize = 10_000  # Adjust chunk size as needed

# Download the NLTK resources used by this notebook and build the stop-word set
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize ``text`` and return its lower-cased alphabetic non-stop-words.

    A token is kept only when it is purely alphabetic and its lower-cased
    form is not in the module-level ``stop_words`` set.

    Returns:
        list of lower-cased tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return kept

# Word-index tokenizer limited to 10000 words; unknown words map to '<UNK>'
tokenizer = Tokenizer(
    oov_token='<UNK>',
    num_words=10000,
)

# Update the tokenizer's vocabulary one CSV chunk at a time
csv_chunks = pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8')
for chunk in csv_chunks:
    # Drop incomplete and duplicate rows before cleaning
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)

    raw_lyrics = chunk['text'].astype(str)
    chunk['cleaned_lyrics'] = raw_lyrics.apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(' '.join)
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']

    tokenizer.fit_on_texts(chunk['text_with_artist'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Turn one CSV chunk into a list of (unpadded) n-gram index sequences
def process_chunk(chunk):
    """Clean ``chunk`` in place and return its n-gram index sequences.

    Rows with any missing value and duplicate rows are dropped, the lyrics are
    cleaned with ``clean_text`` and prefixed with the artist, and the result
    is encoded with the module-level ``tokenizer``. For each encoded song,
    every prefix of length >= 2 is emitted.

    Returns:
        list of lists of word indices. They are not padded here.
    """
    chunk.dropna(inplace=True)
    chunk.drop_duplicates(inplace=True)

    raw_lyrics = chunk['text'].astype(str)
    chunk['cleaned_lyrics'] = raw_lyrics.apply(clean_text)
    chunk['cleaned_lyrics_str'] = chunk['cleaned_lyrics'].apply(' '.join)
    chunk['text_with_artist'] = chunk['artist'] + ' ' + chunk['cleaned_lyrics_str']

    encoded_songs = tokenizer.texts_to_sequences(chunk['text_with_artist'])
    return [
        token_ids[:stop]
        for token_ids in encoded_songs
        for stop in range(2, len(token_ids) + 1)
    ]

In [None]:
# Generator of (inputs, targets) batches read from the CSV chunk by chunk
def data_generator(file_path, chunksize, max_sequence_len, batch_size):
    """Yield ``(inputs, targets)`` batches while reading the CSV in chunks.

    Each chunk of ``chunksize`` rows is converted to n-gram sequences by
    ``process_chunk`` and padded (``padding='pre'``) to ``max_sequence_len``.
    The last column is the target; the other columns are the input. Batches
    contain at most ``batch_size`` rows, and the final batch of a chunk may be
    smaller. The generator finishes after one pass over the file.
    """
    reader = pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8')
    for chunk in reader:
        padded = pad_sequences(process_chunk(chunk), maxlen=max_sequence_len, padding='pre')
        features, labels = padded[:, :-1], padded[:, -1]
        # Slicing past the end is clamped, so the last batch is simply shorter
        for start in range(0, len(features), batch_size):
            stop = start + batch_size
            yield features[start:stop], labels[start:stop]

# Take the longest n-gram found in the first `chunksize` rows as the padded length
# (only this sample is inspected, not the whole file)
sample_chunk = pd.read_csv(file_path, nrows=chunksize, encoding='utf-8')
sample_sequences = process_chunk(sample_chunk)
max_sequence_len = max(map(len, sample_sequences))

In [None]:
# Split data into training and test sets incrementally.
# Each list collects one array per chunk; they are concatenated once at the
# end. Extending the lists row by row (as before) created one Python object
# per training example before np.array copied them all again.
train_sequences, test_sequences = [], []
train_targets, test_targets = [], []
for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='utf-8'):
    input_sequences = process_chunk(chunk)
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    # Last column is the word to predict; the rest is the model input
    targets = input_sequences[:, -1]
    input_sequences = input_sequences[:, :-1]
    X_train_chunk, X_test_chunk, y_train_chunk, y_test_chunk = train_test_split(
        input_sequences, targets, test_size=0.1, random_state=42
    )
    train_sequences.append(X_train_chunk)
    train_targets.append(y_train_chunk)
    test_sequences.append(X_test_chunk)
    test_targets.append(y_test_chunk)

# Stack the per-chunk pieces; fall back to an empty array if no chunk was read
X_train = np.concatenate(train_sequences) if train_sequences else np.array([])
y_train = np.concatenate(train_targets) if train_targets else np.array([])
X_test = np.concatenate(test_sequences) if test_sequences else np.array([])
y_test = np.concatenate(test_targets) if test_targets else np.array([])

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Load the trained model
model = load_model('Yousef_trained_model.h5')

# Callbacks, all driven by validation loss:
# - EarlyStopping stops after 5 epochs without improvement. restore_best_weights=True
#   makes it put the best-epoch weights back when it stops, so the evaluation
#   below does not silently report a later, worse epoch.
# - ModelCheckpoint writes the best model seen so far to fine_tuned_model.h5.
# - ReduceLROnPlateau multiplies the learning rate by 0.2 after 3 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('fine_tuned_model.h5', save_best_only=True, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Fine-tune the model using a generator
batch_size = 128


def _repeat_batches():
    """Yield batches from ``data_generator`` indefinitely.

    ``data_generator`` ends after one pass over the CSV, but ``model.fit``
    draws ``steps_per_epoch * epochs`` batches from its input. Restarting the
    pass whenever it ends keeps later epochs from running out of data.
    Stops if a full pass yields no batch at all, to avoid spinning forever.
    """
    while True:
        produced_any = False
        for batch in data_generator(file_path, chunksize, max_sequence_len, batch_size):
            produced_any = True
            yield batch
        if not produced_any:
            return


# NOTE(review): data_generator re-reads the whole CSV without a train/test
# split, so the examples held out in X_test are also seen during training —
# confirm whether that is intended.
train_generator = _repeat_batches()

# Estimate steps per epoch
train_steps_per_epoch = len(X_train) // batch_size
val_steps_per_epoch = len(X_test) // batch_size

history = model.fit(
    train_generator,
    steps_per_epoch=train_steps_per_epoch,
    epochs=10,  # Additional epochs for fine-tuning
    validation_data=(X_test, y_test),
    callbacks=[early_stopping, model_checkpoint, reduce_lr]
)

# Evaluate the fine-tuned model on the test set
# (assumes the model was compiled with exactly one metric — TODO confirm)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss after Fine-Tuning:", test_loss)
print("Test Accuracy after Fine-Tuning:", test_accuracy)

In [None]:
# Function to generate text
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len):
    """Extend ``seed_text`` with up to ``next_words`` words predicted by ``model``.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        model: Object whose ``predict`` returns one score per word index for a
            padded index sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_len: Padded length of the n-gram sequences *including*
            the target column; the model input is one token shorter.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Fewer than ``next_words`` words are appended if a prediction step
        offers no index that maps to a non-empty word.
    """
    words_added = 0
    current_text = seed_text
    while words_added < next_words:
        token_list = tokenizer.texts_to_sequences([current_text])[0]
        token_list_padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predictions = model.predict(token_list_padded, verbose=0).squeeze()

        # Walk the candidates from most to least likely. The stable sort keeps
        # the lowest index first among ties, like np.argmax. Falling back to
        # the next candidate avoids repeating the same prediction forever when
        # the best index (e.g. the padding index 0) has no word.
        output_word = ''
        for candidate_index in np.argsort(-predictions, kind='stable'):
            output_word = tokenizer.index_word.get(int(candidate_index), '').strip()
            if output_word:
                break

        if not output_word:
            # No index maps to a usable word: stop instead of spinning.
            break

        current_text += ' ' + output_word
        words_added += 1

    return current_text

In [None]:
# Demo: extend a seed string by ten generated words and show the result
seed_text = "Adele Rolling in the deep"
generated_text = generate_text(
    seed_text=seed_text,
    next_words=10,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
)
print(generated_text)