### THIS NOTEBOOK FOCUSES ON GENERATING COMBINED EMBEDDINGS OF LYRICS/PLOT AND EXTRACTED KEYWORDS USING SENTENCE BERT, COMPUTING COSINE SIMILARITY OF THE COMBINED EMBEDDINGS AND CARRYING OUT THE RECOMMENDATION PROCESS
### THIS IS PART OF AN ABLATION STUDY IN WHICH I CHECK THE QUALITY OF RECOMMENDATIONS GENERATED USING THE FEATURES LYRICS/PLOT AND EXTRACTED KEYWORDS

In [26]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# LOADING THE MUSIC DATASET
# Relative path: the CSV is resolved against the current working directory.
# Later cells read the 'Processed_Lyrics', 'Extracted_Keywords', 'Track Name'
# and 'Artist' columns of this frame.
music_df = pd.read_csv("my_preprocessed_SPOTIFY_GENIUS_SONG_DATASET.csv")

# LOADING THE MOVIES DATASET
# Later cells read the 'Processed_Plot', 'Extracted_Keywords', 'Title', 'Genre',
# 'Plot' and 'IMDb Rating' columns of this frame.
movies_df = pd.read_csv("MY_preprocessed_omdb_movie_dataset.csv")

In [28]:
# LOADING SENTENCE BERT MODELS
# One model instance is created here and passed to both the song and the movie
# embedding steps further down.
model= SentenceTransformer('all-Mpnet-base-v2')
print("Sentence BERT model 'all-Mpnet-base-v2' loaded successfully!")

Sentence BERT model 'all-Mpnet-base-v2' loaded successfully!


In [30]:
# DEFINING FEATURES TO INCLUDE FROM MUSIC AND MOVIE DATASETS
# Each list names the text columns that are embedded and combined per row;
# the names must match the keys of the corresponding weight dictionaries.
music_features = ['Processed_Lyrics', 'Extracted_Keywords']
movie_features = ['Processed_Plot', 'Extracted_Keywords']


### ASSIGNING WEIGHTS TO THE FEATURES AND GENERATING EMBEDDINGS USING BERT MODEL 1 : 'all-Mpnet-base-v2'

In [32]:
# ASSIGNING WEIGHTS TO MUSIC FEATURES
# Weights are relative: weighted_embeddings divides each weight by a total before
# scaling the feature embedding, so only the ratio between the weights matters.
# In this configuration the extracted keywords (0.6) outweigh the lyrics (0.4).
music_weights = {
    'Processed_Lyrics': 0.4,           # Lyrics: smaller share of the combined embedding
    'Extracted_Keywords': 0.6,         # Keywords: larger share of the combined embedding
}

# ASSIGNING WEIGHTS TO MOVIE FEATURES
# Mirrors the music weights: keywords (0.6) outweigh the plot text (0.4).
movie_weights = {
    'Processed_Plot': 0.4,               # Plot: smaller share of the combined embedding
    'Extracted_Keywords': 0.6,           # Keywords: larger share of the combined embedding
}

# FUNCTION TO COMPUTE THE EMBEDDINGS FOR A SINGLE ROW
def weighted_embeddings(row, features, weights, model):
    """
    Compute the weighted combination of per-feature sentence embeddings for a single row.

    Every feature that is present in ``row`` and not missing (NaN/None) is encoded
    with ``model``. Each embedding is scaled by its weight divided by the total
    weight of the features actually used (so the scaling factors sum to 1), and
    the scaled embeddings are summed.

    :param row: A single row of the dataframe
    :param features: List of features to include
    :param weights: Dictionary with feature names as keys and weights as values;
                    a feature without an entry gets the default weight 1
    :param model: Preloaded BERT model exposing ``encode(list_of_texts, convert_to_numpy=True)``
    :return: Weighted embedding vector for the row; a zero vector of the model's
             output size when the row has no usable feature
    :raises ValueError: If the weights of the usable features sum to zero
    """
    # ONLY FEATURES THAT EXIST IN THE ROW AND HOLD A REAL VALUE ARE USED;
    # A MISSING VALUE MUST NOT BE EMBEDDED AS THE LITERAL TEXT "nan"
    usable_features = [
        feature for feature in features
        if feature in row and pd.notna(row[feature])
    ]

    if not usable_features:
        # NOTHING TO EMBED: RETURN A ZERO VECTOR SHAPED LIKE A REAL EMBEDDING SO THAT
        # THE CALLER CAN STILL STACK THE RESULT WITH THE OTHER ROWS
        probe = np.asarray(model.encode([""], convert_to_numpy=True))[0]
        return np.zeros_like(probe)

    # CALCULATING TOTAL WEIGHT FOR NORMALIZATION (EACH FEATURE COUNTED EXACTLY ONCE)
    raw_weights = [weights.get(feature, 1) for feature in usable_features]  # DEFAULT WEIGHT = 1
    total_weight = float(sum(raw_weights))
    if total_weight == 0:
        raise ValueError(
            f"Weights for features {usable_features} sum to zero; cannot normalize."
        )

    # GENERATING EMBEDDINGS FOR ALL USABLE FEATURES IN ONE MODEL CALL
    texts = [str(row[feature]) for feature in usable_features]
    vectors = np.asarray(model.encode(texts, convert_to_numpy=True))

    # NORMALIZING THE WEIGHTS; KEEP THE EMBEDDING PRECISION (E.G. float32) INSTEAD OF
    # SILENTLY UPCASTING THE RESULT
    scale_dtype = np.result_type(vectors.dtype, np.float32)
    normalized_weights = np.asarray(raw_weights, dtype=scale_dtype) / total_weight

    # SCALING EACH EMBEDDING BY ITS NORMALIZED WEIGHT AND SUMMING THEM
    return (vectors * normalized_weights[:, np.newaxis]).sum(axis=0)

def generate_weighted_embeddings_in_batches(df, features, weights, model, batch_size=100):
    """
    Build the weighted embedding of every row of a DataFrame, working through the
    rows in consecutive slices of ``batch_size``.

    :param df: The DataFrame containing the data
    :param features: List of features to include
    :param weights: Dictionary with feature names as keys and weights as values
    :param model: Preloaded BERT model
    :param batch_size: Number of rows to process in each batch
    :return: Numpy array of weighted embeddings, one row per DataFrame row
    """
    # ONE STACKED ARRAY PER SLICE OF ROWS; EACH ROW IS STILL EMBEDDED INDIVIDUALLY
    chunk_arrays = [
        np.array([
            weighted_embeddings(record, features, weights, model)
            for _, record in df.iloc[start:start + batch_size].iterrows()
        ])
        for start in range(0, len(df), batch_size)
    ]

    # JOINING THE PER-SLICE ARRAYS BACK INTO ONE ARRAY IN ROW ORDER
    return np.vstack(chunk_arrays)


### GENERATING COMBINED EMBEDDINGS FOR MUSIC AND MOVIES USING LYRICS/PLOTS AND EXTRACTED KEYWORDS.

In [34]:
# GENERATING WEIGHTED EMBEDDINGS FOR MUSIC AND MOVIES USING BERT MODEL : 'all-Mpnet-base-v2'
# Row i of each array is the combined embedding of row i of the corresponding
# DataFrame. Every row triggers its own model calls, so this cell is the slow
# step of the notebook.
music_embeddings = generate_weighted_embeddings_in_batches(music_df, music_features, music_weights, model, batch_size=50)
movie_embeddings = generate_weighted_embeddings_in_batches(movies_df, movie_features, movie_weights, model, batch_size=50)
print("Combined Embeddings generated using 'all-Mpnet-base-v2' !")

Combined Embeddings generated using 'all-Mpnet-base-v2' !


In [36]:
# COMPUTING COSINE SIMILARITY MATRICES
# Rows correspond to songs and columns to movies: similarity_matrix[i, j] is the
# cosine similarity between song i and movie j (positions, not index labels).
print("Computing similarity matrix...")
similarity_matrix = cosine_similarity(music_embeddings, movie_embeddings)
print("similarity matrix created!")

Computing similarity matrix...
similarity matrix created!


In [38]:
# FUNCTION TO PREPROCESS USER INPUT
def preprocess_input(input_string):
    """
    Normalise a user-supplied string for matching: lower-case it, then remove
    leading and trailing whitespace.

    NOTE: the same helper is also defined in other cells of this notebook.
    """
    lowered = input_string.lower()
    return lowered.strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    """
    Look up a song by track name and artist, ignoring case.

    The user input is normalised with ``preprocess_input``; the 'Track Name' and
    'Artist' columns are only lower-cased before the comparison.

    NOTE: the same helper is also defined in other cells of this notebook.

    :param track_name: Track name as typed by the user
    :param artist_name: Artist name as typed by the user
    :param music_df: DataFrame with 'Track Name' and 'Artist' columns
    :return: Index label of the first matching row, or None when nothing matches
    """
    wanted_track = preprocess_input(track_name)
    wanted_artist = preprocess_input(artist_name)

    track_hits = music_df['Track Name'].str.lower() == wanted_track
    artist_hits = music_df['Artist'].str.lower() == wanted_artist
    hits = music_df.loc[track_hits & artist_hits]

    if not hits.empty:
        return hits.index[0]  # FIRST MATCH WINS
    return None  # SONG NOT FOUND


In [44]:
import pandas as pd
import numpy as np

# FUNCTION TO PREPROCESS USER INPUT 
def preprocess_input(input_string):
    """
    Normalise a user-supplied string for matching: lower-case it, then remove
    leading and trailing whitespace.

    NOTE: the same helper is also defined in other cells of this notebook.
    """
    lowered = input_string.lower()
    return lowered.strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    """
    Look up a song by track name and artist, ignoring case.

    The user input is normalised with ``preprocess_input``; the 'Track Name' and
    'Artist' columns are only lower-cased before the comparison.

    NOTE: the same helper is also defined in other cells of this notebook.

    :param track_name: Track name as typed by the user
    :param artist_name: Artist name as typed by the user
    :param music_df: DataFrame with 'Track Name' and 'Artist' columns
    :return: Index label of the first matching row, or None when nothing matches
    """
    # PREPROCESSING INPUT STRINGS
    wanted_track = preprocess_input(track_name)
    wanted_artist = preprocess_input(artist_name)

    # FINDING THE MATCHING ROWS IN THE DATASET
    track_hits = music_df['Track Name'].str.lower() == wanted_track
    artist_hits = music_df['Artist'].str.lower() == wanted_artist
    hits = music_df.loc[track_hits & artist_hits]

    if not hits.empty:
        return hits.index[0]  # FIRST MATCH WINS
    return None  # SONG NOT FOUND

# Initialize logs DataFrame
# Created empty here; nothing in this cell writes to it.
logs_df = pd.DataFrame(columns=["User Song", "User Artist", "Recommendations", "User Feedback"])

# USER INPUT
# Blocks until the user types both values, so this cell cannot run unattended.
user_track = input("Enter the song name: ")
user_artist = input("Enter the artist name: ")

# FIND SONG INDEX
# The returned value is an index label of music_df; it is used below both as a
# label (.loc) and as a row position in similarity_matrix. The two agree because
# music_df is read without an index column and therefore has a default 0..n-1 index.
song_index = find_song_index(user_track, user_artist, music_df)

if song_index is None:
    print("Sorry, the song could not be found in the dataset.")
else:
    # GET SONG EXTRACTED KEYWORDS
    extracted_keywords = music_df.loc[song_index, 'Extracted_Keywords']
    
    # GET SIMILARITY SCORES FOR THIS SONG
    # One score per movie, in movies_df row order.
    sim_scores = similarity_matrix[song_index]
    
    # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
    top_indices = np.argsort(sim_scores)[::-1][:5]  # GETTING TOP 5 RECOMMENDATIONS
    
    # GETTING RECOMMENDED MOVIES
    # Movies are looked up by position (.iloc); the `i < len(movies_df)` filter
    # silently drops any position that falls outside movies_df.
    recommendations = [
        {
            "Title": movies_df['Title'].iloc[i],
            "Genre": movies_df['Genre'].iloc[i],
            "Plot": movies_df['Plot'].iloc[i],
            "Similarity": sim_scores[i]
        }
        for i in top_indices if i < len(movies_df)
    ]
    
    # DISPLAY RECOMMENDATIONS
    print(f"\n✨ Recommendations for '{user_track}' by {user_artist} ✨")
    print(f"🎵 Extracted Keywords: {extracted_keywords}\n")
    print(f"Using BERT: 'all-Mpnet-base-v2'\n")
    for rec in recommendations:
        print(f"🎬 **{rec['Title']}**")
        print(f"   - Genre: {rec['Genre']}")
        print(f"   - Plot: {rec['Plot']}")
        print(f"   - Similarity Score: {rec['Similarity']:.2f}")
        print("-" * 60)




Enter the song name:  perfect
Enter the artist name:  ed sheeran



✨ Recommendations for 'perfect' by ed sheeran ✨
🎵 Extracted Keywords: darlin, song, love, holdin, dancin

Using BERT: 'all-Mpnet-base-v2'

🎬 **billy elliot**
   - Genre: drama, music
   - Plot: A talented young boy becomes torn between his unexpected love of dance and the disintegration of his family.
   - Similarity Score: 0.52
------------------------------------------------------------
🎬 **the disappearance of eleanor rigby: them**
   - Genre: drama, romance
   - Plot: One couple's story as they try to reclaim the life and love they once knew and pick up the pieces of a past that may be too far gone.
   - Similarity Score: 0.52
------------------------------------------------------------
🎬 **crazy heart**
   - Genre: drama, music, romance
   - Plot: A faded country music musician is forced to reassess his dysfunctional life during a doomed romance that also inspires him.
   - Similarity Score: 0.51
------------------------------------------------------------
🎬 **the broken circle b

### Ablation Study: Recommendations are generated for five themes and genres of songs.
### Evaluation is done based on diversity and novelty scores.

In [46]:
import pandas as pd
import numpy as np

# FUNCTION TO PREPROCESS USER INPUT 
def preprocess_input(input_string):
    """
    Normalise a user-supplied string for matching: lower-case it, then remove
    leading and trailing whitespace.

    NOTE: the same helper is also defined in other cells of this notebook.
    """
    lowered = input_string.lower()
    return lowered.strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    """
    Look up a song by track name and artist, ignoring case.

    The user input is normalised with ``preprocess_input``; the 'Track Name' and
    'Artist' columns are only lower-cased before the comparison.

    NOTE: the same helper is also defined in other cells of this notebook.

    :param track_name: Track name as typed by the user
    :param artist_name: Artist name as typed by the user
    :param music_df: DataFrame with 'Track Name' and 'Artist' columns
    :return: Index label of the first matching row, or None when nothing matches
    """
    # PREPROCESSING INPUT STRINGS
    wanted_track = preprocess_input(track_name)
    wanted_artist = preprocess_input(artist_name)

    # FINDING THE MATCHING ROWS IN THE DATASET
    track_hits = music_df['Track Name'].str.lower() == wanted_track
    artist_hits = music_df['Artist'].str.lower() == wanted_artist
    hits = music_df.loc[track_hits & artist_hits]

    if not hits.empty:
        return hits.index[0]  # FIRST MATCH WINS
    return None  # SONG NOT FOUND

# FUNCTION TO DISPLAY RECOMMENDATIONS IN A DATAFRAME
def display_recommendations_df(recommendations, user_track, user_artist):
    """
    Build a table of recommendations for one input song.

    Despite the name, nothing is printed: the table is returned to the caller.

    :param recommendations: List of dicts with the keys 'Title', 'Genre',
                            'Plot Summary' and 'Similarity Score'
    :param user_track: Input song name, repeated on every row as 'Input Song'
    :param user_artist: Input artist name, repeated on every row as 'Input Artist'
    :return: DataFrame with the columns 'Input Song', 'Input Artist', 'Title',
             'Genre', 'Plot Summary', 'Similarity Score' (in that order); it has
             no rows when ``recommendations`` is empty
    :raises KeyError: If the recommendations are non-empty but lack an expected key
    """
    ordered_columns = [
        "Input Song", "Input Artist", "Title", "Genre", "Plot Summary", "Similarity Score"
    ]

    # AN EMPTY INPUT PRODUCES A FRAME WITHOUT ANY RECOMMENDATION COLUMNS, WHICH WOULD
    # MAKE THE COLUMN SELECTION BELOW FAIL; RETURN AN EMPTY, CORRECTLY SHAPED TABLE
    if len(recommendations) == 0:
        return pd.DataFrame(columns=ordered_columns)

    # Create a DataFrame from recommendations
    recommendations_df = pd.DataFrame(recommendations)

    # Add user song and artist as columns for clarity
    recommendations_df["Input Song"] = user_track
    recommendations_df["Input Artist"] = user_artist

    # Reorder columns
    return recommendations_df[ordered_columns]

# SONGS TO RECOMMEND
# Fixed set of input songs for the ablation study. Each entry is looked up in
# music_df by track name and artist (case-insensitively) via find_song_index.
songs_to_recommend = [
    {"track": "Perfect", "artist": "Ed Sheeran"},
    {"track": "The Mountain", "artist": "Three Days Grace"},
    {"track": "Stan", "artist": "Eminem"},
    {"track": "I Will Always Love You", "artist": "Whitney Houston"},
    {"track": "Wake Me Up", "artist": "Avicii"}
]

# ONE TABLE PER SONG IS COLLECTED HERE AND CONCATENATED ONCE AFTER THE LOOP;
# CALLING pd.concat INSIDE THE LOOP RE-COPIES ALL PREVIOUS ROWS ON EVERY ITERATION
per_song_tables = []

for song in songs_to_recommend:
    user_track = song["track"]
    user_artist = song["artist"]

    # FIND SONG INDEX
    song_index = find_song_index(user_track, user_artist, music_df)

    if song_index is None:
        print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
        continue

    # GET SONG EXTRACTED KEYWORDS
    extracted_keywords = music_df.loc[song_index, 'Extracted_Keywords']

    # GET SIMILARITY SCORES FOR THIS SONG (ONE SCORE PER MOVIE, IN movies_df ROW ORDER)
    sim_scores = similarity_matrix[song_index]

    # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
    top_indices = np.argsort(sim_scores)[::-1][:5]  # GETTING TOP 5 RECOMMENDATIONS

    # GETTING RECOMMENDED MOVIES
    # THE SIMILARITY SCORE IS STORED AS TEXT ROUNDED TO TWO DECIMALS
    recommendations = [
        {
            "Title": movies_df['Title'].iloc[i],
            "Genre": movies_df['Genre'].iloc[i],
            "Plot Summary": movies_df['Plot'].iloc[i],
            "Similarity Score": f"{sim_scores[i]:.2f}"
        }
        for i in top_indices if i < len(movies_df)
    ]

    # BUILD THE TABLE FOR THIS SONG
    recommendations_df = display_recommendations_df(recommendations, user_track, user_artist)
    per_song_tables.append(recommendations_df)

# FOR STORING ALL SONG RECOMMENDATIONS (EMPTY FRAME WHEN NO SONG WAS FOUND)
if per_song_tables:
    all_recommendations_df = pd.concat(per_song_tables, ignore_index=True)
else:
    all_recommendations_df = pd.DataFrame()

# SAVING RECOMMENDATIONS TO CSV
all_recommendations_df.to_csv('SBERT_recommendations_ablation_study.csv', index=False)

# PRINTING ALL RECOMMENDATIONS IN A CONSOLIDATED DATAFRAME
pd.set_option('display.max_colwidth', None)  # TO DISPLAY FULL PLOT SUMMARIES
print("Recommendations List:")
print(all_recommendations_df.head(25))



Recommendations List:
                Input Song      Input Artist  \
0                  Perfect        Ed Sheeran   
1                  Perfect        Ed Sheeran   
2                  Perfect        Ed Sheeran   
3                  Perfect        Ed Sheeran   
4                  Perfect        Ed Sheeran   
5             The Mountain  Three Days Grace   
6             The Mountain  Three Days Grace   
7             The Mountain  Three Days Grace   
8             The Mountain  Three Days Grace   
9             The Mountain  Three Days Grace   
10                    Stan            Eminem   
11                    Stan            Eminem   
12                    Stan            Eminem   
13                    Stan            Eminem   
14                    Stan            Eminem   
15  I Will Always Love You   Whitney Houston   
16  I Will Always Love You   Whitney Houston   
17  I Will Always Love You   Whitney Houston   
18  I Will Always Love You   Whitney Houston   
19  I Will Always 

In [48]:
# FUNCTION TO CALCULATE DIVERSITY SCORE
def calculate_diversity(top_indices, movie_embeddings):
    """
    Diversity of a recommendation list: the mean of (1 - cosine similarity) over
    every unordered pair of recommended movies.

    :param top_indices: Row positions of the recommended movies in ``movie_embeddings``
    :param movie_embeddings: Array of movie embeddings, indexable by ``top_indices``
    :return: Mean pairwise dissimilarity of the recommended movies
    """
    picked_vectors = movie_embeddings[top_indices]
    pairwise_sim = cosine_similarity(picked_vectors)

    # k=1 SKIPS THE DIAGONAL, SO EACH PAIR OF DISTINCT MOVIES IS COUNTED EXACTLY ONCE
    pair_rows, pair_cols = np.triu_indices(len(top_indices), k=1)
    pair_distances = 1 - pairwise_sim[pair_rows, pair_cols]
    return np.mean(pair_distances)

# FUNCTION TO CALCULATE NOVELTY SCORE
def calculate_novelty(top_indices, popularity_scores):
    """
    Novelty of a recommendation list: the mean of (1 - popularity) over the
    recommended items.

    NOTE(review): the result is only a sensible novelty value if the popularity
    scores are scaled to [0, 1] - confirm against the preprocessing step.

    :param top_indices: Positions of the recommended items in ``popularity_scores``
    :param popularity_scores: Array of popularity scores, indexable by ``top_indices``
    :return: Mean of (1 - popularity) for the recommended items
    """
    picked_popularity = popularity_scores[top_indices]
    unpopularity = 1 - picked_popularity
    return np.mean(unpopularity)

# DIVERSITY AND NOVELTY TABLE CREATION
def create_diversity_novelty_table(songs_to_recommend, music_df, movies_df, similarity_matrix, movie_embeddings, top_n=5):
    """
    Create a table of diversity and novelty scores for each input song.

    For every song that is found in ``music_df`` the ``top_n`` most similar movies
    are selected from ``similarity_matrix`` and scored with ``calculate_diversity``
    and ``calculate_novelty``. Songs that cannot be found are reported and skipped.

    NOTE(review): novelty is computed from the 'IMDb Rating' column as
    1 - rating, which presumes ratings scaled to [0, 1] - confirm against the
    preprocessing step.

    :param songs_to_recommend: List of dicts with the keys 'track' and 'artist'
    :param music_df: Songs DataFrame used to look up each input song
    :param movies_df: Movies DataFrame; its 'IMDb Rating' column is the popularity score
    :param similarity_matrix: Song-by-movie similarity matrix (rows are songs)
    :param movie_embeddings: Array of movie embeddings, one row per movie
    :param top_n: Number of top recommendations to score per song (default 5)
    :return: DataFrame with the columns 'Input Song', 'Input Artist',
             'Diversity Score', 'Novelty Score'; scores are text rounded to two decimals
    """
    table_columns = ["Input Song", "Input Artist", "Diversity Score", "Novelty Score"]

    # ROWS ARE COLLECTED IN A LIST AND TURNED INTO A DATAFRAME ONCE; APPENDING WITH
    # pd.concat INSIDE THE LOOP RE-COPIES THE TABLE EVERY TIME AND CONCATENATING ONTO
    # AN EMPTY FRAME MAY EMIT A FutureWarning ON RECENT PANDAS VERSIONS
    table_rows = []

    for song in songs_to_recommend:
        user_track = song["track"]
        user_artist = song["artist"]

        # FIND SONG INDEX
        song_index = find_song_index(user_track, user_artist, music_df)

        if song_index is None:
            print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
            continue

        # GET SIMILARITY SCORES FOR THIS SONG
        sim_scores = similarity_matrix[song_index]

        # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
        top_indices = np.argsort(sim_scores)[::-1][:top_n]  # GETTING TOP N RECOMMENDATIONS

        # CALCULATE DIVERSITY AND NOVELTY
        diversity_score = calculate_diversity(top_indices, movie_embeddings)
        novelty_score = calculate_novelty(top_indices, movies_df['IMDb Rating'].to_numpy())

        # Add to diversity and novelty table
        table_rows.append({
            "Input Song": user_track,
            "Input Artist": user_artist,
            "Diversity Score": f"{diversity_score:.2f}",
            "Novelty Score": f"{novelty_score:.2f}"
        })

    return pd.DataFrame(table_rows, columns=table_columns)

# Scores the same input songs as the recommendation cell above, reusing the
# song-by-movie similarity matrix and the movie embeddings computed earlier.
diversity_novelty_table = create_diversity_novelty_table(songs_to_recommend, music_df, movies_df, similarity_matrix, movie_embeddings)

# PRINTING THE DIVERSITY AND NOVELTY SCORES
# .head() shows at most the first five rows of the table.
print("\nDiversity and Novelty Scores Table:\n")
print(diversity_novelty_table.head())


Diversity and Novelty Scores Table:

               Input Song      Input Artist Diversity Score Novelty Score
0                 Perfect        Ed Sheeran            0.47          0.37
1            The Mountain  Three Days Grace            0.58          0.40
2                    Stan            Eminem            0.64          0.44
3  I Will Always Love You   Whitney Houston            0.42          0.41
4              Wake Me Up            Avicii            0.51          0.49
