## THIS NOTEBOOK FOCUSES ON GENERATING TEXT EMBEDDINGS USING SENTENCE BERT, COMPUTING COSINE SIMILARITY AND CARRYING OUT THE RECOMMENDATION PROCESS
### THIS IS PART OF AN ABLATION STUDY WHERE I CHECK THE QUALITY OF RECOMMENDATIONS GENERATED USING ONLY THE LYRICS/PLOT COLUMNS

### LOADING PREPROCESSED DATASETS

In [72]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# READ THE PREPROCESSED SONG TABLE (SPOTIFY + GENIUS FILE) INTO A DATAFRAME
music_df = pd.read_csv(
    "my_preprocessed_SPOTIFY_GENIUS_SONG_DATASET.csv"
)

# READ THE PREPROCESSED MOVIE TABLE (OMDB FILE) INTO A DATAFRAME
movies_df = pd.read_csv(
    "MY_preprocessed_omdb_movie_dataset.csv"
)

### LOADING THE SENTENCE BERT MODEL

In [74]:
# INSTANTIATE THE SENTENCE-BERT ENCODER USED FOR BOTH THE LYRICS AND THE PLOT EMBEDDINGS
model = SentenceTransformer('all-Mpnet-base-v2')
print("Sentence BERT model 'all-Mpnet-base-v2' loaded successfully!")

Sentence BERT model 'all-Mpnet-base-v2' loaded successfully!


### GENERATING EMBEDDINGS FOR SONG LYRICS AND MOVIE PLOTS USING MODEL - 'all-Mpnet-base-v2'

In [76]:
# FUNCTION TO TURN A COLUMN OF TEXTS INTO EMBEDDINGS WITH THE GIVEN MODEL
def generate_embeddings(data, model):
    """
    Encode a collection of texts with `model` and return the result.

    Parameters
    ----------
    data : object exposing ``.tolist()`` (the notebook passes a DataFrame column)
        The texts to embed; converted to a plain list before encoding.
    model : object exposing ``.encode(list, convert_to_numpy=...)``
        The encoder; the notebook passes the loaded SentenceTransformer.

    Returns
    -------
    Whatever ``model.encode`` returns when called with ``convert_to_numpy=True``.
    """
    texts = data.tolist()
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

# ENCODE THE PROCESSED LYRICS AND THE PROCESSED PLOTS WITH THE SAME MODEL - 'all-Mpnet-base-v2'
print("Generating embeddings for music lyrics...")
music_lyrics_embeddings = generate_embeddings(
    data=music_df['Processed_Lyrics'], model=model
)
print("Generating embeddings for movie plots...")
movie_plot_embeddings = generate_embeddings(
    data=movies_df['Processed_Plot'], model=model
)
print("Embeddings generated for music lyrics and movie plots !")

Generating embeddings for music lyrics...
Generating embeddings for movie plots...
Embeddings generated for music lyrics and movie plots !


### COMPUTING THE COSINE SIMILARITY MATRIX FOR EMBEDDINGS GENERATED USING MODEL - 'all-Mpnet-base-v2'

In [78]:
# BUILD THE SONG-BY-MOVIE COSINE SIMILARITY MATRIX (ONE ROW PER SONG, ONE COLUMN PER MOVIE)
print("Computing similarity matrix...")
similarity_matrix = cosine_similarity(
    X=music_lyrics_embeddings, Y=movie_plot_embeddings
)
print("similarity matrix created!")

Computing similarity matrix...
similarity matrix created!


### GENERATING RECOMMENDATIONS BY GETTING SONG INPUT FROM USER

In [60]:
import pandas as pd
import numpy as np

# FUNCTION TO NORMALISE A USER-TYPED STRING BEFORE MATCHING
def preprocess_input(input_string):
    """
    Return `input_string` lowercased with leading/trailing whitespace removed.
    """
    lowered = input_string.lower()
    return lowered.strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    """
    Look up a song by track name and artist, ignoring case and surrounding whitespace.

    Parameters
    ----------
    track_name : str
        Song title as typed by the user.
    artist_name : str
        Artist name as typed by the user.
    music_df : pandas.DataFrame
        Song table; must contain the 'Track Name' and 'Artist' columns.

    Returns
    -------
    The index label of the first matching row, or None when no row matches.

    Notes
    -----
    The returned value is an index *label*. The notebook also uses it as a row
    position into the similarity matrix, which only lines up when `music_df`
    carries the default 0..n-1 index.
    """
    # NORMALISE THE USER INPUT (LOWERCASE + STRIP)
    track_name = preprocess_input(track_name)
    artist_name = preprocess_input(artist_name)

    # NORMALISE THE DATASET COLUMNS THE SAME WAY AS THE INPUT, SO THAT VALUES
    # STORED WITH STRAY LEADING/TRAILING WHITESPACE STILL MATCH
    dataset_tracks = music_df['Track Name'].str.lower().str.strip()
    dataset_artists = music_df['Artist'].str.lower().str.strip()

    # FINDING THE MATCHING ROWS IN THE DATASET
    matches = music_df[(dataset_tracks == track_name) & (dataset_artists == artist_name)]

    if matches.empty:
        return None  # SONG NOT FOUND
    return matches.index[0]  # INDEX LABEL OF THE FIRST MATCH


# CONFIGURATION: HOW MANY MOVIES TO RECOMMEND AND WHERE THE FEEDBACK LOG IS PERSISTED
TOP_N = 5
LOG_PATH = "recommendation_logs.csv"
LOG_COLUMNS = ["User Song", "User Artist", "Recommendations", "User Feedback"]

# USER INPUT (STRIPPED SO THE ECHOED TITLE AND THE LOG DO NOT CARRY STRAY WHITESPACE)
user_track = input("Enter the song name: ").strip()
user_artist = input("Enter the artist name: ").strip()

# FIND SONG INDEX
song_index = find_song_index(user_track, user_artist, music_df)

if song_index is None:
    print("Sorry, the song could not be found in the dataset.")
else:
    # GET SIMILARITY SCORES FOR THIS SONG (ONE SCORE PER MOVIE)
    sim_scores = similarity_matrix[song_index]

    # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER AND KEEP THE TOP ONES
    top_indices = np.argsort(sim_scores)[::-1][:TOP_N]

    # GETTING RECOMMENDED MOVIES
    # (the bounds check guards against movies_df being shorter than the matrix)
    recommendations = [
        {
            "Title": movies_df['Title'].iloc[i],
            "Genre": movies_df['Genre'].iloc[i],
            "Plot": movies_df['Plot'].iloc[i],
            "Similarity": sim_scores[i]
        }
        for i in top_indices if i < len(movies_df)
    ]

    # DISPLAY RECOMMENDATIONS
    print(f"\n✨ Recommendations for '{user_track}' by {user_artist} ✨")
    print(f"Using BERT: 'all-Mpnet-base-v2'\n")
    for rec in recommendations:
        print(f"🎬 **{rec['Title']}**")
        print(f"   - Genre: {rec['Genre']}")
        print(f"   - Plot: {rec['Plot']}")
        print(f"   - Similarity Score: {rec['Similarity']:.2f}")
        print("-" * 60)

    # OBTAIN THE RUNNING LOG: REUSE THE ONE ALREADY IN THE KERNEL IF PRESENT,
    # OTHERWISE RELOAD IT FROM DISK, OTHERWISE START AN EMPTY ONE.
    # (previously `logs_df` was never defined, so a fresh kernel raised NameError)
    logs_df = globals().get("logs_df")
    if logs_df is None:
        try:
            logs_df = pd.read_csv(LOG_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            logs_df = pd.DataFrame(columns=LOG_COLUMNS)

    # ASK FOR USER FEEDBACK BEFORE BUILDING THE ENTRY, SO THE ROW IS COMPLETE WHEN APPENDED
    feedback = input("\nAre you happy with these recommendations? (yes/no): ").strip().lower()

    # LOGGING USER INPUT, RECOMMENDATIONS AND FEEDBACK
    log_entry = {
        "User Song": user_track,
        "User Artist": user_artist,
        "Recommendations": [(rec["Title"], rec["Genre"]) for rec in recommendations],
        "User Feedback": feedback
    }
    log_entry_df = pd.DataFrame([log_entry])
    if logs_df.empty:
        # Skipping the concat avoids pandas' deprecated empty-frame concatenation path
        logs_df = log_entry_df
    else:
        logs_df = pd.concat([logs_df, log_entry_df], ignore_index=True)

    # SAVE LOGS TO A FILE
    logs_df.to_csv(LOG_PATH, index=False)
    print("\nYour feedback has been recorded. Thank you!")


Enter the song name:  perfect 
Enter the artist name:  ed sheeran



✨ Recommendations for 'perfect ' by ed sheeran ✨
Using BERT: 'all-Mpnet-base-v2'

🎬 **the disappearance of eleanor rigby: them**
   - Genre: drama, romance
   - Plot: One couple's story as they try to reclaim the life and love they once knew and pick up the pieces of a past that may be too far gone.
   - Similarity Score: 0.50
------------------------------------------------------------
🎬 **the last song**
   - Genre: drama, music, romance
   - Plot: A rebellious girl is sent to a Southern beach town for the summer to stay with her father. Through their mutual love of music, the estranged duo learn to reconnect.
   - Similarity Score: 0.43
------------------------------------------------------------
🎬 **song to song**
   - Genre: drama, music, romance
   - Plot: Two intersecting love triangles. Obsession and betrayal set against the music scene in Austin, Texas.
   - Similarity Score: 0.42
------------------------------------------------------------
🎬 **the broken circle breakdown**
 


Are you happy with these recommendations? (yes/no):  no



Your feedback has been recorded. Thank you!


### Ablation Study: Recommendations are generated for five themes and genres of songs.
### Evaluation is done based on diversity and novelty scores.

In [155]:
import pandas as pd
import numpy as np

# FUNCTION TO NORMALISE A USER-TYPED STRING BEFORE MATCHING
def preprocess_input(input_string):
    """
    Return `input_string` lowercased with leading/trailing whitespace removed.
    """
    lowered = input_string.lower()
    return lowered.strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    """
    Look up a song by track name and artist, ignoring case and surrounding whitespace.

    Parameters
    ----------
    track_name : str
        Song title to search for.
    artist_name : str
        Artist name to search for.
    music_df : pandas.DataFrame
        Song table; must contain the 'Track Name' and 'Artist' columns.

    Returns
    -------
    The index label of the first matching row, or None when no row matches.

    Notes
    -----
    The returned value is an index *label*. The notebook also uses it as a row
    position into the similarity matrix, which only lines up when `music_df`
    carries the default 0..n-1 index.
    """
    # NORMALISE THE REQUESTED NAMES (LOWERCASE + STRIP)
    track_name = preprocess_input(track_name)
    artist_name = preprocess_input(artist_name)

    # NORMALISE THE DATASET COLUMNS THE SAME WAY AS THE INPUT, SO THAT VALUES
    # STORED WITH STRAY LEADING/TRAILING WHITESPACE STILL MATCH
    dataset_tracks = music_df['Track Name'].str.lower().str.strip()
    dataset_artists = music_df['Artist'].str.lower().str.strip()

    # FINDING THE MATCHING ROWS IN THE DATASET
    matches = music_df[(dataset_tracks == track_name) & (dataset_artists == artist_name)]

    if matches.empty:
        return None  # SONG NOT FOUND
    return matches.index[0]  # INDEX LABEL OF THE FIRST MATCH

# FUNCTION TO ARRANGE ONE SONG'S RECOMMENDATIONS AS A DATAFRAME
def display_recommendations_df(recommendations, user_track, user_artist):
    """
    Build a table of recommendations tagged with the song they were generated for.

    Parameters
    ----------
    recommendations : list of dict
        One dict per recommended movie; each must provide the keys
        'Title', 'Genre', 'Plot Summary' and 'Similarity Score'.
    user_track : str
        Song title, repeated on every row in the 'Input Song' column.
    user_artist : str
        Artist name, repeated on every row in the 'Input Artist' column.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'Input Song', 'Input Artist', 'Title', 'Genre',
        'Plot Summary', 'Similarity Score'. Any other keys are dropped.
    """
    column_order = [
        "Input Song",
        "Input Artist",
        "Title",
        "Genre",
        "Plot Summary",
        "Similarity Score",
    ]

    # Tag every recommendation row with the originating song and artist
    input_tags = {"Input Song": user_track, "Input Artist": user_artist}
    tagged = pd.DataFrame(recommendations).assign(**input_tags)

    return tagged[column_order]

# SONGS TO RECOMMEND (ONE PER THEME/GENRE OF THE ABLATION STUDY)
songs_to_recommend = [
    {"track": "Perfect", "artist": "Ed Sheeran"},
    {"track": "The Mountain", "artist": "Three Days Grace"},
    {"track": "Stan", "artist": "Eminem"},
    {"track": "I Will Always Love You", "artist": "Whitney Houston"},
    {"track": "Wake Me Up", "artist": "Avicii"}
]

# ONE RECOMMENDATION TABLE PER SONG IS COLLECTED HERE AND CONCATENATED ONCE AFTER THE LOOP
# (growing a DataFrame with pd.concat inside the loop is quadratic and starts from an
# empty frame, which pandas has deprecated)
per_song_tables = []

for song in songs_to_recommend:
    user_track = song["track"]
    user_artist = song["artist"]

    # FIND SONG INDEX
    song_index = find_song_index(user_track, user_artist, music_df)

    if song_index is None:
        print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
        continue

    # GET SIMILARITY SCORES FOR THIS SONG (ONE SCORE PER MOVIE)
    sim_scores = similarity_matrix[song_index]

    # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
    top_indices = np.argsort(sim_scores)[::-1][:5]  # GETTING TOP 5 RECOMMENDATIONS

    # GETTING RECOMMENDED MOVIES
    # (the bounds check guards against movies_df being shorter than the matrix)
    recommendations = [
        {
            "Title": movies_df['Title'].iloc[i],
            "Genre": movies_df['Genre'].iloc[i],
            "Plot Summary": movies_df['Plot'].iloc[i],
            "Similarity Score": f"{sim_scores[i]:.2f}"
        }
        for i in top_indices if i < len(movies_df)
    ]

    # ARRANGE THIS SONG'S RECOMMENDATIONS AS A TABLE AND KEEP IT FOR THE FINAL CONCAT
    per_song_tables.append(display_recommendations_df(recommendations, user_track, user_artist))

# CONSOLIDATE ALL SONG RECOMMENDATIONS (EMPTY FRAME WHEN NO SONG WAS FOUND)
if per_song_tables:
    all_recommendations_df = pd.concat(per_song_tables, ignore_index=True)
else:
    all_recommendations_df = pd.DataFrame()

# SAVING RECOMMENDATIONS TO CSV
all_recommendations_df.to_csv('SBERT_recommendations_ablation_study.csv', index=False)

# PRINTING ALL RECOMMENDATIONS IN A CONSOLIDATED DATAFRAME
pd.set_option('display.max_colwidth', None)  # TO DISPLAY FULL PLOT SUMMARIES
print("Recommendations List:")
print(all_recommendations_df.head(25))



Recommendations List:
                Input Song      Input Artist  \
0                  Perfect        Ed Sheeran   
1                  Perfect        Ed Sheeran   
2                  Perfect        Ed Sheeran   
3                  Perfect        Ed Sheeran   
4                  Perfect        Ed Sheeran   
5             The Mountain  Three Days Grace   
6             The Mountain  Three Days Grace   
7             The Mountain  Three Days Grace   
8             The Mountain  Three Days Grace   
9             The Mountain  Three Days Grace   
10                    Stan            Eminem   
11                    Stan            Eminem   
12                    Stan            Eminem   
13                    Stan            Eminem   
14                    Stan            Eminem   
15  I Will Always Love You   Whitney Houston   
16  I Will Always Love You   Whitney Houston   
17  I Will Always Love You   Whitney Houston   
18  I Will Always Love You   Whitney Houston   
19  I Will Always 

In [153]:
# FUNCTION TO CALCULATE DIVERSITY SCORE
def calculate_diversity(top_indices, movie_embeddings):
    """
    Mean pairwise cosine dissimilarity (1 - cosine similarity) among the recommended movies.

    Parameters
    ----------
    top_indices : array-like of int
        Row positions of the recommended movies in `movie_embeddings`.
    movie_embeddings : numpy.ndarray
        Embedding matrix indexed by those positions (one row per movie).

    Returns
    -------
    float
        Average of ``1 - cosine_similarity`` over every unordered pair of
        recommended movies. Returns 0.0 when fewer than two movies are given:
        there are no pairs, and averaging an empty set would otherwise produce
        NaN together with a NumPy RuntimeWarning.
    """
    # A LIST WITH FEWER THAN TWO ITEMS HAS NO PAIRS TO COMPARE
    if len(top_indices) < 2:
        return 0.0

    recommended_embeddings = movie_embeddings[top_indices]

    # LOCAL NAME DELIBERATELY DIFFERS FROM THE NOTEBOOK-LEVEL `similarity_matrix`
    pairwise_matrix = cosine_similarity(recommended_embeddings)

    # STRICT UPPER TRIANGLE (k=1): EACH UNORDERED PAIR ONCE, DIAGONAL EXCLUDED
    upper_triangle_indices = np.triu_indices(len(top_indices), k=1)
    pairwise_similarities = pairwise_matrix[upper_triangle_indices]
    pairwise_dissimilarities = 1 - pairwise_similarities
    return np.mean(pairwise_dissimilarities)

# FUNCTION TO CALCULATE NOVELTY SCORE
def calculate_novelty(top_indices, popularity_scores):
    """
    Mean of ``1 - popularity`` over the recommended movies.

    Parameters
    ----------
    top_indices : array-like of int
        Positions of the recommended movies in `popularity_scores`.
    popularity_scores : numpy.ndarray
        One score per movie. The notebook passes the 'IMDb Rating' column here.
        NOTE(review): the formula presumably expects scores on a 0-1 scale - confirm
        against the preprocessing step.

    Returns
    -------
    The average of ``1 - score`` across the selected movies.
    """
    selected_scores = popularity_scores[top_indices]
    per_movie_novelty = 1 - selected_scores
    return np.mean(per_movie_novelty)

# DIVERSITY AND NOVELTY TABLE CREATION
def create_diversity_novelty_table(songs_to_recommend, music_df, movies_df, similarity_matrix, movie_embeddings, top_n=5):
    """
    Create a table of diversity and novelty scores for each input song.

    Parameters
    ----------
    songs_to_recommend : list of dict
        Each dict provides the keys 'track' and 'artist'.
    music_df : pandas.DataFrame
        Song table searched by `find_song_index`.
    movies_df : pandas.DataFrame
        Movie table; its 'IMDb Rating' column is passed to `calculate_novelty`.
    similarity_matrix : numpy.ndarray
        Song-by-movie similarity scores; rows are looked up with the value
        returned by `find_song_index`.
    movie_embeddings : numpy.ndarray
        Movie embeddings passed to `calculate_diversity`.
    top_n : int, default 5
        Number of top-scoring movies evaluated per song.

    Returns
    -------
    pandas.DataFrame
        Columns 'Input Song', 'Input Artist', 'Diversity Score', 'Novelty Score'
        (scores formatted as strings with two decimals), one row per song found.
        Songs that are not found are reported with a printed message and skipped.
    """
    table_columns = ["Input Song", "Input Artist", "Diversity Score", "Novelty Score"]

    # ROWS ARE COLLECTED IN A LIST AND TURNED INTO A DATAFRAME ONCE AT THE END
    # (row-by-row pd.concat is quadratic and starts from an empty frame, which
    # pandas has deprecated)
    rows = []

    for song in songs_to_recommend:
        user_track = song["track"]
        user_artist = song["artist"]

        # FIND SONG INDEX
        song_index = find_song_index(user_track, user_artist, music_df)

        if song_index is None:
            print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
            continue

        # GET SIMILARITY SCORES FOR THIS SONG
        sim_scores = similarity_matrix[song_index]

        # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER AND KEEP THE TOP ONES
        top_indices = np.argsort(sim_scores)[::-1][:top_n]

        # CALCULATE DIVERSITY AND NOVELTY
        diversity_score = calculate_diversity(top_indices, movie_embeddings)
        novelty_score = calculate_novelty(top_indices, movies_df['IMDb Rating'].to_numpy())

        rows.append({
            "Input Song": user_track,
            "Input Artist": user_artist,
            "Diversity Score": f"{diversity_score:.2f}",
            "Novelty Score": f"{novelty_score:.2f}"
        })

    # PASSING `columns` KEEPS THE HEADER EVEN WHEN NO SONG WAS FOUND
    return pd.DataFrame(rows, columns=table_columns)

# SCORE THE ABLATION SONGS USING THE LYRICS/PLOT SIMILARITY MATRIX AND THE PLOT EMBEDDINGS
diversity_novelty_table = create_diversity_novelty_table(
    songs_to_recommend=songs_to_recommend,
    music_df=music_df,
    movies_df=movies_df,
    similarity_matrix=similarity_matrix,
    movie_embeddings=movie_plot_embeddings,
)

# SHOW THE FIRST ROWS OF THE DIVERSITY AND NOVELTY TABLE
print("\nDiversity and Novelty Scores Table:\n")
print(diversity_novelty_table.head())


Diversity and Novelty Scores Table:

               Input Song      Input Artist Diversity Score Novelty Score
0                 Perfect        Ed Sheeran            0.54          0.50
1            The Mountain  Three Days Grace            0.69          0.47
2                    Stan            Eminem            0.60          0.46
3  I Will Always Love You   Whitney Houston            0.57          0.49
4              Wake Me Up            Avicii            0.57          0.53
