### Baseline System for Comparision: TF-IDF Embeddings

In [43]:
import pandas as pd
import numpy as np


# LOADING THE MUSIC DATASET
music_df = pd.read_csv("my_preprocessed_SPOTIFY_GENIUS_SONG_DATASET.csv")

# LOADING THE MOVIES DATASET
movies_df = pd.read_csv("MY_preprocessed_omdb_movie_dataset.csv")

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
music_lyrics = music_df["Processed_Lyrics"]
movie_plots = movies_df["Processed_Plot"]

# COMBINING ALL TEXT DATA FOR TF-IDF TRAINING
combined_corpus = music_lyrics + movie_plots
combined_corpus = [str(doc) if pd.notna(doc) else "" for doc in combined_corpus]

# INITIALIZING AND FITTING THE TF-IDF VECTORIZER\
vectorizer = TfidfVectorizer(max_features=5000)  
music_tfidf_embeddings = vectorizer.fit_transform(music_lyrics)
movies_tfidf_embeddings = vectorizer.fit_transform(movie_plots)
print("embeddings generated using tf-idf!")

# COMPUTING SIMILARITY MATRIX
similarity_matrix = cosine_similarity(music_tfidf_embeddings, movies_tfidf_embeddings)
print("similarity matrix generated!")

embeddings generated using tf-idf!
similarity matrix generated!


In [46]:
import pandas as pd
import numpy as np

# FUNCTION TO PREPROCESS USER INPUT 
def preprocess_input(input_string):
    return input_string.lower().strip()

# FUNCTION TO FIND SONG INDEX BASED ON USER INPUT
def find_song_index(track_name, artist_name, music_df):
    # PREPROCESSING INPUT STRING
    track_name = preprocess_input(track_name)
    artist_name = preprocess_input(artist_name)

    # FINDING THE MATCHING INDEX IN THE DATASET
    matches = music_df[
        (music_df['Track Name'].str.lower() == track_name) & 
        (music_df['Artist'].str.lower() == artist_name)
    ]

    if matches.empty:
        return None  # SONG NOT FOUND
    return matches.index[0]  # RETURNING THE INDEX OF THE SONG IF A MATCH IS FOUND

# FUNCTION TO DISPLAY RECOMMENDATIONS IN A DATAFRAME
def display_recommendations_df(recommendations, user_track, user_artist):
    """
    Display recommendations in a DataFrame table format.
    """
    # Create a DataFrame from recommendations
    recommendations_df = pd.DataFrame(recommendations)
    
    # Add user song and artist as columns for clarity
    recommendations_df["Input Song"] = user_track
    recommendations_df["Input Artist"] = user_artist
    
    # Reorder columns
    return recommendations_df[
        ["Input Song", "Input Artist", "Title", "Genre", "Plot Summary", "Similarity Score"]
    ]

# SONGS TO RECOMMEND
songs_to_recommend = [
    {"track": "Perfect", "artist": "Ed Sheeran"},
    {"track": "The Mountain", "artist": "Three Days Grace"},
    {"track": "Stan", "artist": "Eminem"},
    {"track": "I Will Always Love You", "artist": "Whitney Houston"},
    {"track": "Wake Me Up", "artist": "Avicii"}
]

all_recommendations_df = pd.DataFrame()  # FOR STORING ALL SONG RECOMMENDATIONS

for song in songs_to_recommend:
    user_track = song["track"]
    user_artist = song["artist"]

    # FIND SONG INDEX
    song_index = find_song_index(user_track, user_artist, music_df)

    if song_index is None:
        print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
        continue
    else:
        # GET SONG EXTRACTED KEYWORDS
        extracted_keywords = music_df.loc[song_index, 'Extracted_Keywords']
        
        # GET SIMILARITY SCORES FOR THIS SONG
        sim_scores = similarity_matrix[song_index]
        
        # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
        top_indices = np.argsort(sim_scores)[::-1][:5]  # GETTING TOP 5 RECOMMENDATIONS
        
        # GETTING RECOMMENDED MOVIES
        recommendations = [
            {
                "Title": movies_df['Title'].iloc[i],
                "Genre": movies_df['Genre'].iloc[i],
                "Plot Summary": movies_df['Plot'].iloc[i],
                "Similarity Score": f"{sim_scores[i]:.2f}"
            }
            for i in top_indices if i < len(movies_df)
        ]
        
        # DISPLAY RECOMMENDATIONS
        recommendations_df = display_recommendations_df(recommendations, user_track, user_artist)
        all_recommendations_df = pd.concat([all_recommendations_df, recommendations_df], ignore_index=True)


# PRINTING ALL RECOMMENDATIONS IN A CONSOLIDATED DATAFRAME
pd.set_option('display.max_colwidth', None)  # TO DISPLAY FULL PLOT SUMMARIES
print("Recommendations List using TF-IDF :")
print(all_recommendations_df.head(25))



Recommendations List using TF-IDF :
                Input Song      Input Artist                Title  \
0                  Perfect        Ed Sheeran       made in heaven   
1                  Perfect        Ed Sheeran             song one   
2                  Perfect        Ed Sheeran        sliding doors   
3                  Perfect        Ed Sheeran   500 days of summer   
4                  Perfect        Ed Sheeran    the frozen ground   
5             The Mountain  Three Days Grace         mystic pizza   
6             The Mountain  Three Days Grace      far from heaven   
7             The Mountain  Three Days Grace   revolutionary road   
8             The Mountain  Three Days Grace            disturbia   
9             The Mountain  Three Days Grace   the girl next door   
10                    Stan            Eminem           kick-ass 2   
11                    Stan            Eminem         the notebook   
12                    Stan            Eminem                 safe  

In [48]:
# FUNCTION TO CALCULATE DIVERSITY SCORE
def calculate_diversity(top_indices, movie_embeddings):
    recommended_embeddings = movie_embeddings[top_indices]
    similarity_matrix = cosine_similarity(recommended_embeddings)
    upper_triangle_indices = np.triu_indices(len(top_indices), k=1)
    pairwise_similarities = similarity_matrix[upper_triangle_indices]
    pairwise_dissimilarities = 1 - pairwise_similarities
    return np.mean(pairwise_dissimilarities)

# FUNCTION TO CALCULATE NOVELTY SCORE
def calculate_novelty(top_indices, popularity_scores):
    recommended_popularity = popularity_scores[top_indices]
    return np.mean(1 - recommended_popularity)

# DIVERSITY AND NOVELTY TABLE CREATION
def create_diversity_novelty_table(songs_to_recommend, music_df, movies_df, similarity_matrix, movie_embeddings):
    """
    Create a table of diversity and novelty scores for each input song.
    """
    diversity_novelty_table = pd.DataFrame(columns=["Input Song", "Input Artist", "Diversity Score", "Novelty Score"])

    for song in songs_to_recommend:
        user_track = song["track"]
        user_artist = song["artist"]

        # FIND SONG INDEX
        song_index = find_song_index(user_track, user_artist, music_df)

        if song_index is None:
            print(f"Sorry, the song '{user_track}' by '{user_artist}' could not be found in the dataset.")
            continue
        else:
            # GET SIMILARITY SCORES FOR THIS SONG
            sim_scores = similarity_matrix[song_index]

            # SORT MOVIE INDICES BY SIMILARITY SCORE IN DESCENDING ORDER
            top_indices = np.argsort(sim_scores)[::-1][:5]  # GETTING TOP 5 RECOMMENDATIONS

            # CALCULATE DIVERSITY AND NOVELTY
            diversity_score = calculate_diversity(top_indices, movie_embeddings)
            novelty_score = calculate_novelty(top_indices, movies_df['IMDb Rating'].to_numpy())

            # Add to diversity and novelty table
            entry = {
                "Input Song": user_track,
                "Input Artist": user_artist,
                "Diversity Score": f"{diversity_score:.2f}",
                "Novelty Score": f"{novelty_score:.2f}"
            }
            diversity_novelty_table = pd.concat([diversity_novelty_table, pd.DataFrame([entry])], ignore_index=True)

    return diversity_novelty_table

diversity_novelty_table = create_diversity_novelty_table(songs_to_recommend, music_df, movies_df, similarity_matrix, movies_tfidf_embeddings)

# PRINTING THE DIVERSITY AND NOVELTY SCORES
print("\nDiversity and Novelty Scores Table using TF-IDF system:\n")
print(diversity_novelty_table.head())


Diversity and Novelty Scores Table using TF-IDF system:

               Input Song      Input Artist Diversity Score Novelty Score
0                 Perfect        Ed Sheeran            0.97          0.40
1            The Mountain  Three Days Grace            0.94          0.42
2                    Stan            Eminem            0.97          0.35
3  I Will Always Love You   Whitney Houston            0.98          0.56
4              Wake Me Up            Avicii            0.93          0.29
