INSTALL LIBRARIES

In [1]:
!pip install faiss-gpu
!pip install fastapi nest-asyncio pyngrok uvicorn

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.1


In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
from typing import Optional
import numpy as np
import pickle
import re
import faiss
import logging
from typing import List, Dict, Tuple
from joblib import dump, load


from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import random
from fastapi.middleware.cors import CORSMiddleware
import nest_asyncio
from pyngrok import ngrok
import uvicorn

app = FastAPI()

# middlewares
app.add_middleware(
    CORSMiddleware, # https://fastapi.tiangolo.com/tutorial/cors/
    allow_origins=['*'], # wildcard to allow all, more here - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Origin
    allow_credentials=True, # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Credentials
    allow_methods=['*'], # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Methods
    allow_headers=['*'], # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Access-Control-Allow-Headers
)

import warnings
warnings.filterwarnings('ignore')
import os

In [3]:
files_path = '/kaggle/input'

dataset_file_path = f'{files_path}/song-lyrics-filtered-seven-hundred-mb'

checkpoints_input_path = f'{files_path}/preprocessed_data_checkpoints/other/default/4'
val_checkpoints_input_path = f'{files_path}/song-recommendation-val-inputs/other/default/4'

song_embeddings_input_path = f'{files_path}/song-embeddings-mpnet/other/default/2'
val_song_embeddings_input_path = f'{files_path}/val-song-embeddings-mpnet/other/default/1'

autoencoder_input_path = f'{files_path}/song-recommendation-autoencoder-large/other/default/1'
val_autoencoder_input_path = f'{files_path}/val-song-recommendation-autoencoder/other/default/1'

song_recommender_cache_input_path = f'{files_path}/song-recommender-cache-large/other/default/1'
reordered_lyrics_input_path = f'{files_path}/song-recommendation-re-ordered-lyrics/other/default/1/reordered_lyrics.pkl'

load_preprocessed_checkpoints = True
working_files_path = '/kaggle/working'
vector_files_path = 'vectors'
song_recommender_cache_out_path = 'song_recommender_cache'
data_file_name = 'song_lyrics_filtered_seven_hundred_mb' #@param {type:"string"}

import os
print(os.getcwd())
directories = [vector_files_path, song_recommender_cache_out_path]
for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)


/kaggle/working


In [4]:
def load_vectors(method, data_file_name):
    """
    Load the existing vectors.
    """
    # Define file names based on the input convention
    file_base_name = f'{method}_{data_file_name}'

    # Load sentences file
    with open(f"{vector_files_path}/{file_base_name}.file", "rb") as f:
        vectors = pickle.load(f)

    print(f"Loaded vector file for {file_base_name}")

    return vectors

def load_vectors_from_path(path):
    with open(path, "rb") as f:
        vectors = pickle.load(f)

    print(f"Loaded vector file from {path}")

    return vectors

In [5]:
class SongRecommender:
    def __init__(self, latent_representations: np.ndarray, song_lyrics: List[str], top_k: int = 10, cache_dir_in=song_recommender_cache_input_path, cache_dir_out=song_recommender_cache_out_path):
        """
        Initialize the recommender with latent representations and lyrics data.

        Parameters:
        - latent_representations: Latent representations of songs.
        - song_lyrics: List of lyrics corresponding to each song.
        - top_k: Number of top recommendations to return.
        - cache_dir_in: Directory where cache files will be read from.
        - cache_dir_out: Directory where cache files will be saved.
        """
        self.latent_representations = latent_representations
        self.song_lyrics = song_lyrics
        self.top_k = top_k
        self.index_file_in = f"{cache_dir_in}/faiss_index.idx"
        self.index_file_out = f"{cache_dir_out}/faiss_index.idx"

        print("Building FAISS index for recommendation")
        self.faiss_index = self._build_faiss_index()

        # densities = self.compute_neighborhood_density(top_k=50, distance_threshold=0.3)
        # top_seeds_by_density = [song_id for song_id, _ in densities[:20]]  # Top 20 seeds
        # print(f"Top 10 seeds by neighborhood density: {top_seeds_by_density}")

    def _build_faiss_index(self):
        """Build or load a FAISS index with optimized parameters for a large dataset."""
        index = self._load_faiss_index()
        if not index:  # Load existing index if available
            d = self.latent_representations.shape[1]
            nlist = 4096  # Number of Voronoi cells
            m = 16  # Number of sub-quantizers for PQ
            index = faiss.IndexIVFPQ(faiss.IndexFlatIP(d), d, nlist, m, 8)  # 8 bits per sub-vector
            index.train(self.latent_representations)
            index.add(self.latent_representations)
            index.nprobe = 20  # Number of clusters to search during querying
            faiss.write_index(index, self.index_file_out)  # Save for future use
            print("FAISS index built and saved.")
        return index

    def _load_faiss_index(self):
        """Load the FAISS index if it exists."""
        try:
            faiss_index = faiss.read_index(self.index_file_in)
            print(f"FAISS index loaded from {self.index_file_in}")
            return faiss_index
        except:
            print(f"No pre-existing FAISS index found at {self.index_file_in}; building a new one.")
            return None
        
    def recommend(self, song_id: int, novelty: float = 0.5) -> List[int]:
        """
        Recommend songs based on similarity and novelty.
    
        Parameters:
        - song_id: ID of the song to base recommendations on.
        - novelty: Weight to control novelty (0 = no novelty, 1 = full novelty).
    
        Returns:
        - List of tuples (recommended_song_id, final_score).
        """
        print(f"Generating recommendations for song_id {song_id} and novelty {novelty}...")
    
        # Query the FAISS index to get top-k results
        query_vector = self.latent_representations[song_id:song_id + 1]
        distances, indices = self.faiss_index.search(query_vector, self.top_k)
    
        if not distances.any():
            return []  # Return empty if no results are found.
    
        # Flatten results
        distances = distances.flatten()
        indices = indices.flatten()
    
        # Normalize distances to [0, 1], higher distance = higher novelty
        max_distance = distances.max()
        min_distance = distances.min()
        normalized_distances = (distances - min_distance) / (max_distance - min_distance + 1e-10)
    
        # Compute similarity and novelty scores
        similarity_scores = 1 - normalized_distances
        novelty_scores = 1 - normalized_distances  # Higher distance = higher novelty
    
        # Combine similarity and novelty based on the novelty weight
        final_scores = (1 - novelty) * similarity_scores + novelty * novelty_scores
    
        # Create a list of (song_id, final_score), filter out the seed song
        recommendations = [(idx, score) for idx, score in zip(indices, final_scores) if idx != song_id]
        
        # Sort by final score, descending
        recommendations.sort(key=lambda x: x[1], reverse=True)
    
        return recommendations


    def compute_neighborhood_density(self, top_k: int = 100, distance_threshold: float = 0.5) -> List[Tuple[int, float]]:
        """
        Compute the neighborhood density for each song in the FAISS index.
    
        Parameters:
        - top_k: Number of nearest neighbors to consider for density calculation.
        - distance_threshold: Distance threshold to define a 'dense' neighborhood.
    
        Returns:
        - List of tuples (song_id, density_score) sorted by density.
        """
        densities = []
        for song_id in tqdm(range(len(self.latent_representations)), desc="Precomputing neighbors"):
            query_vector = self.latent_representations[song_id:song_id + 1]
            distances, _ = self.faiss_index.search(query_vector, top_k)
            
            # Count neighbors within the distance threshold
            density = np.sum(distances.flatten() < distance_threshold)
            densities.append((song_id, density))
    
        # Sort songs by density in descending order
        return sorted(densities, key=lambda x: x[1], reverse=True)


    def _save_recommendations_to_file(self, song_id: int, recommendations: List[int]):
        """
        Store the seed song and its recommendations along with their lyrics and scores to a file.
    
        Parameters:
        - song_id: ID of the seed song.
        - recommendations: List of recommended songs and their scores.
        """
        file_path = "song_recommendations.json"
    
        recommended_song_ids = [int(rec[0]) for rec in recommendations]  # Convert to Python int
        recommended_scores = [float(rec[1]) for rec in recommendations]  # Convert to Python float
    
        # Prepare metadata
        valid_recommendations = []
        for i in range(len(recommendations)):
            if recommended_song_ids[i] < len(self.song_lyrics):  # Ensure valid index
                valid_recommendations.append({
                    "song_id": recommended_song_ids[i],
                    "song_lyrics": self.song_lyrics[recommended_song_ids[i]],
                    "score": recommended_scores[i]
                })
            else:
                print(f"Warning: Skipping invalid song ID {recommended_song_ids[i]}")
    
        song_metadata = {
            "seed_song_id": song_id,
            "seed_song_lyrics": self.song_lyrics[song_id] if song_id < len(self.song_lyrics) else "Unknown",
            "recommendations": valid_recommendations
        }
    
        # Ensure the directory exists
        # os.makedirs(os.path.dirname(file_path), exist_ok=True)
    
        # Write the metadata to a JSON file
        with open(file_path, "w") as f:
            json.dump(song_metadata, f, indent=4)
        print(f"Recommendations saved to {file_path}")



In [6]:
latent_representations_file = f'{autoencoder_input_path}/autoencoder_train_latent_representations_song_lyrics_filtered_seven_hundred_mb.file'
latent_representations = load_vectors_from_path(latent_representations_file)
# val_latent_representations_file = f'{val_autoencoder_input_path}/autoencoder_val_latent_representations_song_lyrics_filtered_seven_hundred_mb.file'
# val_latent_representations = load_vectors_from_path(val_latent_representations_file)
lyrics = load_vectors_from_path(reordered_lyrics_input_path)
recommender = SongRecommender(
    latent_representations = latent_representations,
    song_lyrics = lyrics
)
recommendations = recommender.recommend(song_id=10, novelty=0.1)
logging.info("Recommended songs for song_id 10: %s", recommendations)


Loaded vector file from /kaggle/input/song-recommendation-autoencoder-large/other/default/1/autoencoder_train_latent_representations_song_lyrics_filtered_seven_hundred_mb.file
Loaded vector file from /kaggle/input/song-recommendation-re-ordered-lyrics/other/default/1/reordered_lyrics.pkl
Building FAISS index for recommendation
FAISS index loaded from /kaggle/input/song-recommender-cache-large/other/default/1/faiss_index.idx
Generating recommendations for song_id 10 and novelty 0.1...


In [7]:
class RecommendationRequest(BaseModel):
    song_id: int
    novelty: Optional[float] = 0.5  # Default novelty weight

class RecommendationResponse(BaseModel):
    seed_song_id: int
    seed_song_lyrics: str
    recommendations: List[dict]

@app.get("/")
def home():
    return {"message": "Welcome to the Song Recommender API!"}

@app.get("/seed_songs", response_model=List[dict])
def get_seed_songs(count: int = 50):
    """
    Retrieve a random selection of seed songs with their IDs and lyrics.
    
    Parameters:
    - count: Number of random seed songs to return (default: 50).

    Returns:
    - List of dictionaries containing song IDs and lyrics.
    """
    if count <= 0 or count > len(recommender.song_lyrics):
        raise HTTPException(status_code=400, detail="Invalid count value.")

    try:
        # Randomly select song IDs
        random_ids = random.sample(range(len(recommender.song_lyrics)), count)
        seed_songs = [
            {"song_id": song_id, "song_lyrics": recommender.song_lyrics[song_id]}
            for song_id in random_ids
        ]
        return seed_songs
    except Exception as e:
        logging.error(f"Error generating seed songs: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve seed songs.")

@app.post("/recommend", response_model=RecommendationResponse)
def get_recommendations(request: RecommendationRequest):
    song_id = request.song_id
    novelty = request.novelty

    # Ensure song_id is within the valid range
    if song_id < 0 or song_id >= len(recommender.song_lyrics):
        raise HTTPException(status_code=400, detail="Invalid song_id provided.")

    try:
        recommendations = recommender.recommend(song_id=song_id, novelty=novelty)

        # Prepare the response
        recommended_songs = []
        for rec in recommendations:
            rec_id, score = rec
            if rec_id < len(recommender.song_lyrics):  # Ensure valid index
                recommended_songs.append({
                    "song_id": int(rec_id),
                    "song_lyrics": recommender.song_lyrics[rec_id],
                    "score": float(score),
                })

        response = {
            "seed_song_id": song_id,
            "seed_song_lyrics": recommender.song_lyrics[song_id],
            "recommendations": recommended_songs,
        }

        return response
    except Exception as e:
        logging.error(f"Error during recommendation: {e}")
        raise HTTPException(status_code=500, detail="An error occurred during the recommendation process.")



In [None]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
ngrok_auth_token = user_secrets.get_secret("ngrok-auth-token")

port = 8000
ngrok.set_auth_token(ngrok_auth_token)
ngrok_tunnel = ngrok.connect(port)

# where we can visit our fastAPI app
print('Public URL:', ngrok_tunnel.public_url)

nest_asyncio.apply()

# finally run the app
uvicorn.run(app, port=port)

                                                                                                    

INFO:     Started server process [30]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://7d36-34-80-82-5.ngrok-free.app
INFO:     159.2.24.252:0 - "OPTIONS /seed_songs?count=10 HTTP/1.1" 200 OK
