<a href="https://colab.research.google.com/github/lovrodukic/music-recommendation/blob/main/notebooks/recommender_ollama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

In [None]:
!pip install numpy pandas faiss-gpu requests

In [None]:
# Download the Ollama install script and pipe it straight into sh.
# curl flags: -f fail on HTTP errors, -s silent, -S still print errors,
# -L follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Launch the Ollama server as a background process (nohup + trailing &).
# The embedding requests made later in this notebook go to
# http://localhost:11434.
!nohup ollama serve &

In [None]:
# Download the llama2 model; the embedding requests made later in this
# notebook name 'llama2' as their model.
!ollama pull llama2

In [None]:
import pandas as pd

# Columns removed right after loading; none of them is read by the visible
# cells of this notebook. errors='ignore' tolerates a CSV that lacks some.
columns_to_drop = [
    'id', 'album_id', 'artist_ids', 'track_number', 'disc_number',
    'duration_ms', 'time_signature', 'year', 'release_date',
]

df = pd.read_csv('tracks_features.csv').drop(
    columns=columns_to_drop, errors='ignore'
)
df

In [None]:
def create_textual_representation(row):
    """
    Describe one track as a short multi-line string.

    `row` is indexed by the keys 'name', 'album', 'artists' and 'explicit'.
    The first and last characters of the 'artists' value are removed before
    it is inserted into the text.
    """
    track = row['name']
    album = row['album']
    artists = row['artists'][1:-1]  # drop the first and last character
    explicit = row['explicit']

    lines = (
        f"Track: {track},",
        f"Album: {album},",
        f"Artists: {artists},",
        f"Explicit: {explicit}",
    )
    return "\n".join(lines)

# Build one description string per row and store it as a new column.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

# Print the first description as a quick sanity check.
print(df['textual_representation'].iloc[0])

In [None]:
import os

import faiss
import numpy as np
import requests

def generate_embedding(representation, model='llama2', timeout=120):
    """
    Request an embedding for a piece of text from the local Ollama server.

    Parameters
    ----------
    representation : str
        Text sent as the 'prompt' of the embeddings request.
    model : str
        Name of the Ollama model asked to produce the embedding.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        a stalled server would block the notebook indefinitely.

    Returns
    -------
    numpy.ndarray
        float32 array built from the 'embedding' field of the response.

    Raises
    ------
    ValueError
        If the request fails or times out, the server answers with an error
        status, or the response body does not contain a usable 'embedding'.
    """
    try:
        response = requests.post(
            'http://localhost:11434/api/embeddings',
            json={'model': model, 'prompt': representation},
            timeout=timeout,
        )
        response.raise_for_status()
        return np.array(response.json()['embedding'], dtype='float32')
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        # Malformed bodies (missing key, wrong JSON shape, non-numeric data)
        # are reported the same way as transport errors, so callers only
        # need to handle ValueError.
        raise ValueError(f"Error generating Ollama embedding: {e}") from e

def build_index(index_name, batch_size=1000, embedding_dim=4096):
    """
    Build the FAISS index using song textual representations.

    For every row of the module-level `df`, the embedding returned by
    `generate_embedding` for the row's 'textual_representation' is
    concatenated with the row's numerical audio features. The vectors are
    added to the index one batch at a time, and the finished index is
    written to ``models/<index_name>``.

    Parameters
    ----------
    index_name : str
        File name of the saved index inside the ``models`` directory.
    batch_size : int
        Number of songs embedded and added to the index per batch.
    embedding_dim : int
        Length of the vector returned by `generate_embedding`. The default
        is the 4096 that this function previously hard-coded.

    Returns
    -------
    faiss.Index
        The populated index (also saved to disk).
    """
    feature_columns = [
        'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo',
    ]
    vector_dim = embedding_dim + len(feature_columns)

    # NOTE(review): the index was never created before being used here; a
    # flat L2 index is assumed -- confirm against the code that queries it.
    faiss_index = faiss.IndexFlatL2(vector_dim)

    num_rows = len(df)
    print(f"Total songs: {num_rows}")

    for start_idx in range(0, num_rows, batch_size):
        end_idx = min(start_idx + batch_size, num_rows)
        print(f"Processing songs {start_idx} to {end_idx}...")

        # One row per song in the batch, zero-initialised.
        batch_embeddings = np.zeros(
            (end_idx - start_idx, vector_dim),
            dtype='float32'
        )

        batch_rows = df.iloc[start_idx:end_idx].iterrows()
        for i, (song_idx, song_row) in enumerate(batch_rows):
            # Generate textual embedding
            try:
                textual_embedding = generate_embedding(
                    song_row['textual_representation']
                )
            except ValueError as e:
                # The song's row stays all zeros and is still added below,
                # which keeps index ids aligned with row positions in df.
                print(f"Skipping song {song_idx} due to error: {e}")
                continue

            # Extract numerical features
            numerical_features = (
                song_row[feature_columns].values.astype('float32')
            )

            # Combine textual and numerical embeddings
            batch_embeddings[i] = np.concatenate(
                [textual_embedding, numerical_features]
            )

        # Add batch embeddings to FAISS index
        faiss_index.add(batch_embeddings)

    # The output directory does not exist on a fresh runtime.
    os.makedirs('models', exist_ok=True)
    faiss.write_index(faiss_index, f"models/{index_name}")
    print(f"Saved FAISS index to models/{index_name}")

    return faiss_index

# Embed every song in df, 1000 rows per batch, and save the result as
# models/index.
build_index('index', batch_size=1000)