In [137]:
from typing import Tuple
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

from adv_ml_music_recommendation.recommenders.collaborativerecommender import CollaborativeRecommender
from adv_ml_music_recommendation.recommenders.contentbasedrecommender import ContentRecommender
from adv_ml_music_recommendation.recommenders.hybridrecommender import HybridRecommender
from adv_ml_music_recommendation.recommenders.popularityrecommender import PopularityRecommender
from adv_ml_music_recommendation.util.data_functions import get_interacted_tracks
from adv_ml_music_recommendation.recommenders.abstractrecommender import AbstractSongRecommender
from adv_ml_music_recommendation.util.data_functions import get_number_of_songs_in_playlist

class RecommenderEvaluator:
    def __init__(self, df_playlist: pd.DataFrame, df_tracks: pd.DataFrame, type: str = 'collaborative'):
        self.df_train_playlist = []
        self.df_test_playlist = []
        self.df_tracks = df_tracks

        # Filter out playlists with fewer than 10 songs
        filtered_playlists = [pid for pid in df_playlist['playlist_id'] if get_number_of_songs_in_playlist(df_playlist, pid) >= 10]

        # Renumber the filtered playlist_ids to remove gaps
        df_playlist = list(range(1, len(filtered_playlists) + 1))

        # Group by playlist_id and perform train-test split for each playlist
        for playlist_id, group in df_playlist.groupby('playlist_id'):
            if playlist_id > 1000:
                break

            if playlist_id % 100 == 0:
                print(playlist_id)

            # Extract track_uris for the current playlist
            track_uris = group['track_uri'].tolist()

            # Perform train-test split
            train_uris, test_uris = train_test_split(track_uris, test_size=0.2, random_state=42)

            # Append the results to the train and test lists
            for uri in train_uris:
                 self.df_train_playlist.append({'playlist_id': playlist_id, 'track_uri': uri})

            for uri in test_uris:
                 self.df_test_playlist.append({'playlist_id': playlist_id, 'track_uri': uri})


        self.df_train_playlist = pd.DataFrame(self.df_train_playlist)
        self.df_test_playlist = pd.DataFrame(self.df_test_playlist)

        # Filter the tracks dataframe to keep only tracks that appear in the playlists dataframe:
        # & operator feels like questionable design
        print(self.df_train_playlist)
        #self.df_tracks = self.df_tracks[self.df_tracks['track_uri'].isin(self.df_train_data['track_uri']) & self.df_tracks['track_uri'].isin(self.df_test_data['track_uri'])]
        self.df_tracks = self.df_tracks[self.df_tracks['track_uri'].isin(self.df_train_playlist['track_uri'])]

        if type == 'hybrid':
            self.train_data_model = HybridRecommender(df_playlist=self.df_train_playlist, df_tracks=self.df_tracks)
            self.test_data_model = HybridRecommender(df_playlist=self.df_test_playlist, df_tracks=self.df_tracks)
        elif type == 'collaborative':
            self.train_data_model = CollaborativeRecommender(df_playlist=self.df_train_playlist, df_tracks=self.df_tracks)
            self.test_data_model = CollaborativeRecommender(df_playlist=self.df_test_playlist, df_tracks=self.df_tracks)
        elif type == 'content':
            self.train_data_model = ContentRecommender(df_playlist=self.df_train_playlist, df_tracks=self.df_tracks)
            self.test_data_model = ContentRecommender(df_playlist=self.df_test_playlist, df_tracks=self.df_tracks)
        elif type == 'popularity':
            self.train_data_model = PopularityRecommender(df_playlist=self.df_train_playlist, df_tracks=self.df_tracks)
            self.test_data_model = PopularityRecommender(df_playlist=self.df_test_playlist, df_tracks=self.df_tracks)
        else:
            raise ValueError(
                f"Invalid type: {type}. Must be one of 'hybrid', 'collaborative', 'content', 'popularity'.")


    def evaluate_recommender_for_playlist(self, playlist_id):
        # Get recommendations from the train model
        ranked_recommendations_df = self.train_data_model.recommend_tracks(playlist_id)

        # Extract the recommended track URIs
        recommended_track_uris = ranked_recommendations_df['track_uri'].tolist()

        # Get the ground truth: test track URIs for the playlist
        test_track_uris = self.df_test_playlist[self.df_test_playlist['playlist_id'] == playlist_id]['track_uri'].iloc[0]

        # Create binary vectors for precision and recall calculation
        y_true = [1 if uri in test_track_uris else 0 for uri in recommended_track_uris]
        y_pred = [1] * len(recommended_track_uris)  # All recommendations are predicted as relevant

        # Compute precision and recall
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)

        return precision, recall


    def evaluate_model(self):
        total_precision = 0
        total_recall = 0
        num_playlists = len(self.df_test_playlist)

        # Iterate over all playlists in the test_data
        for playlist_id in self.df_test_playlist['playlist_id'].unique():
            precision, recall = self.evaluate_recommender_for_playlist(playlist_id)
            total_precision += precision
            total_recall += recall

        # Compute average precision and recall
        avg_precision = total_precision / num_playlists
        avg_recall = total_recall / num_playlists

        return {
            'average_precision': avg_precision,
            'average_recall': avg_recall
        }


In [127]:
df_playlist = pd.read_csv("../../../data/track_playlist_association.csv")
df_tracks = pd.read_csv("../../../data/matched_songs.csv")
print(df_tracks)

        Unnamed: 0  pos        artist_name  \
0                0    1     Britney Spears   
1                0    3  Justin Timberlake   
2                0    8    Destiny's Child   
3                0    9            OutKast   
4                0   19    Destiny's Child   
...            ...  ...                ...   
134707         829   24        The Outlaws   
134708         829  131        The Outlaws   
134709         870   69    History Invades   
134710         887    5          Limi-T 21   
134711         935   32        Rod Stewart   

                                   track_uri  \
0       spotify:track:6I9VzXrHxO9rA9A5euc8Ak   
1       spotify:track:1AWQoqb9bSvzTjaLralEkT   
2       spotify:track:7H6ev70Weq6DdpZyyTmUXk   
3       spotify:track:2PpruBYCo4H7WOBJ7Q2EwM   
4       spotify:track:4pmc2AxSEq6g7hPVlJCPyP   
...                                      ...   
134707  spotify:track:38Zq6XbsyIcwI0Kn1ikPlV   
134708  spotify:track:68nYzKdsOUiiVAuyAc5y3U   
134709  spotify

0         spotify:track:6I9VzXrHxO9rA9A5euc8Ak
1         spotify:track:1AWQoqb9bSvzTjaLralEkT
2         spotify:track:7H6ev70Weq6DdpZyyTmUXk
3         spotify:track:2PpruBYCo4H7WOBJ7Q2EwM
4         spotify:track:4pmc2AxSEq6g7hPVlJCPyP
                          ...                 
134707    spotify:track:38Zq6XbsyIcwI0Kn1ikPlV
134708    spotify:track:68nYzKdsOUiiVAuyAc5y3U
134709    spotify:track:2U7FCJK3m012zjvPlAw4PK
134710    spotify:track:1HkPxeUaQwIFIbYCRq9nyK
134711    spotify:track:533lw7eUjXE5ZbajU8L4H6
Name: track_uri, Length: 134712, dtype: object


In [138]:
evaluator = RecommenderEvaluator(df_playlist=df_playlist, df_tracks=df_tracks)

KeyboardInterrupt: 

In [133]:
print(evaluator.df_train_playlist['playlist_id'].unique())

[  0   1   2   3   4   5   7   8   9  10  11  14  16  17  18  19  20  22
  24  25  28  29  30  31  32  34  35  37  38  39  40  41  42  44  45  49
  50  51  53  55  56  59  60  63  64  65  66  68  70  71  72  73  80  83
  85  88  89  90  91  93  94  95  96  97  98 101 102 103 105 106 107 109
 112 114 115 118 119 120 121 122 123 124 125 126 127 128 129 130 131 133
 134 135 136 138 139 140 142 143 144 145 146 148 149 150 151 152 153 154
 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 172 173
 176 177 178 179 180 182 183 184 185 186 189 192 194 195 196 197 199 200
 201 202 203 205 207 208 209 210 211 212 214 215 217 218 219 220 223 224
 225 227 228 231 232 233 234 236 237 239 240 241 242 243 244 245 246 247
 248 249 250 252 253 254 255 256 258 259 260 262 263 264 265 267 268 271
 273 274 275 277 279 280 281 282 283 284 285 290 291 293 294 295 297 298
 299 302 305 307 309 310 314 315 316 321 323 326 328 329 330 331 332 333
 334 335 338 339 341 342 343 344 348 349 350 351 35

In [134]:
evaluator.evaluate_model()

IndexError: index 708 is out of bounds for axis 0 with size 706