<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/EMB_recommender_systems.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# pip install deezer-python

In [18]:
## 1. Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import deezer as deezer
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
## 2. Load cleaned data from previous step
data = pd.read_csv("/content/drive/MyDrive/Recommender_Systems/cleaned_data.csv")

# Display basic info
data.info()
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7558834 entries, 0 to 7558833
Data columns (total 17 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   genre_id          int64 
 1   ts_listen         object
 2   media_id          int64 
 3   album_id          int64 
 4   context_type      int64 
 5   release_date      object
 6   platform_name     int64 
 7   platform_family   int64 
 8   media_duration    int64 
 9   listen_type       int64 
 10  user_gender       int64 
 11  user_id           int64 
 12  artist_id         int64 
 13  user_age          int64 
 14  is_listened       bool  
 15  listen_hour       int64 
 16  listen_dayofweek  int64 
dtypes: bool(1), int64(14), object(2)
memory usage: 929.9+ MB


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,listen_hour,listen_dayofweek
0,25471,2016-12-01 13:00:15,222606,41774,12,2004-07-04,1,0,223,0,0,9241,55164,29,False,13,3
1,25571,2016-11-30 22:25:35,250467,43941,0,2006-03-01,2,1,171,0,0,16547,55830,30,True,22,2
2,16,2016-11-19 13:59:13,305197,48078,1,2014-07-14,2,1,149,1,1,7665,2704,29,True,13,5
3,7,2016-11-26 09:21:38,900502,71521,0,2000-10-30,0,0,240,0,1,1580,938,30,False,9,5
4,7,2016-11-05 18:02:54,542335,71718,0,2008-02-15,0,0,150,0,1,1812,2939,24,True,18,5


In [20]:
client = deezer.Client()


# Testing deezer
track_id = 222606
track = client.get_track(track_id)

# Print track details
print(f"Title: {track.title}")
print(f"Artist: {track.artist.name}")
print(f"Album: {track.album.title}")
print(f"Duration: {track.duration} seconds")

user_id = 16547
user = client.get_user(user_id)

# Print user details
print(f"Gender: {user.gender}")
print(f"Country: {user.country}")


Title: Seul au monde
Artist: Malin Plaisir
Album: Malin plaisir
Duration: 223 seconds


DeezerErrorResponse: {'error': {'type': 'DataException', 'message': 'no data', 'code': 800}}

While retrieval of track data works, user data is not as easily accessible.

[I will continue to look into this, but at the moment, I'm not finding an easy solution. It could be an authentication problem, which could be circumvented by creating an app, but currently, no new app applications are being accepted by Deezer.]

In [None]:
# This function can handle track_id and user id, but in reality, we are only getting track data.

def get_deezer_info(ids, entity_type):
    """
    Fetches information from the Deezer API for a given entity type (track or user) using the deezer package.

    Args:
        ids (list): List of entity IDs (track IDs or user IDs).
        entity_type (str): Either 'track' or 'user'.

    Returns:
        pd.DataFrame: A DataFrame containing the requested information.
    """
    client = deezer.Client()
    data_list = []

    for entity_id in ids:
        try:
            if entity_type == "track":
                track = client.get_track(entity_id)
                entity_info = {
                    "track_id": entity_id,
                    "rank": track.rank,
                    "explicit_lyrics": track.explicit_lyrics,
                    "bpm": track.bpm
                }
            elif entity_type == "user":
                user = client.get_user(entity_id)
                entity_info = {
                    "id": entity_id,
                    "gender": user.gender,
                    "country": user.country
                }
            else:
                raise ValueError("Invalid entity_type. Use 'track' or 'user'.")

            data_list.append(entity_info)

        except deezer.DeezerErrorResponse as e:
            print(f"Failed to retrieve data for {entity_type} ID {entity_id}: {e}")

    return pd.DataFrame(data_list)





In [None]:
test = data.head(20)
print(test["media_id"])


In [None]:
# Testing on small dataset
track_info_df = get_deezer_info(test["media_id"].unique().tolist(), "track")  # Get track data
track_info_df = track_info_df.rename(columns={"track_id": "media_id"})  # Rename column for merging
test2 = test.merge(track_info_df, on="media_id", how="left")  # Merge retrieved track data

# user_info_df = get_deezer_info((test["user_id"].unique()).tolist(), "user")
# user_info_df = user_info_df.rename(columns={"id": "user_id"})  # Rename column for merging
#test = test.merge(user_info_df, on="user_id", how="left")  # Merge retrieved track data
# print(user_info_df)
# print(test)

In [None]:
test2.head()

Now implementing the above for the whole training dataset and saving the resulting df for sharing with colleagues.

In [None]:
media_ids = data["media_id"].unique().tolist()
track_info_df = get_deezer_info(media_ids, "track")  # Get all the track data

In [None]:
track_info_df = track_info_df.rename(columns={"track_id": "media_id"})  # Rename column for merging

In [None]:
# Save the DataFrame 'track_info' to Google Drive.
track_info_df.to_csv("/content/drive/MyDrive/Recommender_Systems/track_info_deezer.csv", index=False)


In [None]:
data2 = data.merge(track_info_df, on="media_id", how="left")  # Merge retrieved track data