<a href="https://colab.research.google.com/github/migub/recommender-systems/blob/main/EMB_deezer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
 #pip install deezer-python

In [14]:
## 1. Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import deezer as deezer
import time
import os
import glob # file searching
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load data

In [15]:
## 2. Load cleaned data from previous step
# Location of the cleaned listening log on the mounted Drive.
cleaned_data_path = "/content/drive/MyDrive/Recommender_Systems/cleaned_data.csv"
data = pd.read_csv(filepath_or_buffer=cleaned_data_path)

# Show the column overview first, then the first five rows.
data.info()
display(data.head(n=5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7558834 entries, 0 to 7558833
Data columns (total 17 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   genre_id          int64 
 1   ts_listen         object
 2   media_id          int64 
 3   album_id          int64 
 4   context_type      int64 
 5   release_date      object
 6   platform_name     int64 
 7   platform_family   int64 
 8   media_duration    int64 
 9   listen_type       int64 
 10  user_gender       int64 
 11  user_id           int64 
 12  artist_id         int64 
 13  user_age          int64 
 14  is_listened       bool  
 15  listen_hour       int64 
 16  listen_dayofweek  int64 
dtypes: bool(1), int64(14), object(2)
memory usage: 929.9+ MB


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened,listen_hour,listen_dayofweek
0,25471,2016-12-01 13:00:15,222606,41774,12,2004-07-04,1,0,223,0,0,9241,55164,29,False,13,3
1,25571,2016-11-30 22:25:35,250467,43941,0,2006-03-01,2,1,171,0,0,16547,55830,30,True,22,2
2,16,2016-11-19 13:59:13,305197,48078,1,2014-07-14,2,1,149,1,1,7665,2704,29,True,13,5
3,7,2016-11-26 09:21:38,900502,71521,0,2000-10-30,0,0,240,0,1,1580,938,30,False,9,5
4,7,2016-11-05 18:02:54,542335,71718,0,2008-02-15,0,0,150,0,1,1812,2939,24,True,18,5


Checking what type of data we can get via the deezer package.

In [16]:
# Client created without any arguments (no credentials are supplied here).
client = deezer.Client()


# Smoke test: look up a single track by its ID
track_id = 222606
track = client.get_track(track_id)

# Print track details, one field per line
print(
    f"Title: {track.title}",
    f"Artist: {track.artist.name}",
    f"Album: {track.album.title}",
    f"Duration: {track.duration} seconds",
    sep="\n",
)

# user_id = 16547
# user = client.get_user(user_id)
#
# # Print user details
# print(f"Gender: {user.gender}")
# print(f"Country: {user.country}")


Title: Seul au monde
Artist: Malin Plaisir
Album: Malin plaisir
Duration: 223 seconds


While retrieval of track data works, user data is not as easily accessible.

[I will keep looking into this, but so far I have not found an easy solution. It may be an authentication problem, which could be worked around by registering an app; however, Deezer is not currently accepting new app registrations.]

In [27]:
# With improved error handling and chunking

def get_deezer_info(ids, entity_type, min_interval=0.02):
    """
    Fetch per-entity metadata from the Deezer API and return it as a DataFrame.

    Only ``entity_type="track"`` is implemented. Any other value (including
    ``"user"``) returns an empty DataFrame immediately, without creating a
    client or pausing once per ID.

    IDs whose lookup raises are reported on stdout and skipped, so the result
    may contain fewer rows than ``ids`` has elements.

    Args:
        ids (list): Entity IDs to look up (track IDs).
        entity_type (str): Kind of entity to fetch; only 'track' is supported.
        min_interval (float): Minimum number of seconds to leave between the
            end of one request and the start of the next. Defaults to 0.02.
            NOTE(review): Deezer's public API documentation mentions a quota
            of 50 requests per 5 seconds; confirm the default is adequate.

    Returns:
        pd.DataFrame: One row per successfully retrieved track with columns
        ``track_id``, ``title``, ``artist_name``, ``album_title``,
        ``album_release_date``, ``rank``, ``explicit_lyrics`` and ``bpm``.
        Empty (no columns) when nothing could be retrieved.
    """
    # Bail out early instead of sleeping through every ID for a type that
    # would never produce a row.
    if entity_type != "track":
        print(f"Entity type '{entity_type}' is not supported; returning an empty DataFrame.")
        return pd.DataFrame()

    try:
        client = deezer.Client()
    except Exception as e:
        print(f"Failed to initialize Deezer client: {e}")
        return pd.DataFrame()

    data_list = []
    # Monotonic clock: a wall-clock adjustment must not turn the computed
    # pause into a very long sleep.
    last_request_time = time.monotonic()

    for entity_id in ids:
        # Rate limiting: ensure at least `min_interval` seconds between requests
        time_since_last_request = time.monotonic() - last_request_time
        time.sleep(max(0, min_interval - time_since_last_request))

        try:
            track = client.get_track(entity_id)

            # Handle case where track is not found
            if track is None:
                print(f"Track ID {entity_id} not found on Deezer.")
            else:
                data_list.append({
                    "track_id": entity_id,
                    "title": track.title if track.title else "Unknown",
                    "artist_name": track.artist.name if track.artist else "Unknown",
                    "album_title": track.album.title if track.album else "Unknown",
                    "album_release_date": track.album.release_date if track.album else "Unknown",
                    "rank": track.rank,
                    "explicit_lyrics": track.explicit_lyrics,
                    "bpm": track.bpm
                })

        except Exception as e:
            # Best effort: report the failing ID and carry on with the rest.
            print(f"Error retrieving data for {entity_type} ID {entity_id}: {e}")

        last_request_time = time.monotonic()

    return pd.DataFrame(data_list)

In [28]:
# The first 20 listening events act as a small sample for trying out the API helper.
test = data.iloc[:20]
print(test.loc[:, "media_id"])


0     222606
1     250467
2     305197
3     900502
4     542335
5     542335
6     542335
7     542335
8     542335
9     542335
10    542335
11    542335
12    542341
13    542335
14    542335
15    542335
16    542335
17    542335
18    542335
19    542335
Name: media_id, dtype: int64


In [29]:
# Testing on small dataset
track_info_df = get_deezer_info(test["media_id"].unique().tolist(), "track")  # Get track data
track_info_df = track_info_df.rename(columns={"track_id": "media_id"})  # Rename column for merging
test2 = test.merge(track_info_df, on="media_id", how="left")  # Merge retrieved track data

# user_info_df = get_deezer_info((test["user_id"].unique()).tolist(), "user")
# user_info_df = user_info_df.rename(columns={"id": "user_id"})  # Rename column for merging
#test = test.merge(user_info_df, on="user_id", how="left")  # Merge retrieved track data
# print(user_info_df)
# print(test)

In [30]:
# Preview the first five rows of the merged sample
test2.head(n=5)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,is_listened,listen_hour,listen_dayofweek,title,artist_name,album_title,album_release_date,rank,explicit_lyrics,bpm
0,25471,2016-12-01 13:00:15,222606,41774,12,2004-07-04,1,0,223,0,...,False,13,3,Seul au monde,Malin Plaisir,Malin plaisir,2004-07-04,1711,False,100.1
1,25571,2016-11-30 22:25:35,250467,43941,0,2006-03-01,2,1,171,0,...,True,22,2,You're not ok,SoldouT,Stop talking / dead tapes,2006-03-01,70929,False,160.2
2,16,2016-11-19 13:59:13,305197,48078,1,2014-07-14,2,1,149,1,...,True,13,5,Samuel,René Aubry,Dérives,2014-07-14,115098,False,90.1
3,7,2016-11-26 09:21:38,900502,71521,0,2000-10-30,0,0,240,0,...,False,9,5,Mujer Amiga Mia,Eros Ramazzotti,Estilolibre (Spanish Version),2000-10-30,103778,False,124.9
4,7,2016-11-05 18:02:54,542335,71718,0,2008-02-15,0,0,150,0,...,True,18,5,Mr. Tambourine Man,The Byrds,Collections - The Byrds Play Dylan,2008-02-15,407879,False,120.5


# Deezer scraping

We now apply the approach above to the whole training dataset and save the resulting DataFrame so that it can be shared with colleagues.

In [31]:
# Load media IDs
media_ids = data["media_id"].unique().tolist()

# Split media_ids into chunks of 100
chunk_size = 100
media_id_chunks = [media_ids[i:i + chunk_size] for i in range(0, len(media_ids), chunk_size)]

# Step 1: Process each chunk and save separately, skipping existing chunks
for i, chunk in enumerate(media_id_chunks):
    chunk_file = f"/content/drive/MyDrive/Recommender_Systems/chunks/track_info_chunk_{i}.csv"

    # Skip if chunk file already exists (no overwriting)
    if os.path.exists(chunk_file):
        print(f"Chunk {i} already exists. Skipping...")
        continue

    print(f"Processing chunk {i}...")
    track_info_df_chunk = get_deezer_info(chunk, "track")

    if not track_info_df_chunk.empty:
        track_info_df_chunk.to_csv(chunk_file, index=False)
        print(f"Saved: {chunk_file}")
    else:
        print(f"Skipping saving {chunk_file} (empty DataFrame).")

print("All chunks processed and saved!!!")


Chunk 0 already exists. Skipping...
Chunk 1 already exists. Skipping...
Chunk 2 already exists. Skipping...
Processing chunk 3...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_3.csv
Processing chunk 4...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_4.csv
Processing chunk 5...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_5.csv
Processing chunk 6...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_6.csv
Processing chunk 7...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_7.csv
Processing chunk 8...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_8.csv
Processing chunk 9...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_9.csv
Processing chunk 10...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_10.csv
Processing chunk 11...
Saved: /content/drive/MyDrive/Recommender_Systems/track_info_chunk_11.csv
Processing chunk 12...
Saved: /co

KeyboardInterrupt: 

In [None]:

# Step 2: Combine all chunk files (Separate Step)
output_file = "/content/drive/MyDrive/Recommender_Systems/track_info_deezer.csv"
print("Combining all chunk files into final dataset...")


def _chunk_index(path):
    """Return the integer chunk number embedded in a chunk file name."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return int(stem.rsplit("_", 1)[1])


# Find all chunk CSV files and order them by chunk number; a plain string sort
# would place chunk_10 before chunk_2.
chunk_files = sorted(
    glob.glob("/content/drive/MyDrive/Recommender_Systems/chunks/track_info_chunk_*.csv"),
    key=_chunk_index,
)

header_written = False
if not chunk_files:
    # Do not claim success when nothing was written.
    print("No chunk files found; nothing to combine.")
else:
    # Append each chunk to the final output file (step-by-step)
    for chunk_file in chunk_files:
        chunk_df = pd.read_csv(chunk_file)

        # The first chunk truncates the output and writes the header; later chunks append rows only
        chunk_df.to_csv(output_file, mode="w" if not header_written else "a", header=not header_written, index=False)
        header_written = True  # Ensure header is only written once

    print(f"Final dataset saved: {output_file}")

In [None]:
# Load the combined Deezer metadata from disk rather than reusing whatever is
# still bound to `track_info_df` in the kernel: at this point that name holds
# only the small test sample fetched earlier, not the full scrape.
track_info_df = pd.read_csv("/content/drive/MyDrive/Recommender_Systems/track_info_deezer.csv")
track_info_df = track_info_df.rename(columns={"track_id": "media_id"})  # Rename column for merging

In [None]:
# Save the DataFrame 'track_info' to Google Drive.
# Write `track_info_df` to Google Drive as CSV, leaving out the row index.
deezer_csv_path = "/content/drive/MyDrive/Recommender_Systems/track_info_deezer.csv"
track_info_df.to_csv(deezer_csv_path, index=False)


In [None]:
# Attach the retrieved track metadata to every listening event; the left merge
# keeps events whose media_id has no metadata row.
data2 = pd.merge(data, track_info_df, how="left", on="media_id")