# Data Science and Artificial Intelligence II
### Investigating gender bias in music recommender systems

In [None]:
import os
import json
import pandas as pd

# Folder that holds the raw playlist JSON files
data_path = r"D:\DigiEcon\4rd Semester\5673 Data Science and AI II\Project\spotify_million_playlist_dataset\data"

# Keep the first 100 .json file names in sorted order.
# NOTE(review): sorted() compares names as strings, so names that embed numbers
# are not in numeric order — confirm this is the intended subset.
json_files = sorted(name for name in os.listdir(data_path) if name.endswith('.json'))[:100]

# One flat record per (playlist, track) pair
all_tracks = []

for filename in json_files:
    with open(os.path.join(data_path, filename), 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # The file is fully parsed at this point, so it no longer needs to stay open.
    for playlist in payload['playlists']:
        all_tracks.extend(
            {
                'playlist_id': playlist['pid'],
                'playlist_name': playlist['name'],
                'track_position': track['pos'],
                'track_name': track['track_name'],
                'artist_name': track['artist_name'],
                'album_name': track['album_name'],
                'track_uri': track['track_uri'],
                'artist_uri': track['artist_uri'],
                'album_uri': track['album_uri'],
                'track_duration_ms': track['duration_ms'],
            }
            for track in playlist['tracks']
        )

# Build the DataFrame once from the collected records
df_tracks = pd.DataFrame(all_tracks)

# Quick look at the result
print(df_tracks.head())
print(f"\nLoaded {len(df_tracks)} tracks from {len(json_files)} files.")


   playlist_id playlist_name  track_position  \
0            0    Throwbacks               0   
1            0    Throwbacks               1   
2            0    Throwbacks               2   
3            0    Throwbacks               3   
4            0    Throwbacks               4   

                                   track_name        artist_name  \
0  Lose Control (feat. Ciara & Fat Man Scoop)      Missy Elliott   
1                                       Toxic     Britney Spears   
2                               Crazy In Love            Beyoncé   
3                              Rock Your Body  Justin Timberlake   
4                                It Wasn't Me             Shaggy   

                                     album_name  \
0                                  The Cookbook   
1                                   In The Zone   
2  Dangerously In Love (Alben für die Ewigkeit)   
3                                     Justified   
4                                      Hot Shot

In [3]:
# Destination of the flattened track table
output_csv_path = r"D:\DigiEcon\4rd Semester\5673 Data Science and AI II\Project\spotify_tracks_subset.csv"

# Write UTF-8 CSV and leave out the positional index column
df_tracks.to_csv(
    output_csv_path,
    encoding='utf-8',
    index=False,
)

print(f"\nCSV exported successfully to:\n{output_csv_path}")


CSV exported successfully to:
D:\DigiEcon\4rd Semester\5673 Data Science and AI II\Project\spotify_tracks_subset.csv


In [4]:
import pandas as pd

csv_path = r"D:\DigiEcon\4rd Semester\5673 Data Science and AI II\Project\spotify_tracks_subset.csv"

# Treat only empty fields as missing. With pandas' defaults, text such as
# "NA", "N/A", "null" or "NaN" is converted to NaN on load, so an artist or
# track that really carries one of those names would be lost (and later
# removed by .dropna() before the gender lookup).
df_tracks = pd.read_csv(csv_path, keep_default_na=False, na_values=[""])

print(df_tracks.head())
print(f"Reloaded {len(df_tracks)} tracks.")

   playlist_id playlist_name  track_position  \
0            0    Throwbacks               0   
1            0    Throwbacks               1   
2            0    Throwbacks               2   
3            0    Throwbacks               3   
4            0    Throwbacks               4   

                                   track_name        artist_name  \
0  Lose Control (feat. Ciara & Fat Man Scoop)      Missy Elliott   
1                                       Toxic     Britney Spears   
2                               Crazy In Love            Beyoncé   
3                              Rock Your Body  Justin Timberlake   
4                                It Wasn't Me             Shaggy   

                                     album_name  \
0                                  The Cookbook   
1                                   In The Zone   
2  Dangerously In Love (Alben für die Ewigkeit)   
3                                     Justified   
4                                      Hot Shot

### Retrieve gender information based on the artist's name

In [7]:
import musicbrainzngs

# Set your application details: application name, version and a contact e-mail
# that identify this client to the MusicBrainz web service.
musicbrainzngs.set_useragent("GenderBiasMusicRec", "1.0", "h11910653@s.wu.ac.at")

def search_artist(artist_name):
    """Return the top MusicBrainz search hit for ``artist_name``, or ``None``.

    Only one result is requested (``limit=1``). Any exception raised during
    the lookup is printed and turned into ``None``.

    Note: a later cell in this notebook defines ``search_artist`` again; once
    that cell has run, its definition is the one in effect.
    """
    try:
        hits = musicbrainzngs.search_artists(artist=artist_name, limit=1)['artist-list']
        top_hit = hits[0] if hits else None
    except Exception as e:
        print(f"Error searching for {artist_name}: {e}")
        top_hit = None
    return top_hit


In [8]:
# Sanity check: look up one well-known artist and print the raw search hit
# to see which fields the result dictionary contains.
artist_info = search_artist("Beyoncé")
print(artist_info)

{'id': '859d0860-d480-4efd-970c-c05d5f1776b8', 'type': 'Person', 'ext:score': '100', 'name': 'Beyoncé', 'sort-name': 'Beyoncé', 'gender': 'female', 'country': 'US', 'area': {'id': '489ce91b-6658-3307-9877-795b68554c98', 'type': 'Country', 'name': 'United States', 'sort-name': 'United States', 'life-span': {'ended': 'false'}}, 'begin-area': {'id': 'c920948b-83e3-40b7-8fe9-9ab5abaac55b', 'type': 'City', 'name': 'Houston', 'sort-name': 'Houston', 'life-span': {'ended': 'false'}}, 'ipi-list': ['00341826274'], 'isni-list': ['0000000114914936'], 'life-span': {'begin': '1981-09-04', 'ended': 'false'}, 'alias-list': [{'locale': 'zh_Hant_TW', 'sort-name': '碧昂絲', 'type': 'Artist name', 'primary': 'primary', 'alias': '碧昂絲'}, {'locale': 'zh_Hans_CN', 'sort-name': 'Beyoncé', 'primary': 'primary', 'alias': 'Beyoncé'}, {'locale': 'en', 'sort-name': 'Knowles-Carter, Beyoncé', 'type': 'Artist name', 'alias': 'Beyoncé Knowles-Carter'}, {'locale': 'ja', 'sort-name': 'ビヨンセ', 'type': 'Artist name', 'primar

In [9]:
import musicbrainzngs
import requests
import time

# Set up MusicBrainz: application name, version and contact e-mail
# (the same values that were set in the earlier cell).
musicbrainzngs.set_useragent("GenderBiasMusicRec", "1.0", "h11910653@s.wu.ac.at")

# Search for artist
def search_artist(artist_name, min_score=0):
    """Return the top MusicBrainz search hit for ``artist_name``, or ``None``.

    This definition replaces the ``search_artist`` from the earlier cell.

    Parameters
    ----------
    artist_name : str
        Name to search for.
    min_score : int, optional
        Minimum ``ext:score`` (the match score MusicBrainz attaches to each
        search hit) the top hit must reach to be accepted. The default of 0
        accepts any hit, which is the original behaviour. Raising it lets
        callers reject weak fuzzy matches, which otherwise attach the gender
        of an unrelated artist to the searched name.

    Returns
    -------
    dict or None
        The first entry of ``artist-list``; ``None`` when there is no hit, the
        hit scores below ``min_score``, or the lookup raised (the error is
        printed).
    """
    try:
        result = musicbrainzngs.search_artists(artist=artist_name, limit=1)
        candidates = result['artist-list']
        if candidates:
            best = candidates[0]
            # The score arrives as a string (e.g. '100'); a hit without a score
            # counts as 0 so it is rejected once a threshold is requested.
            if min_score <= 0 or int(best.get('ext:score', 0)) >= min_score:
                return best
    except Exception as e:
        print(f"[search_artist] Error for '{artist_name}': {e}")
    return None

# Get Wikidata ID from MusicBrainz
def get_wikidata_id(mbid):
    """Return the Wikidata entity ID linked from a MusicBrainz artist, or ``None``.

    Parameters
    ----------
    mbid : str
        MusicBrainz artist ID.

    Returns
    -------
    str or None
        Last path segment of the artist's ``wikidata`` URL relation (e.g.
        ``"Q36153"``). ``None`` when ``mbid`` is empty, the artist has no such
        relation, or the lookup raised (the error is printed).
    """
    if not mbid:
        # Nothing to look up; skip the request instead of provoking an error.
        return None
    try:
        result = musicbrainzngs.get_artist_by_id(mbid, includes=["url-rels"])
        # An artist without any URL relation has no "url-relation-list" key.
        # That is a normal "no link" case, not an error worth printing.
        for rel in result["artist"].get("url-relation-list", []):
            if rel.get("type") == "wikidata":
                # Strip a trailing slash so it cannot yield an empty ID.
                return rel["target"].rstrip("/").split("/")[-1]
    except Exception as e:
        print(f"[get_wikidata_id] Error for MBID {mbid}: {e}")
    return None

# Get gender from Wikidata
def get_gender_from_wikidata(wikidata_id):
    """Return the English label of the entity's "sex or gender" (P21) statement.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity ID such as ``"Q36153"``.

    Returns
    -------
    str
        Label of the first P21 value, or ``"unknown"`` when the ID is
        malformed, the entity has no P21 statement, or the request failed
        (the error is printed).
    """
    # The ID is spliced into the SPARQL text below, so accept only the plain
    # "Q<digits>" form; anything else would produce a broken or altered query.
    qid = str(wikidata_id)
    if not (qid[:1] == "Q" and qid[1:].isascii() and qid[1:].isdigit()):
        print(f"[get_gender_from_wikidata] Skipping malformed Wikidata ID {wikidata_id!r}")
        return "unknown"

    query = f"""
    SELECT ?genderLabel WHERE {{
      wd:{wikidata_id} wdt:P21 ?gender.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    try:
        url = "https://query.wikidata.org/sparql"
        headers = {
            "Accept": "application/sparql-results+json",
            # Wikimedia asks automated clients to identify themselves; requests
            # carrying only a generic library User-Agent may be throttled or blocked.
            "User-Agent": "GenderBiasMusicRec/1.0 (h11910653@s.wu.ac.at)",
        }
        response = requests.get(url, params={"query": query}, headers=headers, timeout=10)
        # Report HTTP failures (rate limiting, server errors) as what they are
        # rather than as a JSON decoding error on the error page.
        response.raise_for_status()
        results = response.json()["results"]["bindings"]
        if results:
            return results[0]["genderLabel"]["value"]
    except Exception as e:
        print(f"[get_gender_from_wikidata] Error for Wikidata ID {wikidata_id}: {e}")
    return "unknown"

# Wrapper function
def get_artist_gender(artist_name):
    """Resolve an artist name to a gender label via MusicBrainz and Wikidata.

    Chains ``search_artist`` -> ``get_wikidata_id`` -> ``get_gender_from_wikidata``
    and returns ``"unknown"`` as soon as one of the first two steps yields nothing.
    """
    match = search_artist(artist_name)
    if not match:
        return "unknown"

    qid = get_wikidata_id(match.get("id"))
    if not qid:
        return "unknown"

    return get_gender_from_wikidata(qid)


In [None]:
# Smoke test of the full lookup chain on three well-known acts
for sample_artist in (
    "Beyoncé",        # Expect: female
    "Justin Bieber",  # Expect: male
    "Coldplay",       # Expect: unknown or group
):
    print(get_artist_gender(sample_artist))

female
male
unknown


In [12]:
import os
import pandas as pd
import time

# The lookup makes several web requests per artist and sleeps one second
# between artists, so a full pass over all unique artists takes many hours.
# Progress is therefore kept on disk: a re-run resumes from the cache, and an
# interrupted run (e.g. KeyboardInterrupt) still leaves its results saved.
GENDER_CACHE_PATH = "artist_gender_map.csv"
CHECKPOINT_EVERY = 100  # write the cache after this many new lookups


def _save_gender_cache(mapping, path=GENDER_CACHE_PATH):
    """Write the artist -> gender mapping as a CSV with a single "gender" column."""
    pd.Series(mapping, dtype="object").to_csv(path, header=["gender"])


# Step 1: Get unique artist names
unique_artists = df_tracks['artist_name'].dropna().unique()

# Step 2: Load the cache from a previous run, or start with an empty one.
# dtype=str keeps numeric-looking names as text; only empty cells count as
# missing so names such as "NA" or "null" survive the round trip.
if os.path.exists(GENDER_CACHE_PATH):
    artist_gender_map = pd.read_csv(
        GENDER_CACHE_PATH,
        index_col=0,
        dtype=str,
        keep_default_na=False,
        na_values=[""],
    )["gender"].to_dict()
    print(f"Resuming with {len(artist_gender_map)} cached artists from {GENDER_CACHE_PATH}")
else:
    artist_gender_map = {}

# Step 3: Loop through artists and query gender for those not cached yet
new_lookups = 0
try:
    for i, artist in enumerate(unique_artists):
        if artist in artist_gender_map:
            continue  # resolved in an earlier run
        gender = get_artist_gender(artist)
        artist_gender_map[artist] = gender
        new_lookups += 1
        print(f"[{i+1}/{len(unique_artists)}] {artist} ➜ {gender}")
        if new_lookups % CHECKPOINT_EVERY == 0:
            _save_gender_cache(artist_gender_map)
        time.sleep(1.0)  # respectful delay to avoid rate limits (adjust if needed)
finally:
    # Runs on normal completion and on interruption alike, so no finished
    # lookup is lost; an interruption is re-raised after the save.
    _save_gender_cache(artist_gender_map)

# Step 4: Map gender back to original DataFrame
df_tracks['artist_gender'] = df_tracks['artist_name'].map(artist_gender_map)

# Show result
print(df_tracks[['artist_name', 'artist_gender']].head())


[1/107166] Missy Elliott ➜ female
[2/107166] Britney Spears ➜ female
[3/107166] Beyoncé ➜ female
[4/107166] Justin Timberlake ➜ male
[5/107166] Shaggy ➜ male
[6/107166] Usher ➜ male
[7/107166] The Pussycat Dolls ➜ unknown
[8/107166] Destiny's Child ➜ unknown
[9/107166] OutKast ➜ unknown
[10/107166] Nelly Furtado ➜ female
[11/107166] Jesse McCartney ➜ male
[12/107166] Cassie ➜ female
[13/107166] Omarion ➜ male
[14/107166] Avril Lavigne ➜ female
[15/107166] Chris Brown ➜ male
[16/107166] Sheryl Crow ➜ female
[17/107166] The Black Eyed Peas ➜ unknown
[18/107166] Bowling For Soup ➜ unknown
[19/107166] The Click Five ➜ unknown
[20/107166] Jonas Brothers ➜ unknown
[21/107166] Lil Mama ➜ male
[22/107166] Cascada ➜ unknown
[23/107166] Jason Derulo ➜ male
[24/107166] Ne-Yo ➜ male
[25/107166] Miley Cyrus ➜ female
[26/107166] Boys Like Girls ➜ unknown
[27/107166] Iyaz ➜ male
[28/107166] Kesha ➜ female
[29/107166] Justin Bieber ➜ male
[30/107166] M.I.A. ➜ female
[31/107166] The Killers ➜ unknown
[

KeyboardInterrupt: 

In [13]:
import pandas as pd
# Save the mapping collected so far (the lookup loop above was interrupted
# before finishing) so it can be read back from disk in the next cell.
pd.Series(artist_gender_map).to_csv("artist_gender_map_partial_backup.csv", header=["gender"])


In [None]:
## Load the gender map
# The first CSV column holds the artist names and becomes the dictionary key.
# dtype=str keeps numeric-looking names as text, and only empty cells count as
# missing: with pandas' defaults, names such as "NA", "N/A" or "null" would be
# read as NaN keys and never match in the later .map() on artist_name.
artist_gender_map = pd.read_csv(
    "artist_gender_map_partial_backup.csv",
    index_col=0,
    dtype=str,
    keep_default_na=False,
    na_values=[""],
)
artist_gender_map = artist_gender_map["gender"].to_dict()

### Preview Mapping 

In [16]:
# Print the first ten artist -> gender pairs in the mapping's own order
preview_pairs = list(artist_gender_map.items())[:10]
for artist, gender in preview_pairs:
    print(f"{artist}: {gender}")

Missy Elliott: female
Britney Spears: female
Beyoncé: female
Justin Timberlake: male
Shaggy: male
Usher: male
The Pussycat Dolls: unknown
Destiny's Child: unknown
OutKast: unknown
Nelly Furtado: female


In [17]:
import pandas as pd

# One entry per artist, indexed by artist name
gender_series = pd.Series(artist_gender_map)

# Number of artists per gender label, most frequent first
gender_counts = gender_series.value_counts()
print(gender_counts)


unknown              2584
male                 1819
female                602
non-binary gender      11
genderfluid             3
trans woman             2
agender                 2
neutral sex             1
Name: count, dtype: int64


In [19]:
# Look up each row's artist in the mapping; artists missing from it become NaN
gender_per_row = df_tracks["artist_name"].map(artist_gender_map)
df_tracks["artist_gender"] = gender_per_row

# Show the first ten artist/gender pairs
preview_columns = ["artist_name", "artist_gender"]
print(df_tracks[preview_columns].head(10))

          artist_name artist_gender
0       Missy Elliott        female
1      Britney Spears        female
2             Beyoncé        female
3   Justin Timberlake          male
4              Shaggy          male
5               Usher          male
6               Usher          male
7  The Pussycat Dolls       unknown
8     Destiny's Child       unknown
9             OutKast       unknown


In [21]:
# Write the track table, now including the artist_gender column, to a CSV in
# the current working directory (without the positional index column).
df_tracks.to_csv("spotify_tracks_with_gender.csv", index=False)

In [None]:
import pandas as pd

# Load the enriched dataset.
# Only empty fields are treated as missing: rows whose artist was never looked
# up were written as empty cells and still load as NaN, while text such as
# "NA", "N/A" or "null" in a track, album or artist name is kept as text
# instead of being converted to NaN by pandas' default NA list.
df_with_gender = pd.read_csv(
    "spotify_tracks_with_gender.csv",
    keep_default_na=False,
    na_values=[""],
)

# Preview the first few rows
print(df_with_gender.head(10))

   playlist_id playlist_name  track_position  \
0            0    Throwbacks               0   
1            0    Throwbacks               1   
2            0    Throwbacks               2   
3            0    Throwbacks               3   
4            0    Throwbacks               4   
5            0    Throwbacks               5   
6            0    Throwbacks               6   
7            0    Throwbacks               7   
8            0    Throwbacks               8   
9            0    Throwbacks               9   

                                   track_name         artist_name  \
0  Lose Control (feat. Ciara & Fat Man Scoop)       Missy Elliott   
1                                       Toxic      Britney Spears   
2                               Crazy In Love             Beyoncé   
3                              Rock Your Body   Justin Timberlake   
4                                It Wasn't Me              Shaggy   
5                                       Yeah!            

In [None]:
# Keep only rows whose artist_gender is present and is not the "unknown" label
has_gender = df_with_gender["artist_gender"].notna()
is_resolved = df_with_gender["artist_gender"] != "unknown"

# Filter and renumber the rows 0..n-1 in one expression
df_with_gender_clean = df_with_gender[has_gender & is_resolved].reset_index(drop=True)

# Preview result
preview_columns = ["track_name", "artist_name", "artist_gender"]
print(df_with_gender_clean[preview_columns].head())
print(f"\nRemaining tracks with known gender: {len(df_with_gender_clean)}")


                                   track_name        artist_name artist_gender
0  Lose Control (feat. Ciara & Fat Man Scoop)      Missy Elliott        female
1                                       Toxic     Britney Spears        female
2                               Crazy In Love            Beyoncé        female
3                              Rock Your Body  Justin Timberlake          male
4                                It Wasn't Me             Shaggy          male

Remaining tracks with known gender: 3174887


In [None]:
# Write the filtered table (rows with a known artist gender only) to a CSV in
# the current working directory, without the positional index column.
df_with_gender_clean.to_csv("spotify_tracks_with_gender_clean.csv", index=False)
