<a href="https://colab.research.google.com/github/nbilasals/music_recommendation/blob/main/music_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Music Recommendation (Spotify)

## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import files
import os
import shutil
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error, mean_absolute_error
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

## Data Preparation

### Loading Data

In [None]:
# uploaded = files.upload()

In [None]:
# kaggle_dir = os.path.expanduser('~/.kaggle')
# os.makedirs(kaggle_dir, exist_ok=True)

# # Pindahkan file kaggle.json
# shutil.move('kaggle.json', os.path.join(kaggle_dir, 'kaggle.json'))

# # Atur permission file
# os.chmod(os.path.join(kaggle_dir, 'kaggle.json'), 0o600)

In [None]:
# !kaggle datasets download -d maharshipandya/-spotify-tracks-dataset

In [None]:
# !unzip /content/-spotify-tracks-dataset.zip

In [None]:
data = pd.read_csv("dataset.csv")

In [None]:
data.tail()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
113995,113995,2C3TZjDRiAzdyViavDJ217,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Sleep My Little Boy,21,384999,False,0.172,0.235,...,-16.393,1,0.0422,0.64,0.928,0.0863,0.0339,125.995,5,world-music
113996,113996,1hIz5L4IB9hN3WRYPOCGPw,Rainy Lullaby,#mindfulness - Soft Rain for Mindful Meditatio...,Water Into Light,22,385000,False,0.174,0.117,...,-18.318,0,0.0401,0.994,0.976,0.105,0.035,85.239,4,world-music
113997,113997,6x8ZfSoqDjuNa5SVP5QjvX,Cesária Evora,Best Of,Miss Perfumado,22,271466,False,0.629,0.329,...,-10.895,0,0.042,0.867,0.0,0.0839,0.743,132.378,4,world-music
113998,113998,2e6sXL2bYv4bSz6VTdnfLs,Michael W. Smith,Change Your World,Friends,41,283893,False,0.587,0.506,...,-10.889,1,0.0297,0.381,0.0,0.27,0.413,135.96,4,world-music
113999,113999,2hETkH7cOfqmz3LqZDHZf5,Cesária Evora,Miss Perfumado,Barbincor,22,241826,False,0.526,0.487,...,-10.204,0,0.0725,0.681,0.0,0.0893,0.708,79.198,4,world-music


In [None]:
unique_genres = data['track_genre'].unique()
print(unique_genres)
print(f"Number of unique genres: {len(unique_genres)}")

['acoustic' 'afrobeat' 'alt-rock' 'alternative' 'ambient' 'anime'
 'black-metal' 'bluegrass' 'blues' 'brazil' 'breakbeat' 'british'
 'cantopop' 'chicago-house' 'children' 'chill' 'classical' 'club' 'comedy'
 'country' 'dance' 'dancehall' 'death-metal' 'deep-house' 'detroit-techno'
 'disco' 'disney' 'drum-and-bass' 'dub' 'dubstep' 'edm' 'electro'
 'electronic' 'emo' 'folk' 'forro' 'french' 'funk' 'garage' 'german'
 'gospel' 'goth' 'grindcore' 'groove' 'grunge' 'guitar' 'happy'
 'hard-rock' 'hardcore' 'hardstyle' 'heavy-metal' 'hip-hop' 'honky-tonk'
 'house' 'idm' 'indian' 'indie-pop' 'indie' 'industrial' 'iranian'
 'j-dance' 'j-idol' 'j-pop' 'j-rock' 'jazz' 'k-pop' 'kids' 'latin'
 'latino' 'malay' 'mandopop' 'metal' 'metalcore' 'minimal-techno' 'mpb'
 'new-age' 'opera' 'pagode' 'party' 'piano' 'pop-film' 'pop' 'power-pop'
 'progressive-house' 'psych-rock' 'punk-rock' 'punk' 'r-n-b' 'reggae'
 'reggaeton' 'rock-n-roll' 'rock' 'rockabilly' 'romance' 'sad' 'salsa'
 'samba' 'sertanejo' 'show

In [None]:
data.columns

Index(['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name',
       'popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
       'track_genre'],
      dtype='object')

In [None]:
print("Numbers of Rows and Columns:")
data.shape

Numbers of Rows and Columns:


(114000, 21)

### Dropping Duplicate Values

In [None]:
# Check for duplicates based on 'track_id'
duplicates = data[data.duplicated(subset=['track_id'])]

# Display the duplicated rows
print("Duplicated rows based on 'track_id':")
print(duplicates)

# Count the number of duplicates
num_duplicates = len(duplicates)
print(f"\nNumber of duplicated track IDs: {num_duplicates}")

Duplicated rows based on 'track_id':
        Unnamed: 0                track_id  \
1925          1925  0CDucx9lKxuCZplLXUz0iX   
2155          2155  2aibwv5hGXSgw7Yru8IYTO   
3000          3000  5E30LdtzQTGqRvNd7l6kG5   
3002          3002  2K7xn816oNHJZ0aVqdQsha   
3003          3003  2QjOHCTQ1Jl3zawyYOpxh6   
...            ...                     ...   
113572      113572  1saXUvvFlAQaefZUFVmhCn   
113605      113605  1Q5jFp1g2Ns4gBsHRpcqhu   
113617      113617  71dLJx3qHOTQMTvvoE2dmd   
113619      113619  6OG5TBCmuTOuWCzSGsETrE   
113641      113641  7xsirhcgFWOnItsGuBfrv9   

                                           artists  \
1925                        Buena Onda Reggae Club   
2155                         Red Hot Chili Peppers   
3000                             The Neighbourhood   
3002                             The Neighbourhood   
3003                             The Neighbourhood   
...                                            ...   
113572                   Bethel 

In [None]:
# Display rows before dropping duplicates
print("Rows before dropping duplicates:")
print(data.shape[0])

# Drop duplicate rows based on all columns
data_no_duplicates = data.drop_duplicates(subset='track_id')


# Display rows after dropping duplicates
print("\nRows after dropping duplicates:")
data_no_duplicates.shape[0]

Rows before dropping duplicates:
114000

Rows after dropping duplicates:


89741

### Dropping Missing Values

In [None]:
# Display number of missing values in each column
print("\nMissing values per column:")
print(data_no_duplicates.isnull().sum())

# Drop rows with any missing values
data = data_no_duplicates.dropna()

# Display the shape of the DataFrame after dropping missing values
print("\nShape of DataFrame after dropping missing values:")
data.shape


Missing values per column:
Unnamed: 0          0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

Shape of DataFrame after dropping missing values:


(89740, 21)

### Drop Unrelevant Columns

In [None]:
data = data.drop('Unnamed: 0', axis=1)

# Now 'data' DataFrame does not have the 'Unnamed: 0' column
data.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre'],
      dtype='object')

## Modelling

### 1. Using K-Nearest Neighbors

These features should represent song characteristics.

In [None]:
# Select the relevant features for the model
X = data[['danceability', 'energy', 'loudness', 'tempo', 'valence']]

Create and Train the KNN Model

In [None]:
# Initialize the KNN model
knn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')

# Fit the model on the features
knn.fit(X)

Define the Recommendation Function

In [None]:
def recommend_tracks(track_id, n_recommendations=5):
    """
    Recommend similar tracks based on KNN.

    Parameters:
        track_id (str): ID of the track for which recommendations are made.
        n_recommendations (int): Number of recommendations to return (default 5).

    Returns:
        pd.DataFrame: Recommended tracks with track name, artist, and popularity.
    """
    if track_id in data['track_id'].values:
        # Get the index of the track in the dataset
        idx_data = data[data['track_id'] == track_id].index[0]

        # Get the index in the feature matrix (X)
        idx_X = X.index.get_loc(idx_data)

        # Find nearest neighbors using KNN
        distances, indices = knn.kneighbors([X.iloc[idx_X]], n_neighbors=n_recommendations + 1)

        # Extract recommended tracks
        recommended_tracks = data.iloc[indices[0][1:]]
        return recommended_tracks[['track_name', 'artists', 'popularity']]
    else:
        print(f"Track ID '{track_id}' not found in the dataset.")
        return None


 Test the Recommendation Function

In [None]:
# Test the function with a specific track ID
recommendations = recommend_tracks('2C3TZjDRiAzdyViavDJ217')

# Display the recommendations
recommendations

### 2. Using Cosine Similarity

Define Features and Normalize

In [24]:
# Define numeric features for modeling
features = ['danceability', 'energy', 'loudness', 'tempo', 'valence']

# Check data types and missing values for selected features
print(data[features].info())  # Ensure all are numeric
print(data[features].isnull().sum())  # Ensure no missing values

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
data[features] = scaler.fit_transform(data[features])

# Display the normalized data
data[features].head()


Unnamed: 0,track_name,artists,similarity
59797,Picture Of A Whisper,Ata Ebtekar,0.999888
26546,The Ultimate Vision,Germaine Franco,0.999833
101676,Shhh Baby Sleep Mother's Soothing Voice - With...,Baby Sleep,0.999814
4303,Pastoral,Christian Löffler,0.999512
76372,Harry's Game,Celtic Woman,0.999414


Define Similarity Function

In [None]:
# Recommendation function based on audio features
def recommend_by_audio(track_id, data, features, k=5):
    # Filter track input
    target_track = data[data['track_id'] == track_id]

    # Check if track_id exists
    if target_track.empty:
        print(f"Track ID '{track_id}' not found in the dataset.")
        return pd.DataFrame()  # Return empty DataFrame if track_id not found

    # Extract target track and other track features
    target_features = target_track[features].values
    other_features = data[features].values

    # Calculate cosine similarity
    similarities = cosine_similarity(target_features, other_features).flatten()

    # Add similarity scores to the dataset
    data['similarity'] = similarities

    # Sort by similarity scores and exclude the input track itself
    recommendations = data.sort_values(by='similarity', ascending=False).iloc[1:k+1]
    return recommendations[['track_name', 'artists', 'similarity']]


In [None]:
# Calculate similarity matrix for all tracks
similarity_matrix = cosine_similarity(data[features])

# Convert to DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=data['track_name'], columns=data['track_name'])

# Display a sample of the similarity matrix
similarity_df.iloc[:10, :10]


Test the Recommendation System

In [None]:
# Example usage: Replace '2C3TZjDRiAzdyViavDJ217' with a valid track_id from your dataset
track_id = '2C3TZjDRiAzdyViavDJ217'

recommendations = recommend_by_audio(track_id=track_id, data=data, features=features, k=5)

# Display the recommendations
print("Recommendations for the track:")
print(recommendations)

In [None]:
# Inisialisasi TfidfVectorizer
tf = TfidfVectorizer()

# Melakukan perhitungan idf pada data
tf.fit(data['track_genre'])

# Mapping array dari fitur index integer ke fitur nama
tf.get_feature_names_out()

array(['acoustic', 'afrobeat', 'age', 'alt', 'alternative', 'ambient',
       'and', 'anime', 'bass', 'black', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago', 'children', 'chill',
       'classical', 'club', 'comedy', 'country', 'dance', 'dancehall',
       'death', 'deep', 'detroit', 'disco', 'disney', 'drum', 'dub',
       'dubstep', 'edm', 'electro', 'electronic', 'emo', 'film', 'folk',
       'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth',
       'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard',
       'hardcore', 'hardstyle', 'heavy', 'hip', 'honky', 'hop', 'house',
       'idm', 'idol', 'indian', 'indie', 'industrial', 'iranian', 'jazz',
       'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal',
       'metalcore', 'minimal', 'mpb', 'music', 'new', 'opera', 'pagode',
       'party', 'piano', 'pop', 'power', 'progressive', 'psych', 'punk',
       'reggae', 'reggaeton', 'rock', 'rockabilly', 'roll', 'romance',
  

In [None]:
# Melakukan fit lalu ditransformasikan ke bentuk matrix
tfidf_matrix = tf.fit_transform(data['track_genre'])

# Melihat ukuran matrix tfidf
tfidf_matrix.shape

(114000, 114)

In [None]:
# Mengubah vektor tf-idf dalam bentuk matriks dengan fungsi todense()
tfidf_matrix.todense()

matrix([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.70710678],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.70710678],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.70710678]])

In [None]:
#Tampilkan dalam DataFrame
pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tf.get_feature_names_out(),
    index=data.track_name
).sample(22, axis=1).sample(10, axis=0)

Unnamed: 0_level_0,heavy,hardcore,hard,show,dance,french,guitar,samba,forro,trance,...,salsa,minimal,new,idm,progressive,dancehall,death,psych,disco,indie
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cyber Ravage - Vector Seven Remix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yearning = Alchemical Fire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Todos Los Besos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Loving You Like Always,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.777611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Estrella Roja,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Counting Stars,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Amber Glow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chainsaw man main theme but it's lofi hiphop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wishful Drinking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"A State Of Trance (ASOT 1090) - Interview with GXD, Pt. 3",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Sekarang, kita akan menghitung derajat kesamaan (similarity degree) antar musik dengan teknik cosine similarity.

In [None]:
# Menghitung cosine similarity pada matrix tf-idf
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 1.]])

In [None]:
# Membuat dataframe dari variabel cosine_sim dengan baris dan kolom berupa nama
cosine_sim_df = pd.DataFrame(cosine_sim, index=data['track_name'], columns=data['track_name'])
print('Shape:', cosine_sim_df.shape)

# Melihat similarity matrix pada setiap resto
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (114000, 114000)


In [None]:
def music_recommendations(nama_track, similarity_data=cosine_sim_df, items=data[['track_name', 'track_genre','artists']], k=5):
    # Mengambil data dengan menggunakan argpartition untuk melakukan partisi secara tidak langsung sepanjang sumbu yang diberikan
    # Dataframe diubah menjadi numpy
    # Range(start, stop, step)
    index = similarity_data.loc[:,nama_track].to_numpy().argpartition(
        range(-1, -k, -1))

    # Mengambil data dengan similarity terbesar dari index yang ada
    # Convert 'index' to 1D array before using it to index 'similarity_data.columns'
    closest = similarity_data.columns[index[-1:-(k+2):-1].ravel()]

    # Drop nama_resto agar nama resto yang dicari tidak muncul dalam daftar rekomendasi
    closest = closest.drop(nama_track, errors='ignore')

    return pd.DataFrame(closest).merge(items).head(k)

In [None]:
data[data.track_name.eq('Boy In Luv')]

In [None]:
music_recommendations('Boy In Luv')