In [1]:
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spotify-dataset/data/data_by_year.csv
/kaggle/input/spotify-dataset/data/data_by_genres.csv
/kaggle/input/spotify-dataset/data/data_w_genres.csv
/kaggle/input/spotify-dataset/data/data_by_artist.csv
/kaggle/input/spotify-dataset/data/data.csv


In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from collections import defaultdict
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Saving data from csv to pandas dataframe
data = pd.read_csv("../input/spotify-dataset/data/data.csv")
genre_data = pd.read_csv('../input/spotify-dataset/data/data_by_genres.csv')
year_data = pd.read_csv('../input/spotify-dataset/data/data_by_year.csv')
artist_data = pd.read_csv('../input/spotify-dataset/data/data_by_artist.csv')

In [4]:
data.sample(4)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
150287,0.75,1993,0.0164,['Heavy Nopal'],0.467,203685,0.634,0,7ArHgkupud8056QGPwrLhz,0.0,2,0.158,-15.649,1,Asalto Chido,43,1993,0.0637,75.189
132469,0.472,1978,0.192,['Patti Smith'],0.538,248533,0.666,0,6rH42cutB9ZnEmTwgFY6tq,0.0,9,0.134,-8.185,1,Space Monkey,33,1978,0.0361,105.618
66717,0.865,1978,0.0065,['Cheap Trick'],0.527,224800,0.939,0,5e4kUj91oLjKxxiT8akiQV,3.8e-05,9,0.177,-7.97,1,California Man,34,1978-04,0.0467,155.838
67317,0.869,1981,0.152,['The Rolling Stones'],0.511,212027,0.892,0,0SpNlEAUqNsuij5xi7Z7cQ,0.262,0,0.638,-4.701,1,Black Limousine - Remastered,38,1981-08-24,0.0588,108.959


In [5]:
genre_data.sample(4)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
2251,1,psychedelic doom,0.059968,0.305506,347691.1875,0.741812,0.655865,0.152131,-8.571938,0.057419,117.161187,0.334875,43.0625,1
2479,1,ska argentino,0.106157,0.673826,234711.043989,0.690049,0.015024,0.214886,-7.623294,0.058875,117.273917,0.699855,57.942444,9
1810,1,modern hard rock,0.084078,0.553648,229278.545333,0.724673,0.008138,0.190052,-7.396835,0.05606,114.514197,0.508317,46.420667,11
2307,1,rave,0.085144,0.610495,298809.503128,0.848479,0.331842,0.207249,-7.149814,0.110109,130.181358,0.457179,49.27126,11


In [6]:
year_data.sample(4)

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
24,1,1945,0.709657,0.519143,196161.557,0.226044,0.275894,0.20301,-16.981472,0.305097,108.32407,0.491361,2.1265,0
15,1,1936,0.772312,0.558006,220809.186364,0.308389,0.25711,0.221438,-14.612999,0.279029,109.888755,0.564064,5.080909,10
61,1,1982,0.28958,0.564259,248357.306,0.590919,0.100016,0.201007,-10.933122,0.065853,120.918607,0.586438,36.247,2
81,1,2002,0.282624,0.57616,239503.283,0.64127,0.088048,0.193911,-7.68664,0.084308,119.239738,0.542397,48.6555,7


In [7]:
artist_data.sample(4)

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
7836,0,2,0.00549,Evildead,0.433,257667.0,0.947,0.0257,0.383,-12.672,0.0781,99.824,0.251,33.0,8
1356,1,5,0.10668,Annika Wells,0.4888,225487.0,0.724,2e-06,0.09152,-3.6372,0.0413,115.256,0.2434,70.2,7
5701,1,2,0.0498,Dara Jamchan,0.532,114155.0,0.642,0.047,0.421,-8.08,0.0365,120.243,0.945,27.0,0
15731,1,18,0.976778,Madhubala Jhaveri,0.538111,200880.944444,0.222289,0.301595,0.124433,-12.084333,0.123517,117.617722,0.601444,0.111111,7


In [8]:
# Typecasting columns for better understanding of the datasets
data['year'] = pd.to_datetime(data['year'], format='%Y')
data['release_date'] = pd.to_datetime(data['release_date'], format = 'mixed')
year_data['year'] = pd.to_datetime(year_data['year'], format='%Y')

In [9]:
# Combining the datasets for convinent access
datasets = [("data", data), ("genre_data", genre_data), ("year_data", year_data), ("artist_data", artist_data)]

In [10]:
for name, df in datasets:
    # print some info about the datasets
    print(f"Info about the dataset: {name}")
    print("-"*30)
    print(df.info())
    print()

Info about the dataset: data
------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   valence           170653 non-null  float64       
 1   year              170653 non-null  datetime64[ns]
 2   acousticness      170653 non-null  float64       
 3   artists           170653 non-null  object        
 4   danceability      170653 non-null  float64       
 5   duration_ms       170653 non-null  int64         
 6   energy            170653 non-null  float64       
 7   explicit          170653 non-null  int64         
 8   id                170653 non-null  object        
 9   instrumentalness  170653 non-null  float64       
 10  key               170653 non-null  int64         
 11  liveness          170653 non-null  float64       
 12  loudness          170653 non-null  float64       
 13 

In [11]:
for name, df in datasets:
    print(f"Missing Values in: {name}")
    print("-"*30)
    print(df.isnull().sum())
    print()

Missing Values in: data
------------------------------
valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

Missing Values in: genre_data
------------------------------
mode                0
genres              0
acousticness        0
danceability        0
duration_ms         0
energy              0
instrumentalness    0
liveness            0
loudness            0
speechiness         0
tempo               0
valence             0
popularity          0
key                 0
dtype: int64

Missing Values in: year_data
------------------------------
mode                0
year                0
acousticness        0
danc

In [12]:
for name, df in datasets:
    print(f"Duplicates in the dataset: {name}")
    print("-"*30)
    print(df.duplicated(keep=False).sum())
    print()

Duplicates in the dataset: data
------------------------------
0

Duplicates in the dataset: genre_data
------------------------------
0

Duplicates in the dataset: year_data
------------------------------
0

Duplicates in the dataset: artist_data
------------------------------
0



In [13]:
for name, df in datasets:
    print(f"Unique Values in: {name}")
    print("-"*30)
    print(df.nunique())
    print()

Unique Values in: data
------------------------------
valence               1733
year                   100
acousticness          4689
artists              34088
danceability          1240
duration_ms          51755
energy                2332
explicit                 2
id                  170653
instrumentalness      5401
key                     12
liveness              1740
loudness             25410
mode                     2
name                133638
popularity             100
release_date         10968
speechiness           1626
tempo                84694
dtype: int64

Unique Values in: genre_data
------------------------------
mode                   2
genres              2973
acousticness        2798
danceability        2725
duration_ms         2872
energy              2778
instrumentalness    2731
liveness            2709
loudness            2873
speechiness         2707
tempo               2872
valence             2745
popularity          2188
key                   12
dtype: in

In [14]:
# Popularity Trends Over Years
fig = px.line(year_data, x='year', y='popularity', title='Popularity Trends Over the Years', labels={'year': 'Years --->', 'popularity': "Popularity --->"})
fig.show()

In [15]:
# Converting release_date to datetime and extract decade
data['release_decade'] = (data['release_date'].dt.year // 10) * 10

# Counting the number of songs per decade
decade_counts = data['release_decade'].value_counts().sort_index()

# Creating a bar chart for songs per decade
fig = px.bar(x=decade_counts.index, y=decade_counts.values, labels={'x': 'Decade --->', 'y': 'Number of Songs --->', 'color':"Color"},
             title='Number of Songs Released per Decade', color=decade_counts.index, color_continuous_scale='Rainbow')
fig.update_layout(xaxis_type='category')
fig.show()

In [16]:
# Changes in Tempo Over the Years
fig = px.scatter(year_data, x='year', y='tempo', color='tempo', size='popularity',
                 title='Changes in Tempo Over the Years', labels={'tempo': 'Tempo --->', "year":"Years --->"})
fig.show()

In [17]:
# Convert year column back
data['year'] = data['year'].dt.year

In [18]:
# List of numerical columns to consider for similarity calculations
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit', 'year',
               'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']

In [19]:
# Function to retrieve song data for a given song name
def get_song_data(name, data):
    try:
        return data[data['name'].str.lower() == name].iloc[0]
        return song_data
    except IndexError:
        return None

In [20]:
# Function to calculate the mean vector of a list of songs
def get_mean_vector(song_list, data):
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song['name'], data)
        if song_data is None:
            print('Warning: {} does not exist in the dataset'.format(song['name']))
            return None
        song_vector = song_data[number_cols].values
        song_vectors.append(song_vector)
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

In [21]:
# Function to flatten a list of dictionaries into a single dictionary
def flatten_dict_list(dict_list):
    flattened_dict = defaultdict()
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict

In [22]:
# Normalize the song data using Min-Max Scaler
min_max_scaler = MinMaxScaler()
normalized_data = min_max_scaler.fit_transform(data[number_cols])

# Standardize the normalized data using Standard Scaler
standard_scaler = StandardScaler()
scaled_normalized_data = standard_scaler.fit_transform(normalized_data)

In [23]:
# Function to recommend songs based on a list of seed songs
def recommend_songs(seed_songs, data, n_recommendations=10):
    metadata_cols = ['name', 'artists', 'year']
    song_center = get_mean_vector(seed_songs, data)
    
    # Return an empty list if song_center is missing
    if song_center is None:
        return []
    
    # Normalize the song center
    normalized_song_center = min_max_scaler.transform([song_center])
    
    # Standardize the normalized song center
    scaled_normalized_song_center = standard_scaler.transform(normalized_song_center)
    
    # Calculate Euclidean distances and get recommendations
    distances = cdist(scaled_normalized_song_center, scaled_normalized_data, 'euclidean')
    index = np.argsort(distances)[0]
    
    # Filter out seed songs and duplicates, then get the top n_recommendations
    rec_songs = []
    for i in index:
        song_name = data.iloc[i]['name']
        if song_name not in [song['name'] for song in seed_songs] and song_name not in [song['name'] for song in rec_songs]:
            rec_songs.append(data.iloc[i])
            if len(rec_songs) == n_recommendations:
                break
    
    return pd.DataFrame(rec_songs)[metadata_cols].to_dict(orient='records')

In [24]:
# List of seed songs (replace with your own seed songs)
seed_songs = [
    {'name': 'Come As You Are'},
    {'name': 'Smells Like Teen Spirit'},
    # Add more seed songs as needed
]
seed_songs = [{'name': name['name'].lower()} for name in seed_songs]

# Number of recommended songs
n_recommendations = 10

# Call the recommend_songs function
recommended_songs = recommend_songs(seed_songs, data, n_recommendations)

# Convert the recommended songs to a DataFrame
recommended_df = pd.DataFrame(recommended_songs)

# Print the recommended songs
for idx, song in enumerate(recommended_songs, start=1):
    print(f"{idx}. {song['name']} by {song['artists']} ({song['year']})")

1. No Excuses by ['Alice In Chains'] (1994)
2. Come As You Are by ['Nirvana'] (1991)
3. Smells Like Teen Spirit by ['Nirvana'] (1991)
4. Born in the U.S.A. by ['Bruce Springsteen'] (1984)
5. Breakfast At Tiffany's by ['Deep Blue Something'] (1995)
6. Malibu by ['Hole'] (1998)
7. Fuel by ['Metallica'] (1997)
8. Sleep Now In the Fire by ['Rage Against The Machine'] (1999)
9. When You're Gone by ['Bryan Adams', 'Melanie C'] (1998)
10. Outshined by ['Soundgarden'] (1991)
