In [2]:
import sys

sys.path.append('../')
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot
import re
from textblob import TextBlob
import random

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups for the Spotify credentials can find them.
load_dotenv()

ModuleNotFoundError: No module named 'spotipy'

In [13]:
# Shared Spotify Web API client used by the cells and helpers that call `sp`.
# Credentials come from the environment variables SPOTIFY_CLIENT_ID and
# SPOTIFY_API_KEY; os.getenv() yields None for a variable that is not set.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.getenv("SPOTIFY_CLIENT_ID"),
        client_secret=os.getenv("SPOTIFY_API_KEY")
    ))

Functions:

In [14]:
def df_cleaner(df):
    """Reduce a raw track table to its remaining (feature) columns.

    Rows that repeat an earlier ``name`` are removed from ``df`` itself, so
    the caller's frame is modified in place. The metadata columns listed
    below are then dropped (missing ones are ignored) and the surviving
    columns are returned in alphabetical order.

    Parameters
    ----------
    df : pandas.DataFrame
        Track table; must contain a ``name`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without the metadata columns, columns sorted by name.
    """
    metadata_columns = [
        'album',
        'artists',
        'available_markets',
        'disc_number',
        'explicit',
        'external_ids',
        'external_urls',
        'href',
        'id',
        'is_local',
        '...',
        'track_href',
        'analysis_url',
        'uri',
        'preview_url',
        'type',
        'name',
    ]

    # In place: the caller's frame loses the duplicate rows too.
    df.drop_duplicates(subset='name', inplace=True)

    features = df.drop(columns=metadata_columns, errors='ignore')
    ordered_columns = sorted(features.columns)
    return features.reindex(columns=ordered_columns)


def get_top_tracks(max_page, per_page, keyword):
    """Search Spotify for tracks matching ``keyword`` and attach audio features.

    Uses the module-level ``sp`` client.

    Parameters
    ----------
    max_page : int
        Maximum number of result pages to request in total. Paging stops
        earlier when the API reports no further page.
    per_page : int
        Number of tracks requested per page (sent as the search ``limit``).
    keyword : str
        Search query used for every page.

    Returns
    -------
    list[dict]
        One dict per track: the search item merged with its audio features.
        Audio-feature keys win when both dicts share a key. Tracks for which
        the API returns no audio features keep only their search fields.
    """
    tracks = []
    for page in range(max_page):
        # The API offset counts items, not pages.
        results = sp.search(
            q=keyword,
            limit=per_page,
            offset=page * per_page,
            type="track",
        )
        tracks.extend(results['tracks']['items'])
        if not results['tracks'].get('next'):
            break

    final_data = []
    batch_size = 100  # audio_features() accepts at most 100 tracks per call
    for start in range(0, len(tracks), batch_size):
        batch = tracks[start:start + batch_size]
        features = sp.audio_features([track["uri"] for track in batch])
        for track, track_features in zip(batch, features):
            # An entry is None when Spotify has no analysis for that track.
            final_data.append({**track, **(track_features or {})})

    return final_data


def index_to_instance(df, index=None):
    """Look up an entry in the sorted (code, label) list built by ``XYZ``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame passed straight to ``XYZ``.
    index : int, optional
        Position in the sorted list. ``None`` (the default) requests the
        whole list. Position ``0`` is a valid request and returns the first
        entry's label.

    Returns
    -------
    object or list[tuple]
        The label stored at ``index``, or the full list when ``index`` is
        ``None``.
    """
    pairs = XYZ(df)
    # Compare against None explicitly: a plain truth test would treat
    # position 0 as "no index given".
    if index is None:
        return pairs
    return pairs[index][1]


def XYZ(df):
    """Return sorted ``(code, label)`` pairs from the first index level.

    Zips the integer codes of index level 0 with that level's label values
    (element i with element i) and sorts the resulting tuples.
    Requires an index that exposes ``codes`` and ``levels``.
    """
    level_codes = df.index.codes[0].data
    level_labels = df.index.levels[0].array
    pairs = [(code, label) for code, label in zip(level_codes, level_labels)]
    pairs.sort()
    return pairs


def value_to_index_map(array):
    """Pair every element of ``array`` with its position.

    Returns a lazy ``zip`` of ``(value, position)`` tuples; ``array`` must
    support ``len()``.
    """
    positions = range(len(array))
    return zip(array, positions)


class RecSysContentBased:
    """Content-based recommender built on pairwise distances between rows."""

    def __init__(self):
        """Create an empty recommender; all state is set by ``fit``."""

    def fit(self, train):
        """Store ``train`` and precompute its row-to-row matrices.

        Sets ``train_set`` (the input itself), ``similarity`` (cosine
        similarity) and ``distances`` (Euclidean distance).
        """
        self.train_set = train
        self.similarity = cosine_similarity(train)
        self.distances = pairwise_distances(train, metric='euclidean')

    def evaluate(self, user):
        """Rank all training rows by ascending distance from row ``user``.

        Each ranked row position is passed through ``index_to_instance``
        and the results are returned as a list, closest first.
        """
        ranked = sorted(value_to_index_map(self.distances[user]))
        return [
            index_to_instance(self.train_set, position)
            for _distance, position in ranked
        ]

    def predict(self):
        """Placeholder; not implemented."""

    def test(self, testset):
        """Placeholder; not implemented."""




Load and clean the data

In [29]:
# Load the exported track table.
baseDf = pd.read_csv('spotify_data.csv')
# NOTE(review): XYZ() reads df.index.codes / df.index.levels, which the default
# index from read_csv does not provide; presumably the disabled line below was
# meant to supply such an index - confirm before using RecSysContentBased.evaluate.
# baseDf.index = [baseDf["name"]]

# Clean first: df_cleaner() removes duplicate track names from baseDf in place.
# Taking the feature subset afterwards keeps `data`, `df` and `baseDf` on the
# same set of rows.
df = df_cleaner(baseDf)
data = baseDf[[
    'energy', 'danceability', 'loudness', 'liveness', 'valence',
    'duration_ms', 'acousticness', 'speechiness', 'popularity']]

Standardize the feature columns

In [16]:
# Fit a StandardScaler on the cleaned feature table and keep the result both
# as a raw array and as a DataFrame with the original column names.
# X keeps df's index, while X_scaled_df gets a fresh 0..n-1 index.
X = pd.DataFrame(df, columns=df.columns.tolist())
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

Cluster the scaled data into 6 clusters with k-means

In [17]:
# Fit the k-means model (6 clusters, fixed random seed) on the scaled features.
kmeans = KMeans(n_clusters=6, random_state=1234)
kmeans.fit(X_scaled_df)

In [18]:
# Cluster label of every scaled row, and the distinct labels that occur.
labels = kmeans.predict(X_scaled_df)
clusters = np.unique(labels)

In [19]:
# Rebind `clusters` to the per-row labels and store them as a new column on X.
# The label array is assigned by position, one label per row of X.
clusters = kmeans.predict(X_scaled_df)
X["cluster"] = clusters

In [20]:
# Second k-means configuration: 50 clusters, k-means++ seeding, 50 restarts,
# at most 10 iterations per restart, zero tolerance, Elkan's algorithm.
# NOTE(review): kmeans2 is not referenced by any later cell shown here.
kmeans2 = KMeans(n_clusters=50,
                 init="k-means++",
                 n_init=50,  # try with 1, 4, 8, 20, 30, 100...
                 max_iter=10,
                 tol=0,
                 algorithm="elkan",
                 random_state=1234)
kmeans2.fit(X_scaled_df)

In [21]:
# Elbow analysis: fit one k-means model per candidate k and record its inertia.
K = range(2, 21)
inertia = []

for k in K:
    # Use the loop's k (not a fixed cluster count) so every entry of `inertia`
    # belongs to a different k. A separate variable name keeps the fitted
    # `kmeans` model, whose labels are stored in X["cluster"], from being
    # overwritten by the last model of this loop.
    elbow_model = KMeans(n_clusters=k,
                         random_state=1234)
    elbow_model.fit(X_scaled_df)
    inertia.append(elbow_model.inertia_)

In [22]:
# Fit the content-based recommender on the unscaled feature subset `data`.
model = RecSysContentBased()
model.fit(data)

In [31]:
# Ask for a song title and show up to four catalogue matches.
user_input = input('what do you like?')
textBlb = TextBlob(user_input)  # Making our first textblob
# NOTE(review): the spell-corrected text is computed but the search below uses
# the raw input - confirm which one is intended.
textCorrected = str(textBlb.correct())
# Match the typed text literally (regex=False) so characters such as "(" or "*"
# are not interpreted as a pattern, and treat rows without a name as
# non-matches (na=False) so the boolean mask never contains NaN.
name_matches = baseDf['name'].str.contains(user_input, case=False, regex=False, na=False)
search_result = baseDf[name_matches][:4]
guess_message = ''
search_result

Unnamed: 0,album,artists,available_markets,disc_number,duration_ms,explicit,external_ids,external_urls,href,id,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,track_href,analysis_url,time_signature
12,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",1,169907,True,{'isrc': 'QMCE32000236'},{'spotify': 'https://open.spotify.com/track/4Y...,https://api.spotify.com/v1/tracks/4YhJ3a6kBPGJ...,4YhJ3a6kBPGJWX4Ek77sDd,...,0,0.16,0.206,0.0,0.417,0.481,112.985,https://api.spotify.com/v1/tracks/4YhJ3a6kBPGJ...,https://api.spotify.com/v1/audio-analysis/4YhJ...,4
35,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AM', 'AO', 'AR', 'AT', 'AU...",1,187333,False,{'isrc': 'TCACN1687807'},{'spotify': 'https://open.spotify.com/track/0S...,https://api.spotify.com/v1/tracks/0So2sgVa8aJi...,0So2sgVa8aJiARPl2P29u2,...,0,0.0287,0.85,0.243,0.107,0.311,90.024,https://api.spotify.com/v1/tracks/0So2sgVa8aJi...,https://api.spotify.com/v1/audio-analysis/0So2...,4
219,"{'album_type': 'single', 'artists': [{'externa...",[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",1,296011,False,{'isrc': 'USRC12000150'},{'spotify': 'https://open.spotify.com/track/6M...,https://api.spotify.com/v1/tracks/6MO2bfLHKykU...,6MO2bfLHKykUgCChFdw91H,...,1,0.244,0.0145,0.000102,0.14,0.17,96.981,https://api.spotify.com/v1/tracks/6MO2bfLHKykU...,https://api.spotify.com/v1/audio-analysis/6MO2...,4
259,"{'album_type': 'album', 'artists': [{'external...",[{'external_urls': {'spotify': 'https://open.s...,"['AD', 'AE', 'AG', 'AL', 'AM', 'AO', 'AR', 'AT...",1,215000,True,{'isrc': 'USUM72112384'},{'spotify': 'https://open.spotify.com/track/2U...,https://api.spotify.com/v1/tracks/2UwALqx6yOsX...,2UwALqx6yOsXTFt7zRxnts,...,1,0.332,0.251,6e-06,0.349,0.678,143.971,https://api.spotify.com/v1/tracks/2UwALqx6yOsX...,https://api.spotify.com/v1/audio-analysis/2UwA...,4


In [32]:
# List the matches as (position, title) pairs, one per line; the position is
# the number the next cell uses to pick a row from search_result.
options = list(enumerate(search_result['name']))
print(*options, sep='\n')

(0, 'TOP FLOOR (feat. Travis Scott)')
(1, 'Pope Is a Rockstar')
(2, 'Slide (Remix) (feat. Pop Smoke, A Boogie Wit da Hoodie & Chris Brown) (feat. Pop Smoke)')
(3, 'Tell The Vision (feat. Kanye West & Pusha T)')


In [33]:
# Read the position of the chosen match. input() always returns a string,
# while .iloc only accepts integer positions, so convert before indexing.
# A non-numeric answer raises ValueError; an out-of-range one raises IndexError.
user_choice_input_data = input(guess_message)
user_choice_index = int(user_choice_input_data)
user_choice_id = search_result.iloc[[user_choice_index]]['id'].values[0]
user_choice_id

'4YhJ3a6kBPGJWX4Ek77sDd'

Fetch the chosen song's metadata and audio features from the Spotify API

In [62]:
# Fetch the chosen track and merge its audio features into the same dict.
# With {**a, **b}, a key present in both dicts takes the value from b, i.e.
# from the audio-features record here.
track_data = sp.track(track_id=user_choice_id)
track_data_featured = {**track_data, **sp.audio_features(track_data['uri'])[0]}
#track_data_featured = sp.audio_features(track_data['uri'])[0]
track_data_featured

{'album': {'album_type': 'album',
  'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/2hlmm7s2ICUX0LVIhVFlZQ'},
    'href': 'https://api.spotify.com/v1/artists/2hlmm7s2ICUX0LVIhVFlZQ',
    'id': '2hlmm7s2ICUX0LVIhVFlZQ',
    'name': 'Gunna',
    'type': 'artist',
    'uri': 'spotify:artist:2hlmm7s2ICUX0LVIhVFlZQ'}],
  'available_markets': ['AD',
   'AE',
   'AG',
   'AL',
   'AM',
   'AO',
   'AR',
   'AT',
   'AU',
   'AZ',
   'BA',
   'BB',
   'BD',
   'BE',
   'BF',
   'BG',
   'BH',
   'BI',
   'BJ',
   'BN',
   'BO',
   'BR',
   'BS',
   'BT',
   'BW',
   'BY',
   'BZ',
   'CA',
   'CD',
   'CG',
   'CH',
   'CI',
   'CL',
   'CM',
   'CO',
   'CR',
   'CV',
   'CW',
   'CY',
   'CZ',
   'DE',
   'DJ',
   'DK',
   'DM',
   'DO',
   'DZ',
   'EC',
   'EE',
   'EG',
   'ES',
   'FI',
   'FJ',
   'FM',
   'FR',
   'GA',
   'GB',
   'GD',
   'GE',
   'GH',
   'GM',
   'GN',
   'GQ',
   'GR',
   'GT',
   'GW',
   'GY',
   'HK',
   'HN',
   'HR',
   'HT',
   'HU

In [63]:
# One-row feature frame for the chosen track, restricted to (and ordered like)
# the columns of the cleaned training table `df`.
track_data_X = pd.DataFrame(track_data_featured, columns=list(df.columns), index=[0])
# Scale with the scaler that was already fitted on the training data, so the
# new row is expressed in the same units as X_scaled_df. A fresh, unfitted
# scaler is not needed here and one transform call is enough.
track_data_X_scaled = scaler.transform(track_data_X)
track_data_X_scaled_df = pd.DataFrame(track_data_X_scaled, columns=track_data_X.columns)
track_data_X_scaled

array([[-0.17956252,  1.16073101, -0.75276399, -0.57533769, -0.37562714,
         0.76993632,  1.66430593, -0.04534059, -1.51338117, -0.11977677,
         0.52406452, -0.24079105,  0.15702718,  1.10761802, -0.09497974]])

Predict the cluster of the chosen song

In [84]:
# Assign the chosen track to a cluster with a single predict() call.
# NOTE(review): `kmeans` must be the same fitted model that produced
# X["cluster"], otherwise the label numbers are not comparable.
track_data_labels = kmeans.predict(track_data_X_scaled_df)
# One-element label array for the single input row.
track_data_cluster = track_data_labels
track_data_X["cluster"] = track_data_cluster


1

In [102]:
# Pick one random catalogue track from the cluster of the chosen song.
# track_data_cluster is a one-element array; take the element explicitly,
# because calling int() on an array with ndim > 0 is deprecated in NumPy.
chosen_cluster = int(track_data_cluster[0])
# The mask comes from X, so it relies on X and baseDf sharing the same index.
recommend_track = baseDf.loc[X['cluster'] == chosen_cluster].sample()
recommend_track['track_href'].values[0]

'https://api.spotify.com/v1/tracks/2AmEv442DGwSxMGZ9XEvBA'

In [104]:

from IPython.display import IFrame

# Embed the Spotify player for the recommended track. The embeddable player
# lives at open.spotify.com/embed/track/<id>; `track_href` is the Web API
# endpoint (api.spotify.com/v1/tracks/<id>), which serves JSON to API clients
# rather than a page that can be shown in an iframe.
track_id = recommend_track['id'].values[0]
# Extra keyword arguments are appended to the URL as query parameters by IFrame.
IFrame(src=f"https://open.spotify.com/embed/track/{track_id}",
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media",
      )