# Spotify Recommender
#### Setting up the environment

In [2]:
# import necessary libraries
from dotenv import load_dotenv
import os
import base64
from requests import post, get
import json
import pandas as pd
import numpy as np
import matplotlib as plt

# load the variables defined in the local .env file into the process
# environment, then read the Spotify API credentials from it
load_dotenv()
client_id, client_secret = (
    os.environ.get("CLIENT_ID"),
    os.environ.get("CLIENT_SECRET"),
)


#### Functions to set up token authorization

In [3]:
# function to generate the token
def get_token():
    """Request an access token from Spotify using the client-credentials grant.

    Reads the module-level ``client_id`` and ``client_secret`` and sends them
    as an HTTP Basic authorization header to the accounts token endpoint.

    Returns:
        The ``access_token`` value from the endpoint's JSON response.

    Raises:
        ValueError: if ``client_id`` or ``client_secret`` is missing or empty.
        requests.HTTPError: if the token endpoint answers with an error status.
    """
    # os.getenv yields None for an unset variable; fail with a clear message
    # instead of a TypeError from concatenating None and str
    if not client_id or not client_secret:
        raise ValueError(
            "CLIENT_ID and CLIENT_SECRET must be set in the environment (.env file)"
        )

    auth_string = client_id + ":" + client_secret
    auth_base64 = base64.b64encode(auth_string.encode("utf-8")).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded",
    }
    data = {"grant_type": "client_credentials"}
    result = post(url, headers=headers, data=data)
    # report rejected credentials / rate limiting as an HTTP error rather than
    # as a KeyError on the missing "access_token" key below
    result.raise_for_status()
    return result.json()["access_token"]

# function to generate the authorization header
def get_auth_header(token):
    """Return the header dict carrying ``token`` as a bearer credential."""
    bearer_value = "Bearer " + token
    return dict(Authorization=bearer_value)

#### Functions getting info from Spotify's Web API

In [4]:
# search for artist by name
def search_for_artist(token, artist_name):
    """Search Spotify for an artist and return the top match.

    Args:
        token: access token used to build the authorization header.
        artist_name: free-text artist name to search for.

    Returns:
        The first item of the search response's ``artists.items`` list, or
        None (after printing a message) when the list is empty.

    Raises:
        requests.HTTPError: if the search endpoint answers with an error status.
    """
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    # pass the query through ``params`` so the name is URL-encoded; building
    # the query string by hand breaks on names containing "&", "#" or "+"
    params = {"q": artist_name, "type": "artist", "limit": 1}

    result = get(url, headers=headers, params=params)
    # an error body has no "artists" key; raise the HTTP error instead
    result.raise_for_status()
    json_result = result.json()["artists"]["items"]
    if not json_result:
        print("No artist with this name exists...")
        return None
    return json_result[0]

# search for track by name
def search_for_track(token, track_name):
    """Search Spotify for a track and return the top match.

    Args:
        token: access token used to build the authorization header.
        track_name: free-text track name to search for.

    Returns:
        The first item of the search response's ``tracks.items`` list, or
        None (after printing a message) when the list is empty.

    Raises:
        requests.HTTPError: if the search endpoint answers with an error status.
    """
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    # pass the query through ``params`` so the name is URL-encoded; building
    # the query string by hand breaks on titles containing "&", "#" or "+"
    params = {"q": track_name, "type": "track", "limit": 1}

    result = get(url, headers=headers, params=params)
    # an error body has no "tracks" key; raise the HTTP error instead
    result.raise_for_status()
    json_result = result.json()["tracks"]["items"]
    if not json_result:
        print("No track with this name exists...")
        return None
    return json_result[0]

# get artist by artist id
def get_artist(token, artist_id):
    """Fetch the artist resource for ``artist_id`` and return its decoded JSON body."""
    artist_url = f"https://api.spotify.com/v1/artists/{artist_id}"
    response = get(artist_url, headers=get_auth_header(token))
    return json.loads(response.content)

# get songs by artist from artist id
def get_songs_by_artist(token, artist_id):
    """Return the ``tracks`` list from the artist's top-tracks response (country CA)."""
    top_tracks_url = f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks?country=CA"
    response = get(top_tracks_url, headers=get_auth_header(token))
    payload = json.loads(response.content)
    return payload["tracks"]

# get all available markets on Spotify (CA, US, etc)
def get_markets(token):
    """Query the markets endpoint and return its decoded JSON body."""
    response = get(
        "https://api.spotify.com/v1/markets",
        headers=get_auth_header(token),
    )
    return json.loads(response.content)

# get all available genres in the database
def get_genres(token):
    """Query the available-genre-seeds endpoint and return its decoded JSON body.

    The raw response object is printed first, which shows the HTTP status.
    """
    response = get(
        "https://api.spotify.com/v1/recommendations/available-genre-seeds",
        headers=get_auth_header(token),
    )
    print("genre result:", response)
    return json.loads(response.content)

# get recommendations based on tracks
def get_recommendations(token, seed_tracks, limit=1, market="CA"):
    """Return Spotify's recommended tracks for the given seed track(s).

    Args:
        token: access token used to build the authorization header.
        seed_tracks: seed track id, sent as the ``seed_tracks`` query
            parameter (presumably a comma-separated string for several
            seeds; confirm against the Spotify API reference).
        limit: number of recommendations to request; defaults to 1, the
            value that was previously hard-coded.
        market: market code for the request; defaults to "CA", the value
            that was previously hard-coded.

    Returns:
        The ``tracks`` list from the response body.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status
            (for example 429 when rate limited).
    """
    url = "https://api.spotify.com/v1/recommendations"
    headers = get_auth_header(token)
    # let requests build and URL-encode the query string
    params = {"market": market, "limit": limit, "seed_tracks": seed_tracks}
    result = get(url, headers=headers, params=params)
    # an error body has no "tracks" key; raise the HTTP error instead
    result.raise_for_status()
    return result.json()["tracks"]


#### Set up token and play around

In [5]:
# fetch a token, look up one artist and list that artist's top tracks
token = get_token()
result = search_for_artist(token, "Of Monsters and Men")
# search_for_artist returns None when nothing matches; stop with a clear
# message here instead of a TypeError from indexing None
if result is None:
    raise ValueError("Artist search returned no match; cannot list top tracks.")
artist_id = result["id"]
print(result['name'], "Top Tracks")
songs = get_songs_by_artist(token, artist_id)
# number the tracks starting at 1 for display
for idx, song in enumerate(songs):
    print(f"{idx + 1}. {song['name']}")

Of Monsters and Men Top Tracks
1. Little Talks
2. Dirty Paws
3. Mountain Sound
4. King And Lionheart
5. Crystals
6. Love Love Love
7. Alligator
8. Wolves Without Teeth
9. Human
10. Empire


## PROJECT PLAN
**Problem:** Given a song, recommend 5 new songs.<br><br>
**Solution:**
1. Build and train the model
    1. Use the Spotify Web API's *Get Tracks' Audio Features* to get the features of tracks.
    2. Perform exploratory data analysis to find general patterns, missing values, identify outliers, etc.
    3. Perform feature engineering to remove irrelevant data, combine data, and mutate data.
    4. Use Spotify Web API's *Get Recommendations* as labeled data.
    5. Train the model.<br><br>
2. Test the model
    1. Input a song.
    2. Use the Spotify Web API to get the track's audio features.
    3. Check the proximity of model's recommended songs vs Spotify's recommended songs.<br><br>
3. Use the model
    1. Input a song.
    2. Enjoy the 5 recommended songs :)

## EDA and Feature Engineering

#### Load in data from Kaggle dataset

In [6]:
# load in the data and display first track
# (the relative path means spotify_data.csv is looked up in the current
# working directory)
all_song_data = pd.read_csv("spotify_data.csv")
# last expression of the cell: shows the first row so the column layout can be inspected
all_song_data.head(1)

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3


In [7]:
# the first column only holds a saved row index, so discard it
# NOTE: a second index-like column ("Unnamed: 0") is still present afterwards
all_song_data = all_song_data.drop(columns=all_song_data.columns[0])
all_song_data.head(1)

Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3


In [8]:
# use artist id from earlier to check format of genre
# (token and artist_id were set in the earlier "play around" cell)
artist_genre = get_artist(token, artist_id)['genres']
print(artist_genre)
# look up a second artist, one that appears in the Kaggle data, for comparison
# NOTE(review): search_for_artist returns None when nothing matches, which
# would make the ['id'] lookup fail; unguarded here
jason_mraz = search_for_artist(token, "Jason Mraz")['id']
jason_genre = get_artist(token, jason_mraz)['genres']
print(jason_genre)
# distinct genre labels used by the Kaggle dataset
kaggle_genres = list(all_song_data['genre'].unique())
print(kaggle_genres)
# spot-check that a plain genre label is present in the Kaggle list
print('dance' in kaggle_genres)

['folk-pop', 'metropopolis', 'modern rock', 'stomp and holler']
['acoustic pop', 'dance pop', 'neo mellow', 'pop']
['acoustic', 'afrobeat', 'alt-rock', 'ambient', 'black-metal', 'blues', 'breakbeat', 'cantopop', 'chicago-house', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'guitar', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'house', 'indian', 'indie-pop', 'industrial', 'jazz', 'k-pop', 'metal', 'metalcore', 'minimal-techno', 'new-age', 'opera', 'party', 'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house', 'psych-rock', 'punk', 'punk-rock', 'rock', 'rock-n-roll', 'romance', 'sad', 'salsa', 'samba', 'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep', 'songwriter', 'soul', 'spanish', 'swedish', 't

#### Manually reduce the number of genres

In [13]:
# list of genres to remove
remove_genres = []
# loop through all genres
for genre in kaggle_genres:
    # print the pop / indie variants so they can be reviewed by hand
    if "pop" in genre:
        print('pop:', genre)
    if "indie" in genre:
        print('indie:', genre)
    # subgenres of metal and rock are reduced to just "metal" / "rock";
    # one combined test adds a genre at most once, even when its name contains
    # both words, so the removal step never looks for the same entry twice
    is_metal_subgenre = "metal" in genre and genre != "metal"
    is_rock_subgenre = "rock" in genre and genre != "rock"
    if is_metal_subgenre or is_rock_subgenre:
        remove_genres.append(genre)

# remove the genres in remove_genres (in place, keeping the original order)
kaggle_genres[:] = [genre for genre in kaggle_genres if genre not in remove_genres]

print(kaggle_genres)


pop: cantopop
pop: indie-pop
indie: indie-pop
pop: k-pop
pop: pop
pop: pop-film
pop: power-pop
['acoustic', 'afrobeat', 'ambient', 'blues', 'breakbeat', 'cantopop', 'chicago-house', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'deep-house', 'detroit-techno', 'disco', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'guitar', 'hardcore', 'hardstyle', 'hip-hop', 'house', 'indian', 'indie-pop', 'industrial', 'jazz', 'k-pop', 'metal', 'minimal-techno', 'new-age', 'opera', 'party', 'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house', 'punk', 'rock', 'romance', 'sad', 'salsa', 'samba', 'sertanejo', 'show-tunes', 'singer-songwriter', 'ska', 'sleep', 'songwriter', 'soul', 'spanish', 'swedish', 'tango', 'techno', 'trance', 'trip-hop']


#### One Hot Encode the Genres in Kaggle Dataset

#### Find difference in genres between Kaggle dataset and Spotify's genres

In [8]:
# The comparison of Kaggle genres against Spotify's genre seeds is disabled by
# wrapping it in a string literal, so nothing inside the quotes is executed.
# NOTE(review): if re-enabled, it reassigns kaggle_genres from the dataframe,
# which would discard any manual reduction applied to that list beforehand.
'''# variables with the genres from kaggle and genres from spotify
spotify_genres = get_genres(token)['genres']
kaggle_genres = list(all_song_data['genre'].unique())

difference = list(set(kaggle_genres) - set(spotify_genres))
print("Genres in Kaggle dataset but not in Spotify dataset:", len(difference))
difference2 = list(set(spotify_genres) - set(kaggle_genres))
print("Genres in Spotify dataset but not in Kaggle dataset:", len(difference2))'''
# temporarily disabled because requests to the Spotify API are currently timing out
# placeholder assignment; presumably only here so the cell still has a statement to run
peng = 1

#### Get recommendations for all tracks in Kaggle dataset
This did not work: the loop issues roughly one million GET requests (one per track) to the Spotify API, which started answering with HTTP 429 (Too Many Requests) and blocked any further requests.<br><br>
I'll try this again later on a smaller scale!

In [9]:
# The bulk recommendation loop is disabled by wrapping it in a string literal,
# so nothing inside the quotes is executed.
# NOTE(review): get_recommendations already returns the response's 'tracks'
# list, so the extra ['tracks'] index inside the disabled code would fail if
# the cell were re-enabled as written.
'''# begin process of getting recommendations for all 1 million tracks
for trackid in all_song_data['track_id']:
    rec_track = get_recommendations(token, trackid)['tracks']
    for track in rec_track:
        rec_id = track['id']

# split data into train data and test data
from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split()'''
# placeholder assignment; presumably only here so the cell still has a statement to run
jeffrey = 1