In [36]:
import json
from tqdm.notebook import tqdm
import psycopg2
import psycopg2.extras
from pprint import pprint
import datetime
from numpy import interp
from matplotlib import pyplot
import pandas as pd
import pandas.io.sql as sqlio
from pandas.plotting import scatter_matrix
from matplotlib.pyplot import figure
import copy
import matplotlib.pyplot as plt
import collections
import requests

pd.set_option('mode.chained_assignment', None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import export_graphviz

In [3]:
def resultsAsDictArray(results, colnames):
    return_array = []
    for result in tqdm(results):
        d = {}
        for col in colnames:
            d[col] = result[colnames.index(col)]
        return_array.append(d)
    return return_array

In [4]:
conn = psycopg2.connect(user='postgres', database="postgres", password='password', host='localhost', port='5432')
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

In [5]:
cursor.execute('SELECT * FROM music')
results = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
musics = resultsAsDictArray(results, colnames)

  0%|          | 0/697510 [00:00<?, ?it/s]

In [6]:
cursor.execute('SELECT * FROM music_artist')
results = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
musics_artists_raw = resultsAsDictArray(results, colnames)

  0%|          | 0/1127195 [00:00<?, ?it/s]

In [7]:
musics_artists = {}
for music_artist in musics_artists_raw:
    music_id = music_artist['music_id']
    if music_id not in musics_artists:
        musics_artists[music_id] = []
    musics_artists[music_id].append(music_artist['artist_id'])

In [8]:
cursor.execute("""
SELECT * 
FROM artist a 
INNER JOIN genre_artist ga ON a.artist_id = ga.artist_id
INNER JOIN genre g ON g.genre_id = ga.genre_id
INNER JOIN genre_category gc ON gc.genre_id = g.genre_id
INNER JOIN category c ON c.category_id = gc.category_id
""")
results = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
artists_raw = resultsAsDictArray(results, colnames)

  0%|          | 0/12574 [00:00<?, ?it/s]

In [9]:
artists_multiple = {}
for artist in artists_raw:
    artist_id = artist['artist_id']
    if artist_id not in artists_multiple:
        artists_multiple[artist_id] = []
    artists_multiple[artist_id].append(artist)
#     pprint(artist)
#     break

In [10]:
artists = {}
for artist in tqdm(artists_multiple):
    for i in artists_multiple[artist]:
        if i['artist_id'] not in artists:
            artists[i['artist_id']] = copy.deepcopy(i)
            artists[i['artist_id']]['genres'] = set()
            artists[i['artist_id']]['categories'] = set()
            artists[i['artist_id']].pop('category_id', None)
            artists[i['artist_id']].pop('category', None)
            artists[i['artist_id']].pop('genre_artist_id', None)
            artists[i['artist_id']].pop('genre_category_id', None)
            artists[i['artist_id']].pop('genre_id', None)
            artists[i['artist_id']].pop('genre', None)
        artists[i['artist_id']]['genres'].add(i['genre'])
        artists[i['artist_id']]['categories'].add(i['category'])
#     pprint(artists)
#     break

  0%|          | 0/3380 [00:00<?, ?it/s]

In [11]:
cursor.execute('SELECT * FROM category')
results = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
categories = resultsAsDictArray(results, colnames)

  0%|          | 0/21 [00:00<?, ?it/s]

In [12]:
cursor.execute('SELECT * FROM genre')
results = cursor.fetchall()
colnames = [desc[0] for desc in cursor.description]
genres = resultsAsDictArray(results, colnames)

  0%|          | 0/1583 [00:00<?, ?it/s]

In [14]:
for music in tqdm(musics):
    music_id = music['music_id']
    for category in categories:
        music[category['category']] = 0
    list_artist = []
    for music_artist in musics_artists.get(music_id, []):
        artist = artists.get(music_artist, '')
        if artist == '':
            continue
        for category in list(artist['categories']):
            music[category] = 1
        music['genres'] = ', '.join(list(artist['genres']))
        list_artist.append(artist['name'])
        
    music['artists'] = ', '.join(list_artist)
#     pprint(music)
#     break

  0%|          | 0/697510 [00:00<?, ?it/s]

In [15]:
df = pd.DataFrame.from_records(musics)

In [16]:
df_filtered = df.drop(columns=['name', 'duration_ms', 'mode', 'time_signature', 'genres', 'artists'])

df_filtered.insert(2, 'like_rate_temp', df_filtered['like_rate'])
df_filtered.drop('like_rate', axis=1, inplace=True) 
df_filtered.rename(columns={'like_rate_temp': 'like_rate'}, inplace=True)

df_filtered = df_filtered.reset_index()
df_filtered = df_filtered.drop(columns=['index'])



df_filtered

Unnamed: 0,music_id,music_spotify_uri,like_rate,popularity,danceability,energy,music_key,loudness,speechiness,acousticness,...,Reggae,Nacional,Funk,Soundtrack,Punk,Country,Classical,Instrumental,Sertanejo,Lo-fi
0,8040,spotify:track:0awxPw2WkYJVzcSKWvJZAh,,45.0,0.631,0.534000,2.0,-6.783,0.0848,0.5050,...,0,0,0,0,0,0,0,0,0,0
1,8041,spotify:track:4ToSbnK7fD5vE9R5sgWPsy,,0.0,0.536,0.762000,8.0,-6.749,0.0427,0.2700,...,0,0,0,0,0,0,0,0,0,0
2,8042,spotify:track:5GW2n1h3BNYnpU7lK23iKv,,0.0,0.695,0.705000,7.0,-5.493,0.0243,0.3140,...,0,0,0,0,0,0,0,0,0,0
3,8043,spotify:track:3M0pROlXyH0bzh9CNJhDOJ,,0.0,0.656,0.572000,4.0,-8.162,0.0319,0.6530,...,0,0,0,0,0,0,0,0,0,0
4,8044,spotify:track:1uGAMOMKqPPYxAllZ1PV4s,,0.0,0.696,0.417000,9.0,-9.624,0.0270,0.8260,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697505,8035,spotify:track:3Z9Kaszz1KKOdWhP7Q8fVR,,0.0,0.000,0.000278,5.0,-32.668,0.0000,0.0219,...,0,0,0,0,0,0,0,0,0,0
697506,8036,spotify:track:20FREPNcSl4TnZEt1GAYSB,,0.0,0.717,0.807000,8.0,-8.348,0.0632,0.0912,...,0,0,0,0,0,0,0,0,0,0
697507,8037,spotify:track:47uRsTBT1Jb93QSvY5C12O,,0.0,0.670,0.685000,2.0,-5.486,0.0510,0.0625,...,0,0,0,0,0,0,0,0,0,0
697508,8038,spotify:track:6vzhxMkDnTdRURQzqI1p0L,,0.0,0.814,0.642000,0.0,-6.394,0.0951,0.1320,...,0,0,0,0,0,0,0,0,0,0


In [78]:
endpoint = "https://api.spotify.com/v1/me"
data = {}
headers = {"Authorization": "Bearer BQDoEi1eiOBoWbSnToZCyMbvmeeXZY8L7756nGiXTsGyLmzZOpYUCPGMAO_2Rz95ENbtFtRIxqyAz-e4F-JMllA-UcD2p-w0zjKg9fOiHQa1jPYD55IWzHiHnd7mNw-IwIKvf7vYf5I3aZ9ZWCG5g0wqXgXc5Mrt-bGaSNyHzDZp8UOT7iTUwoRp93y1gFWFEkYyVOwmmjM46XqmxOqWb1VUWlpHMJ6LwC3ihddoTX_YU3JqW8d7JBkZEPxpkIb0fQaExOwtTyVdv5GGcSG9WnLqjg"}

response = requests.get(endpoint, data=data, headers=headers).json()
me = response

In [79]:
def createPlaylist(name, description):
    endpoint = "https://api.spotify.com/v1/users/" + me['id'] + "/playlists"
    data = {
      "name": name,
      "description": description,
      "public": "false"
    }

    response = requests.post(endpoint, data=json.dumps(data), headers=headers).json()
    #print(response)
    return response

def addToPlaylist(songList, playlist_id):
    if len(songList) > 99:
        for k in range(0, len(songList), 99):
            kmin = k
            kmax = kmin + 98
            if kmax > len(songList):
                kmax = len(songList)
            addToPlaylist(songList[kmin:kmax], playlist_id)
            print(str(kmin) + '/' + str(kmax))
        return
    
    endpoint = "https://api.spotify.com/v1/playlists/" + playlist_id + "/tracks"
    data = {
      "uris": songList
    }

    response = requests.post(endpoint, data=json.dumps(data), headers=headers).json()
    return response

In [81]:
date = datetime.date.today()
time = datetime.datetime.now()
name = '%04d-%02d-%02d %02d:%02d' % (date.year, date.month, date.day, time.hour, time.minute)

for cat_raw in tqdm(categories):
    category = cat_raw['category']
    
    df_category = df_filtered[df_filtered[category] == 1]
    print(category + ': ' + str(len(df_category)) + ' songs')
    
    df_train = df_category[df_category['like_rate'].notna()]
    df_train['like_rate'][df_train['like_rate'] > 0.35] = 1
    df_train['like_rate'][df_train['like_rate'] <= 0.35] = 0
    df_train
    
    array = df_train.values
    X = array[:,3:]
    y = array[:,2]
    y = y.astype('int')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    dtc = DecisionTreeClassifier(random_state=42)
    dtc.fit(X_train, y_train)
    print('\tFirst Test: ' + str(dtc.score(X_test, y_test)))
    
    dtc = DecisionTreeClassifier(criterion='entropy', max_features=10, random_state=42, max_leaf_nodes=150, max_depth=7, min_samples_split=100)
    dtc.fit(X_train, y_train)
    print('\tSecond Test: ' + str(dtc.score(X_test, y_test)))
    
    bag = BaggingClassifier(dtc, n_estimators=100, max_samples=0.8, random_state=42)
    bag.fit(X_train, y_train)
    print('\tThird Test: ' + str(bag.score(X_test, y_test)))
    
    df_try = df_category[df_category['like_rate'].isna()]
    df_try = df_try.reset_index()
    df_try = df_try.drop(columns=['index'])

    array = df_try.values
    X = array[:,3:]
    
    music_note = {}
    for i in range(0, len(X), 1000):
        print('\tTraining: ' + str(i) + '/' + str(len(X)) + '(' + str(int(i * 100.0 / len(X))) + '%)', end='\r')
        max_i = i + 1000
        if max_i >= len(X):
            max_i = len(X) - 2
        try:
            predicted = bag.predict(X[i:max_i])
            for index, value in enumerate(predicted):
                music_note[df_try.loc[i + index]['music_spotify_uri']] = value
        except:
            pass
            
    music_note_sorted = dict(sorted(music_note.items(), key=lambda item: item[1], reverse=True)) 
    print('\t\t' + str(collections.Counter(music_note_sorted.values())))
    
    to_print = []
    for music in music_note_sorted:
        if music_note_sorted[music] == 1:
            to_print.append(music)
        
    if len(to_print) == 0:
        continue
    # Create Playlist
    playlist = createPlaylist(name + ' ' + category.upper() +  ' [' + str(len(to_print)) + ']', '')
    
    # Add songs
    response = addToPlaylist(to_print, playlist['id'])
#     break

  0%|          | 0/21 [00:00<?, ?it/s]

Metal: 10079 songs
	First Test: 0.7058823529411765
	Second Test: 0.8431372549019608
	Third Test: 0.8431372549019608
		Counter({0: 9907})(90%)
Rock: 125994 songs
	First Test: 0.6923076923076923
	Second Test: 0.8168498168498168
	Third Test: 0.8168498168498168
		Counter({0: 123979, 1: 21})
Pop: 218708 songs
	First Test: 0.702725724020443
	Second Test: 0.7921635434412265
	Third Test: 0.7853492333901193
		Counter({0: 213664, 1: 336})
0/98
99/197
198/296
297/336
Indie: 79704 songs
	First Test: 0.6983122362869199
	Second Test: 0.7848101265822784
	Third Test: 0.7932489451476793
		Counter({0: 77978, 1: 22})
Rap: 59998 songs
	First Test: 0.8401639344262295
	Second Test: 0.860655737704918
	Third Test: 0.9098360655737705
		Counter({0: 59000})6(99%)
R&b: 52985 songs
	First Test: 0.7416267942583732
	Second Test: 0.8038277511961722
	Third Test: 0.8038277511961722
		Counter({0: 52000})1(99%)
Hip hop: 47881 songs
	First Test: 0.8171428571428572
	Second Test: 0.8685714285714285
	Third Test: 0.8685714285

In [51]:
# name = str(date.year) + '-' + str(date.month) + '-' + str(date.day) + ' ' + str(time.hour) + ':' + str(time.minute)
print(name)

2021-08-03 12:50
