In [1]:
# Standard Imports

import numpy as np
import pandas as pd
import os
import sys
from collections import defaultdict
from importlib import reload
from bs4 import BeautifulSoup
import requests

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


# Load MongoDB: open a client and grab the handle that every later cell queries.

from pymongo import MongoClient
# No connection arguments are passed, so pymongo's default host/port are used.
client = MongoClient()
# Access/Initiate Database
db = client['producer_db']
# Access/Initiate Table
tab = db['songs']
# NOTE(review): `db.tab` is attribute-style access and selects the collection
# literally named "tab" -- it does not use the `tab` variable above (which points
# at "songs" and is not used again in the visible cells). The queries below do
# return documents from it, so the data appears to live in "tab"; confirm which
# collection is intended before changing either line.
collection = db.tab

# Authorize Spotify API using the client-credentials flow.

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials come from the environment rather than being hard-coded; a missing
# variable raises KeyError here.
client_id = os.environ['SPOTIFY_CLIENT_ID']
client_secret = os.environ['SPOTIFY_CLIENT_SECRET']
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# `sp` is the authenticated client; it is not used by the cells visible in this section.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [2]:
# How many audio-analysis segments does the first stored song have?
len(collection.find_one()['audio_analysis']['segments'])

498

### Gameplan: 

Make a `timbre` array of shape (100, 12) for every song by taking the 12 timbre coefficients of each of the first 100 segments. Make a dictionary where...

In [3]:
# Quick experiment: can a DataFrame column hold whole numpy arrays as cell values?
test_dict = {
    'colA': [1, 2, 3],
    'colB': [4, 5, 6],
    'colC': ['now', 'we are', 'done'],
    # Three separate (not shared) arrays, one per row.
    'colD': [np.array([1, 2, 3]) for _row in range(3)],
}

# The first cell of the array-valued column should come back as an ndarray.
pd.DataFrame(test_dict)['colD'][0]

array([1, 2, 3])

In [4]:
# Pull the metadata fields of the first stored song.
# The document is fetched once and reused: calling find_one() per field costs one
# database round trip each and is not guaranteed to return the same document.
first_song = collection.find_one()
track = first_song['track']
artist = first_song['artist']
producer = first_song['producer']

In [5]:
# Collect the timbre vectors of the first 100 segments of the first stored song.
# The document is fetched once, outside the loop; the previous version issued a
# separate find_one() query for every one of the 100 segments.
first_song_segments = collection.find_one()['audio_analysis']['segments']

# Explicit indexing is kept so a song with fewer than 100 segments still raises
# IndexError instead of silently producing a shorter array.
timbre_list = [first_song_segments[i]['timbre'] for i in range(100)]

# One row per segment, one column per timbre coefficient.
timbre = np.stack(timbre_list)

In [6]:
# Length of the per-song feature vector once the stacked timbre rows are flattened to 1-D.
timbre.flatten().shape

(1200,)

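Let's see if we can use PCA to distinguish between songs produced by George Martin (count: 130) and Rick Rubin (count: 155). Note: the query below actually returns 213 George Martin rows, so the 130 figure should be re-checked.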

In [7]:
# Query the songs of each producer. `find` returns a cursor, not a list:
# NOTE(review): a pymongo cursor can be iterated only once unless it is rewound,
# so re-run this cell before re-running any cell that loops over `gm` / `rr`.
gm = collection.find({'producer':'George Martin'})
rr = collection.find({'producer':'Rick Rubin'})

In [8]:
# Build one row per song for the two producers: track / artist / producer metadata
# plus a flat timbre feature vector taken from the song's first N_SEGMENTS segments.
N_SEGMENTS = 80  # segments kept per song; each contributes one timbre vector

tracks = []
artists = []
producers = []
timbres = []
collections = [gm, rr]

# The loop variable is deliberately not named `collection`: that name is the MongoDB
# collection handle used by the earlier cells, and rebinding it to a cursor here made
# those cells fail when re-run.
for cursor in collections:
    # Cursors are single-pass. Rewinding is a no-op on a fresh cursor and lets this
    # cell be re-run without silently producing an empty DataFrame.
    cursor.rewind()
    for song in cursor:
        track = song['track']
        artist = song['artist']
        producer = song['producer']

        segments = song['audio_analysis']['segments']
        # Explicit indexing keeps the IndexError for songs with fewer than N_SEGMENTS
        # segments, so every stored vector has the same length.
        timbre_list = [segments[i]['timbre'] for i in range(N_SEGMENTS)]
        # Join the per-segment vectors end to end into a single 1-D array.
        timbre = np.concatenate(timbre_list, axis=0)

        tracks.append(track)
        artists.append(artist)
        producers.append(producer)
        timbres.append(timbre)

# Column order matches the insertion order of this dict.
producer_dict = {
    'track': tracks,
    'artist': artists,
    'producer': producers,
    'timbre': timbres,
}

producer_df = pd.DataFrame(producer_dict)

# NOTE(review): in the preview below several different tracks start with identical
# timbre values -- check whether the stored audio analysis is duplicated across songs.
producer_df.head()

Unnamed: 0,track,artist,producer,timbre
0,12-Bar Original,the Beatles,George Martin,"[0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ..."
1,Across the Universe,the Beatles,George Martin,"[0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ..."
2,Act Naturally,Buck Owens and the Buckaroos,George Martin,"[41.561, 196.927, 110.997, 28.169, 47.868, -31..."
3,Alfie,Cilla Black,George Martin,"[0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ..."
4,All I've Got to Do,the Beatles,George Martin,"[0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ..."


In [9]:
# Rows per producer -- shows how unbalanced the two classes are before resampling.
producer_df['producer'].value_counts()

George Martin    213
Rick Rubin       155
Name: producer, dtype: int64

In [12]:
# Check column dtypes and confirm there are no missing values in any column.
producer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 4 columns):
track       368 non-null object
artist      368 non-null object
producer    368 non-null object
timbre      368 non-null object
dtypes: object(4)
memory usage: 11.6+ KB


In [16]:
# Class balance a df: keep every Rick Rubin row and downsample George Martin to match.

df2 = producer_df[producer_df['producer'] == "Rick Rubin"]
# The sample size is taken from the Rick Rubin rows instead of a hard-coded 155, so
# the classes stay balanced if the underlying data changes. With the current 155
# Rick Rubin rows and the same seed, this draws exactly the same sample as before.
# Sampling is without replacement, so it raises ValueError if George Martin has
# fewer rows than Rick Rubin.
df = producer_df[producer_df['producer'] == "George Martin"].sample(len(df2), random_state = 137)

In [24]:
# Stack the two balanced halves and add the binary label:
# 1 = George Martin, 0 = anything else (here: Rick Rubin).
data = pd.concat([df, df2], axis=0).assign(
    target=lambda frame: (frame['producer'] == 'George Martin').astype(int)
)

In [25]:
# Peek at both ends of the combined frame; as a tuple, the two frames render as plain text.
data.head(), data.tail()

(                                    track       artist       producer  \
 125                              No Reply  the Beatles  George Martin   
 126  Norwegian Wood (This Bird Has Flown)  the Beatles  George Martin   
 173                                Taxman  the Beatles  George Martin   
 174                  Tell Me What You See  the Beatles  George Martin   
 102                             Let It Be  the Beatles  George Martin   
 
                                                 timbre  target  
 125  [0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ...       1  
 126  [0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ...       1  
 173  [0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ...       1  
 174  [0.0, 171.13, 9.469, -28.48, 57.491, -50.067, ...       1  
 102  [27.339, -115.081, -4.784, 13.866, 53.215, 100...       1  ,
                                             track                 artist  \
 363                   You Don't Know How It Feels              Tom Petty   
 36

In [26]:
# Set up a test train split. 

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [40]:
# Feature matrix: one row per song, built by stacking the per-song timbre vectors.
X = np.stack(list(data['timbre']), axis=0)
# Label vector aligned row-for-row with X.
y = data['target'].to_numpy()

In [41]:
# Hold out 30% of the songs for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=137,
)

In [42]:
# Sanity-check the split sizes and the feature dimension.
X_train.shape, X_test.shape

((217, 960), (93, 960))