In [34]:
import os
import glob 
from hdf5_getters import *
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler

In [38]:
def assemble_df(features: [str], basedir: str, ext: str = ".h5") -> pd.DataFrame:
    
    msd_dict = {feat:[] for feat in features}
    
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*'+ext))
        
        for f in files:
            hf = open_h5_file_read(f)
            
            for feat in features:
                info = eval("get_{}(hf)".format(feat))
                msd_dict["{}".format(feat)].append(info)
            
            hf.close()
        
    return pd.DataFrame(msd_dict)
    

In [39]:
msd_df = assemble_df(["title", "artist_name", "artist_hotttnesss", "artist_terms", 
                      "artist_terms_freq", "artist_terms_weight", "danceability",
                      "duration", "energy", "loudness", "tempo", "year"], 
                     "/home/malits/data/MillionSongSubset/data")

In [42]:
msd_df.to_csv("/home/malits/data/MillionSongsDataset.csv")

Next steps: 
- Clean up data (i.e., remove 'b' indicator)
- Scale values appropriately
- Create pipeline
- Column transformer - start with results on TFIDF for artist labels and see how far that gets
- summary stats maybe but also not necessary
- afterwards - do year prediction. Maybe some sort of sentiment analysis based off of song content

In [2]:
msd_df = pd.read_csv("/home/malits/data/MillionSongsDataset.csv")

In [3]:
msd_df.head(2)

Unnamed: 0.1,Unnamed: 0,title,artist_name,artist_hotttnesss,artist_terms,artist_terms_freq,artist_terms_weight,danceability,duration,energy,loudness,tempo,year
0,0,b'Le Precipice',b'Daara J',0.300735,[b'afrobeat' b'reggae' b'hip hop' b'rap' b'con...,[0.94831612 0.97681825 1. 0.75112764 0...,[1. 0.91655448 0.91356074 0.71026606 0...,0.0,261.61587,0.0,-7.727,91.005,0
1,1,b'Leaders Of Men',b'Joy Division',0.576295,[b'alternative' b'indie' b'gothic rock' b'guit...,[0.99228052 1. 0.42174574 0.39448353 0...,[1. 0.99996121 0.66981663 0.57741088 0...,0.0,141.08689,0.0,-6.021,150.04,1978


In [4]:
working_df = msd_df[['title', 'artist_name', 'artist_terms', 'duration', 'loudness', 'tempo', 'year']]

In [5]:
working_df.head(2)

Unnamed: 0,title,artist_name,artist_terms,duration,loudness,tempo,year
0,b'Le Precipice',b'Daara J',[b'afrobeat' b'reggae' b'hip hop' b'rap' b'con...,261.61587,-7.727,91.005,0
1,b'Leaders Of Men',b'Joy Division',[b'alternative' b'indie' b'gothic rock' b'guit...,141.08689,-6.021,150.04,1978


In [8]:
decades = [0, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]
working_df['decade'] = pd.cut(working_df["year"], decades)
working_df["valid_decade"] = (working_df["year"] > 0).apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [93]:
working_df = pd.concat((working_df, pd.get_dummies(working_df["decade"])), axis=1)

In [146]:
class KNN:
    
    def transform_data(self, X: pd.DataFrame) -> np.array:
            
        ct = ColumnTransformer(
            [
                ("term_tfidf", TfidfVectorizer(use_idf=True, lowercase=True), 'artist_terms'),
                ("duration_norm", MinMaxScaler(), ['duration']),
                ("loudness_norm", MinMaxScaler(), ['loudness']),
                ("tempo_norm", MinMaxScaler(), ['tempo']),
            ]
        )

        X_transformed = ct.fit_transform(X)
        
        one_hot_decades = np.array(X.loc[:, 'valid_decade':])
        
        feature_matrix = np.concatenate((X_transformed, one_hot_decades), axis=1)

        return feature_matrix
    
    def train_knn(self, X: pd.DataFrame, k: int = 5):
        self.data = X
        feature_matrix = self.transformed_data(X)
        
        knn = NearestNeighbors(n_neighbors=k)
        knn.fit(feature_matrix)
        
        self.knn = knn
        
    def get_neighbors(self, X: pd.DataFrame, k: int = 0): 
        query = transform_data(X)
        
        dist, ind = self.knn.kneighbors(query, n_neighbors=k)
        
        return self.data.iloc[ind]

In [147]:
X_train, X_test = train_test_split(working_df)