In [1]:
# modules we'll use

import numpy as np
import pandas as pd
import gensim

#mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

from preprocessing.label_binarizer import DataFrameLabelBinarizer
from preprocessing.mean_embedding_vector import MeanEmbeddingVectorizer



In [2]:
# Fix NumPy's global random seed first so that the random row previews
# further down show the same rows on every run.
np.random.seed(0)

# Load the complete Spotify tracks dataset from CSV.
# (When running from Google Drive the file is at
#  /content/drive/MyDrive/Praca Inżynierska/kickstarter/data/spotify.csv)
spotify_orig = pd.read_csv("../data/spotify.csv")

In [3]:
# Preview five randomly chosen rows of the freshly loaded Spotify data.
spotify_orig.sample(n=5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
107349,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,0.0,11,0.0843,-3.506,1,Close Your Eyes,50,2013-12-10,0.0322,143.952
16271,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,0.0,6,0.103,-4.297,0,99 Problems,61,2003-11-14,0.398,89.554
90972,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,0.0,9,0.379,-4.124,1,Raised On It,54,2014-10-27,0.0409,94.02
84553,0.475,1981,0.000473,['Iron Maiden'],0.34,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,0.0928,0,0.373,-5.114,1,Drifter - 2015 Remaster,29,1981-02-02,0.106,101.276
75895,0.55,1930,0.994,"['Markos Vamvakaris', 'Apostolos Xatzixristos']",0.41,197653,0.169,0,38PozVGXXoeO8dTEVzy74Y,0.901,2,0.113,-18.862,1,Soultana maurofora,0,1930-01-01,0.0391,93.89


In [4]:
# Summarise the DataFrame: index range, column names, non-null counts and dtypes.
spotify_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [5]:
# One-hot encode the categorical 'key' column using the project's
# DataFrameLabelBinarizer helper; the frame returned by encode() replaces
# spotify_orig, so this cell should be run once per fresh load.
lb_category = DataFrameLabelBinarizer(
    data_frame=spotify_orig,
    column_to_encode='key',
)
spotify_orig = lb_category.encode()

# Preview five random rows of the encoded frame.
spotify_orig.sample(n=5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,2,3,4,5,6,7,8,9,10,11
96559,0.625,1959,0.682,['Martin Denny'],0.427,142400,0.411,0,7niPihgrMNksket1T0ZbKa,0.41,...,0,0,0,0,0,0,0,0,0,0
153920,0.515,2012,0.0362,"['Rihanna', 'Chris Brown']",0.741,216293,0.6,1,0qJWmTaT1qvCq0brgx8k2P,0.0,...,0,0,0,0,0,0,0,0,0,0
77383,0.0661,1945,0.914,"['Gustav Mahler', 'Bruno Walter', 'New York Ph...",0.261,1097547,0.243,0,28sMV7kIkozKg1sc33Cvmf,0.917,...,0,0,0,0,0,0,0,0,0,0
58470,0.0395,1933,0.955,"['Johann Sebastian Bach', 'Albert Schweitzer']",0.174,202503,0.0719,0,3uHoDBPDUYKDBv5lezfCxS,0.853,...,0,0,0,0,0,0,0,0,0,1
115209,0.825,1971,0.155,['Yusuf / Cat Stevens'],0.732,212667,0.653,0,5PUP1Qicfa9rMgxAkUahIC,0.000506,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Convert 'release_date' to Unix epoch time in *seconds* (float64).
# Dates before 1970-01-01 come out negative.
#
# The offset from the epoch is divided by a one-second Timedelta instead of
# casting the datetimes to int64 and dividing by 10**9: the cast silently
# assumes nanosecond resolution, whereas this form yields seconds whatever
# resolution pandas chose for the parsed datetimes.
#
# Note: the column is overwritten in place, so this cell is meant to be run
# once per fresh load of the data.
spotify_orig['release_date'] = (
    pd.to_datetime(spotify_orig['release_date'], format="%Y-%m-%d")
    - pd.Timestamp("1970-01-01")
) / pd.Timedelta(seconds=1)

# Preview five random converted values of the Spotify 'release_date' column.
spotify_orig['release_date'].sample(5)

39904    -1.167610e+09
143595   -4.102272e+08
76669    -9.763200e+08
141674   -7.258464e+08
143153   -4.441824e+08
Name: release_date, dtype: float64

In [7]:
# Turn the textual 'name' column into numeric embedding features.

# Load the pre-trained track-name word vectors (binary word2vec format).
# NOTE(review): this path is relative to ./data while the CSV above is read
# from ../data — confirm both locations resolve from the notebook's directory.
names_w2v = gensim.models.KeyedVectors.load_word2vec_format(
    fname="./data/spotify_track_names_embedding.bin",
    binary=True,
)

# Project helper; presumably averages the word vectors of each name —
# verify in preprocessing.mean_embedding_vector.
embedding_vectorizer = MeanEmbeddingVectorizer(names_w2v)

# NOTE(review): the preview after this step shows integer-labelled embedding
# columns (..., 290-299), and the one-hot 'key' step also produced
# integer-labelled columns (..., 2-11) — check that the labels do not collide.
spotify_orig = embedding_vectorizer.transform(
    data_frame=spotify_orig,
    column_to_encode='name',
)

In [8]:
# Inspect the first 30 rows of the transformed Spotify frame, which now
# carries the name-embedding columns.
spotify_orig.head(n=30)


Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,290,291,292,293,294,295,296,297,298,299
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,...,0.044609,0.002701,0.082216,0.066865,0.007549,0.057522,0.084335,0.024367,-0.091546,0.055006
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,...,0.078323,0.075364,0.036391,0.035745,0.059904,0.150445,0.010158,0.032958,-0.002272,0.11125
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,...,0.039865,0.064271,-0.030167,0.000964,0.05732,0.167101,-0.007649,-0.022731,-0.009534,0.102913
5,0.196,1921,0.579,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.697,395076,0.346,0,4pyw9DVHGStUre4J6hPngr,0.168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.406,1921,0.996,['John McCormack'],0.518,159507,0.203,0,5uNZnElqOS3W4fRmRYPk4T,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0731,1921,0.993,['Sergei Rachmaninoff'],0.389,218773,0.088,0,02GDntOXexBFUvSgaXLPkd,0.527,...,0.038651,0.016879,0.099215,0.036551,0.0309,0.036561,0.048462,0.009621,-0.089695,0.026516
8,0.721,1921,0.996,['Ignacio Corsini'],0.485,161520,0.13,0,05xDjWH9ub67nJJk82yfGf,0.151,...,0.015453,0.011147,0.120185,-0.004982,0.058531,0.013018,0.006694,-0.027594,-0.083273,-0.042136
9,0.771,1921,0.982,['Fortugé'],0.684,196560,0.257,0,08zfJvRLp7pjAb94MA9JmF,0.0,...,-0.006613,-0.026098,0.137225,0.080433,0.010241,-0.003989,0.035694,-0.022774,-0.046326,0.012025
