In [1]:
# modules we'll use
from collections import Callable

import numpy as np
import pandas as pd
import gensim
import tensorflow as tf
import sys

from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

#mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)
#sys.path.insert(0,'/content/drive/MyDrive/Praca_Inżynierska/kickstarter/code')
from genetic_algorithm.crossover import OnePointCrossover
from genetic_algorithm.gene import FloatGene, Gene, IntegerGene
from genetic_algorithm.genetic_algorithm import GeneticAlgorithm
from genetic_algorithm.genome import Genome, LabeledSequence
from genetic_algorithm.initialization import RandomInitialization
from genetic_algorithm.mutation import RandomMutation
from genetic_algorithm.parent_selection import TournamentSelection

from preprocessing.label_binarizer import DataFrameLabelBinarizer
from plotting.loss_plotter import LossPlotter
from plotting.real_multi_pred_plotter import RealMultiPredPlotter

  
Using TensorFlow backend.


In [2]:
# read in all our data
# Google Drive's path: /content/drive/MyDrive/Praca_Inżynierska/kickstarter/data/spotify.csv
spotify_orig  = pd.read_csv("../data/spotify.csv")
# set seed for reproducibility
np.random.seed(0)

In [3]:
# look at a few rows of the Spotify dataset file
spotify_orig.sample(5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
107349,0.817,2013,0.0158,['Parmalee'],0.551,214933,0.863,0,3Bdqlr7jQLNhITAgcBGQBG,0.0,11,0.0843,-3.506,1,Close Your Eyes,50,2013-12-10,0.0322,143.952
16271,0.548,2003,0.00661,['JAY-Z'],0.494,234627,0.887,1,7sLpSWxQazJzDVG6YGzlVs,0.0,6,0.103,-4.297,0,99 Problems,61,2003-11-14,0.398,89.554
90972,0.732,2014,0.0477,['Sam Hunt'],0.59,235507,0.94,0,3BuPop8SzLG2Q88TJcFAjp,0.0,9,0.379,-4.124,1,Raised On It,54,2014-10-27,0.0409,94.02
84553,0.475,1981,0.000473,['Iron Maiden'],0.34,288947,0.974,0,7EvjTEzuv7TWaIaWY63sWV,0.0928,0,0.373,-5.114,1,Drifter - 2015 Remaster,29,1981-02-02,0.106,101.276
75895,0.55,1930,0.994,"['Markos Vamvakaris', 'Apostolos Xatzixristos']",0.41,197653,0.169,0,38PozVGXXoeO8dTEVzy74Y,0.901,2,0.113,-18.862,1,Soultana maurofora,0,1930-01-01,0.0391,93.89


In [4]:
# get info about DataFrame columns
spotify_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

In [5]:
# convert 'key' to one-hot-encoded
lb_category = DataFrameLabelBinarizer(data_frame=spotify_orig, column_to_encode='key')
spotify_converted = lb_category.encode()

In [6]:
# convert release_date to milliseconds
spotify_converted['release_date'] = pd.to_datetime(spotify_converted['release_date'], format = "%Y-%m-%d")\
                                        .astype(np.int64) / 10**9

In [7]:
# drop redundant columns
spotify_reduced = spotify_converted.drop(['artists', 'name', 'year', 'id'], axis=1)

In [8]:
# get info about DataFrame columns
spotify_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   acousticness      170653 non-null  float64
 2   danceability      170653 non-null  float64
 3   duration_ms       170653 non-null  int64  
 4   energy            170653 non-null  float64
 5   explicit          170653 non-null  int64  
 6   instrumentalness  170653 non-null  float64
 7   liveness          170653 non-null  float64
 8   loudness          170653 non-null  float64
 9   mode              170653 non-null  int64  
 10  popularity        170653 non-null  int64  
 11  release_date      170653 non-null  float64
 12  speechiness       170653 non-null  float64
 13  tempo             170653 non-null  float64
 14  0                 170653 non-null  int32  
 15  1                 170653 non-null  int32  
 16  2                 17

In [9]:
# look at a few rows of the Spotify dataset
spotify_converted.sample(5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,...,2,3,4,5,6,7,8,9,10,11
96559,0.625,1959,0.682,['Martin Denny'],0.427,142400,0.411,0,7niPihgrMNksket1T0ZbKa,0.41,...,0,0,0,0,0,0,0,0,0,0
153920,0.515,2012,0.0362,"['Rihanna', 'Chris Brown']",0.741,216293,0.6,1,0qJWmTaT1qvCq0brgx8k2P,0.0,...,0,0,0,0,0,0,0,0,0,0
77383,0.0661,1945,0.914,"['Gustav Mahler', 'Bruno Walter', 'New York Ph...",0.261,1097547,0.243,0,28sMV7kIkozKg1sc33Cvmf,0.917,...,0,0,0,0,0,0,0,0,0,0
58470,0.0395,1933,0.955,"['Johann Sebastian Bach', 'Albert Schweitzer']",0.174,202503,0.0719,0,3uHoDBPDUYKDBv5lezfCxS,0.853,...,0,0,0,0,0,0,0,0,0,1
115209,0.825,1971,0.155,['Yusuf / Cat Stevens'],0.732,212667,0.653,0,5PUP1Qicfa9rMgxAkUahIC,0.000506,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# get index of 'popularity' column
popularity_index = spotify_reduced.columns.get_loc("popularity")

In [11]:
# convert to numpy_array
spotify_numpy = spotify_reduced.to_numpy()

In [12]:
# get X and y from numpy array
X = np.delete(arr=spotify_numpy, obj=popularity_index, axis=1)
y = spotify_numpy[:, popularity_index]

In [13]:
# count percent of y values above 0.5
np.count_nonzero(y > 0.5) / y.shape[0]

0.8365572243089778

In [14]:
# get train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [15]:
# train set size (train + valid)
X_train.shape[0]

119457

In [16]:
# test set size
X_test.shape[0]

51196

In [17]:
# count percent of y_train values above 0.5
np.count_nonzero(y_train > 0.5) / y_train.shape[0]

0.8375482391153302

In [18]:
# count percent of y_test values above 0.5
np.count_nonzero(y_test > 0.5) / y_test.shape[0]

0.8342448628799125

In [19]:
# create model function

def get_model(params: LabeledSequence[Gene]) -> keras.Sequential:
    model = keras.models.Sequential()

    model.add(layers.Input(shape=(X_train.shape[1],), name='inputs'))

    model.add(layers.Dense(units=params.get_by_label('hidden_1_units').value, kernel_initializer=keras.initializers.he_normal(),
                           bias_initializer=keras.initializers.he_normal(),
                           activation=keras.activations.elu, name='hidden_1'))
    model.add(layers.BatchNormalization(momentum=0.99))
    model.add(layers.Dropout(rate=params.get_by_label('hidden_1_dropout_rate').value))

    model.add(layers.Dense(units=params.get_by_label('hidden_2_units').value, kernel_initializer=keras.initializers.he_normal(),
                           bias_initializer=keras.initializers.he_normal(),
                           activation=keras.activations.elu, name='hidden_2'))
    model.add(layers.BatchNormalization(momentum=0.99))
    model.add(layers.Dropout(rate=params.get_by_label('hidden_2_dropout_rate').value))

    model.add(layers.Dense(units=1, activation=keras.activations.linear, name="outputs"))
    model.add(layers.BatchNormalization(momentum=0.99))

    return model

In [20]:
# train model
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.05)
# model.compile(optimizer=optimizer, loss=keras.losses.mse, metrics=[keras.metrics.mse, keras.metrics.mae,
#                                                                    keras.metrics.RootMeanSquaredError(name='rmse')])
# history=model.fit(X_train, y_train, epochs=10, batch_size=170, verbose=1, validation_split=0.2)

In [21]:
# train set size
int(X_train.shape[0]*0.8)

95565

In [22]:
# validation set size
int(X_train.shape[0]*0.2)

23891

In [23]:
class GeneticAlgorithmImpl(GeneticAlgorithm):

    def init_population(self, ):
        return RandomInitialization(self.population_size, self.genome).init_population()

    def select_parents(self, fitness):
        return TournamentSelection().select_parents(self.population, fitness)

    def crossover(self, parents):
        return OnePointCrossover().crossover(parents)

    def mutate(self, crossovers):
        return RandomMutation().mutate(crossovers)

In [24]:
labeled_sequence = LabeledSequence()
labeled_sequence.append(IntegerGene(label='hidden_1_units', minimum=20, maximum=200))
labeled_sequence.append(FloatGene(label='hidden_1_dropout_rate', minimum=0.1, maximum=0.5))
labeled_sequence.append(IntegerGene(label='hidden_2_units', minimum=10, maximum=100))
labeled_sequence.append(FloatGene(label='hidden_2_dropout_rate', minimum=0.1, maximum=0.5))
labeled_sequence.append(FloatGene(label='learning_rate', minimum=0.01, maximum=0.2))
labeled_sequence.append(IntegerGene(label='batch_size', minimum=100, maximum=200))

In [34]:
def rank(params: LabeledSequence[Gene]) -> float:
    optimizer = tf.keras.optimizers.Adam(learning_rate=params.get_by_label('learning_rate').value)
    model = get_model(params)
    model.compile(optimizer=optimizer, loss=keras.losses.mse, metrics=[keras.metrics.mse, keras.metrics.mae,
                                                                   keras.metrics.RootMeanSquaredError(name='rmse')])
    history=model.fit(X_train, y_train, epochs=5, batch_size=params.get_by_label('batch_size').value, verbose=1,
                      validation_split=0.2)
    return float(list(history.history.items())[5][1][4])

In [35]:
genome = Genome(genes=labeled_sequence, rank_funk=rank)
algorithm_impl = GeneticAlgorithmImpl(genome=genome, population_size=5, generation_count=1,
                                        best_last_generations_size=5)
algorithm_impl.calculate()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
dict_keys(['loss', 'mean_squared_error', 'mean_absolute_error', 'rmse', 'val_loss', 'val_mean_squared_error', 'val_mean_absolute_error', 'val_rmse'])
[('loss', [318.2067565917969, 119.25093078613281, 118.61714935302734, 117.72080993652344, 117.61836242675781]), ('mean_squared_error', [318.2067565917969, 119.25093078613281, 118.61714935302734, 117.72080993652344, 117.61836242675781]), ('mean_absolute_error', [12.711783409118652, 8.03632640838623, 8.001216888427734, 7.950716018676758, 7.937836647033691]), ('rmse', [17.838350296020508, 10.920207023620605, 10.89115047454834, 10.849922180175781, 10.845199584960938]), ('val_loss', [156.40872192382812, 113.98335266113281, 114.73445892333984, 112.3791732788086, 111.0982894897461]), ('val_mean_squared_error', [156.40872192382812, 113.98335266113281, 114.73445892333984, 112.3791732788086, 111.0982894897461]), ('val_mean_absolute_error', [9.575161933898926, 7.733787536621094, 7.552924156188965, 7.

<genetic_algorithm.genome.Genome at 0x1a1e0842bc8>

In [None]:
# save model
# model.save('models/spotify_v2_he_elu_adam_30_15')

In [None]:
# print history keys
# print(history.history.keys())

# print mse value from last epoch
# print(list(history.history.items())[1][1][9])

In [None]:
# loss_plotter = LossPlotter(history)
# loss_plotter.plot()

In [None]:
# Evaluate the model on the test data using `evaluate`

spotify_v2_he_elu_adam_20_10 = tf.keras.models.load_model('models/spotify_v2_he_elu_adam_20_10')
spotify_v2_he_elu_adam_20_10_results = spotify_v2_he_elu_adam_20_10.evaluate(X_test, y_test, batch_size=170)
print("spotify_v2_he_elu_adam_20_10 results:")
print(spotify_v2_he_elu_adam_20_10_results)

spotify_v2_he_elu_adam_30_15 = tf.keras.models.load_model('models/spotify_v2_he_elu_adam_30_15')
spotify_v2_he_elu_adam_30_15_results = spotify_v2_he_elu_adam_30_15.evaluate(X_test, y_test, batch_size=170)
print("spotify_v2_he_elu_adam_30_15 results:")
print(spotify_v2_he_elu_adam_30_15_results)

spotify_v2_he_elu_adam_40_20_10 = tf.keras.models.load_model('models/spotify_v2_he_elu_adam_40_20_10')
spotify_v2_he_elu_adam_40_20_10_results = spotify_v2_he_elu_adam_40_20_10.evaluate(X_test, y_test, batch_size=170)
print("spotify_v2_he_elu_adam_40_20_10 results:")
print(spotify_v2_he_elu_adam_40_20_10_results)

spotify_v2_he_elu_adam_40_20_10_5 = tf.keras.models.load_model('models/spotify_v2_he_elu_adam_40_20_10_5')
spotify_v2_he_elu_adam_40_20_10_5_results = spotify_v2_he_elu_adam_40_20_10_5.evaluate(X_test, y_test, batch_size=170)
print("spotify_v2_he_elu_adam_40_20_10_5 results:")
print(spotify_v2_he_elu_adam_40_20_10_5_results)

In [None]:
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 10 samples")

spotify_v2_he_elu_adam_20_10_predictions = spotify_v2_he_elu_adam_20_10.predict(X_test[:10])
spotify_v2_he_elu_adam_30_15_predictions = spotify_v2_he_elu_adam_30_15.predict(X_test[:10])
spotify_v2_he_elu_adam_40_20_10_predictions = spotify_v2_he_elu_adam_40_20_10.predict(X_test[:10])
spotify_v2_he_elu_adam_40_20_10_5_predictions = spotify_v2_he_elu_adam_40_20_10_5.predict(X_test[:10])

real = np.reshape(y_test[:10], (-1, 1))
real_pred = np.append(real, spotify_v2_he_elu_adam_20_10_predictions, axis=1)
real_pred = np.append(real_pred, spotify_v2_he_elu_adam_30_15_predictions, axis=1)
real_pred = np.append(real_pred, spotify_v2_he_elu_adam_40_20_10_predictions, axis=1)
real_pred = np.append(real_pred, spotify_v2_he_elu_adam_40_20_10_5_predictions, axis=1)
print("real/predictions")
print(real_pred)

In [None]:
# plot real/pred chart
pred_dict = {"spotify_v2_he_elu_adam_20_10":spotify_v2_he_elu_adam_20_10_predictions,
             "spotify_v2_he_elu_adam_30_15":spotify_v2_he_elu_adam_30_15_predictions,
             "spotify_v2_he_elu_adam_40_20_10":spotify_v2_he_elu_adam_40_20_10_predictions,
             "spotify_v2_he_elu_adam_40_20_10_5":spotify_v2_he_elu_adam_40_20_10_5_predictions}
real_pred_plot = RealMultiPredPlotter(real=real, pred_dict=pred_dict)
real_pred_plot.plot()
