In [1]:
import os
import glob
import pandas as pd
import numpy as np 

# Seed NumPy's global random number generator so that code drawing from it
# is repeatable when the notebook is run top to bottom.
np.random.seed(20)

In [2]:
# Define working directory.
# The location can be overridden through the SPOTIFY_DATA_DIR environment
# variable so the notebook can run on another machine; the original local
# path stays as the fallback. Later cells use paths relative to this
# directory (the '*.csv' glob and '../Models/h5/rf.h5').
data_dir = os.environ.get(
    "SPOTIFY_DATA_DIR",
    r"C:\Users\Cristina Bardan\Desktop\Repositories\FinalProject-Spotify\Original_data",
)
os.chdir(data_dir)

In [3]:
# Collect every file in the working directory that matches '*.csv'.
# glob returns matches in arbitrary (OS-dependent) order, so the list is
# sorted: this keeps the row order of the combined dataset, and therefore
# the seeded train/test split built from it, the same on every machine.
extension = 'csv'
all_filenames = sorted(glob.glob('*.{}'.format(extension)))
all_filenames

['dataset-of-00s.csv',
 'dataset-of-10s.csv',
 'dataset-of-60s.csv',
 'dataset-of-70s.csv',
 'dataset-of-80s.csv',
 'dataset-of-90s.csv']

In [4]:
# Read every CSV in the list and stack them into one DataFrame.
# (Nothing is written back to disk here; the combined frame lives in memory.)
if not all_filenames:
    # pd.concat would raise a terse ValueError on an empty list; keep the
    # exception type but say what actually went wrong.
    raise ValueError("No CSV files found in the working directory: nothing to combine")

# ignore_index=True gives the result a fresh 0..n-1 index, so no separate
# in-place reset_index is needed.
data = pd.concat([pd.read_csv(f) for f in all_filenames], ignore_index=True)

In [5]:
# Show the column names of the combined dataset (used below to pick the
# feature columns and the target column).
data.columns

Index(['track', 'artist', 'uri', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit',
       'sections', 'target'],
      dtype='object')

In [6]:
# Label vector: the `target` column of the combined dataset.
y = data['target']

# Feature matrix: the 15 columns listed here. The remaining columns of
# `data` (track, artist, uri and target itself) are left out.
feature_columns = [
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms',
    'time_signature', 'chorus_hit', 'sections',
]
x = data[feature_columns]

In [7]:
# Sanity check: print the shape of the feature matrix followed by the shape
# of the label vector; both should report the same number of rows.
print(*(part.shape for part in (x, y)))

(41106, 15) (41106,)


In [8]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test parts. random_state pins the
# shuffle so the same rows land in each part on every run; no test_size is
# passed, so the library's default proportions apply.
split_parts = train_test_split(x, y, random_state=12)
x_train, x_test, y_train, y_test = split_parts

### Random Forest classifier

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so the test split does not
# influence the fitted scaling parameters.
x_scaler = StandardScaler()
x_scaler.fit(x_train)

# Apply the same fitted scaler to both splits.
x_train_scaled, x_test_scaled = (
    x_scaler.transform(split) for split in (x_train, x_test)
)

In [10]:
# Create a random forest classifier
from sklearn.ensemble import RandomForestClassifier

# random_state is pinned on the estimator itself (same value as the NumPy
# seed set at the top of the notebook). Without it the forest depends on how
# far the global NumPy RNG has advanced, so simply re-running this cell
# would train a different model and change every score below.
model = RandomForestClassifier(n_estimators=200, random_state=20)
model = model.fit(x_train_scaled, y_train)

In [11]:
# Score the fitted model on the data it was trained on and on the held-out
# split; a large gap between the two numbers points to overfitting.
train_score = model.score(x_train_scaled, y_train)
test_score = model.score(x_test_scaled, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9993836971682507
Testing Data Score: 0.7883623625571665


In [12]:
# Validation of feature importance
importances = model.feature_importances_
importances

array([0.10271071, 0.08363844, 0.02972493, 0.0823068 , 0.01026878,
       0.07410243, 0.10706546, 0.16160161, 0.05138273, 0.07215949,
       0.05429002, 0.07510012, 0.0067794 , 0.05092323, 0.03794585])

In [13]:
# Pair each importance score with its column name, then order the pairs
# from the most to the least important feature.
ranked_features = list(zip(model.feature_importances_, x.columns))
ranked_features.sort(reverse=True)
ranked_features

[(0.1616016100025217, 'instrumentalness'),
 (0.1070654609190537, 'acousticness'),
 (0.10271071047704178, 'danceability'),
 (0.08363844200055588, 'energy'),
 (0.08230679984020853, 'loudness'),
 (0.07510012135816571, 'duration_ms'),
 (0.07410242758962575, 'speechiness'),
 (0.07215948844632883, 'valence'),
 (0.05429002409144878, 'tempo'),
 (0.05138273046462514, 'liveness'),
 (0.050923230770768046, 'chorus_hit'),
 (0.03794584582792825, 'sections'),
 (0.029724932619056243, 'key'),
 (0.010268775607701109, 'mode'),
 (0.006779399984970517, 'time_signature')]

In [14]:
import pickle

# Persist the fitted model with pickle.
# NOTE: despite the .h5 extension this file is a pickle, not HDF5. The path
# is kept unchanged so anything that already reads it keeps working.
model_path = '../Models/h5/rf.h5'

# open(..., 'wb') does not create missing folders and would fail with
# FileNotFoundError on a fresh checkout, so make sure the folder exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(model, f)

In [15]:
# Round-trip check: reload the pickled model from disk and display it.
# pickle.load can execute arbitrary code while unpickling, so only load
# files that were produced by a trusted source (here: the cell above).
with open('../Models/h5/rf.h5','rb') as fp:
    thing = pickle.load(fp)
thing

RandomForestClassifier(n_estimators=200)