In [1]:
# prompt: create a machine learning model that takes an existing dataset of songs with various characteristics and suggests songs for user based on a song request

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from pathlib import Path


In [3]:
# Download data with genre
file_path_test = Path(r"Resources\genre_music.csv")
genre_df = pd.read_csv(file_path_test)

In [5]:
genre_df.head()

Unnamed: 0,track,artist,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_s,time_signature,chorus_hit,sections,popularity,decade,genre
0,Jealous Kind Of Fella,Garland Green,0.417,0.62,3,-7.727,1,0.0403,0.49,0.0,0.0779,0.845,185.655,173.533,3,32.94975,9,1,60s,edm
1,Initials B.B.,Serge Gainsbourg,0.498,0.505,3,-12.475,1,0.0337,0.018,0.107,0.176,0.797,101.801,213.613,4,48.8251,10,0,60s,pop
2,Melody Twist,Lord Melody,0.657,0.649,5,-13.392,1,0.038,0.846,4e-06,0.119,0.908,115.94,223.96,4,37.22663,12,0,60s,pop
3,Mi Bomba Sonó,Celia Cruz,0.59,0.545,7,-12.058,0,0.104,0.706,0.0246,0.061,0.967,105.592,157.907,4,24.75484,8,0,60s,pop
4,Uravu Solla,P. Susheela,0.515,0.765,11,-3.515,0,0.124,0.857,0.000872,0.213,0.906,114.617,245.6,4,21.79874,14,0,60s,r&b


In [9]:
# List the distinct music genres included
genre_df.genre.unique()

array(['edm', 'pop', 'r&b', 'rock', 'rap', 'latin'], dtype=object)

In [15]:
# Create copy
copy_df = genre_df.drop(columns = ['track', 'artist', 'genre'])
copy_df = pd.get_dummies(copy_df)
copy_df

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,time_signature,chorus_hit,sections,popularity,decade_00s,decade_10s,decade_60s,decade_70s,decade_80s,decade_90s
0,0.417,0.620,3,-7.727,1,0.0403,0.4900,0.000000,0.0779,0.8450,...,3,32.94975,9,1,False,False,True,False,False,False
1,0.498,0.505,3,-12.475,1,0.0337,0.0180,0.107000,0.1760,0.7970,...,4,48.82510,10,0,False,False,True,False,False,False
2,0.657,0.649,5,-13.392,1,0.0380,0.8460,0.000004,0.1190,0.9080,...,4,37.22663,12,0,False,False,True,False,False,False
3,0.590,0.545,7,-12.058,0,0.1040,0.7060,0.024600,0.0610,0.9670,...,4,24.75484,8,0,False,False,True,False,False,False
4,0.515,0.765,11,-3.515,0,0.1240,0.8570,0.000872,0.2130,0.9060,...,4,21.79874,14,0,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41094,0.172,0.358,9,-14.430,1,0.0342,0.8860,0.966000,0.3140,0.0361,...,4,24.30824,7,0,False,True,False,False,False,False
41095,0.910,0.366,1,-9.954,1,0.0941,0.0996,0.000000,0.2610,0.7400,...,4,32.53856,8,1,False,True,False,False,False,False
41096,0.719,0.804,10,-4.581,1,0.0355,0.0132,0.000003,0.1390,0.6050,...,4,20.73371,7,1,False,True,False,False,False,False
41097,0.600,0.177,7,-16.070,1,0.0561,0.9890,0.868000,0.1490,0.5600,...,4,21.65301,14,0,False,True,False,False,False,False


In [17]:
correlation_matrix = copy_df.corr()

print(correlation_matrix)

                  danceability    energy       key  loudness      mode  \
danceability          1.000000  0.206036  0.015433  0.273997 -0.032740   
energy                0.206036  1.000000  0.022598  0.772611 -0.033907   
key                   0.015433  0.022598  1.000000  0.008483 -0.140398   
loudness              0.273997  0.772611  0.008483  1.000000  0.000384   
mode                 -0.032740 -0.033907 -0.140398  0.000384  1.000000   
speechiness           0.156362  0.122360  0.026554  0.069115 -0.059758   
acousticness         -0.261122 -0.715084 -0.024240 -0.566503  0.050028   
instrumentalness     -0.301834 -0.208113 -0.013120 -0.374206 -0.075968   
liveness             -0.115275  0.157797  0.000639  0.086676  0.008781   
valence               0.553845  0.341398  0.007748  0.271706  0.035613   
tempo                -0.066588  0.224107  0.001116  0.169506  0.027088   
duration_s           -0.062915  0.011961  0.015480 -0.049733 -0.074744   
time_signature        0.191814  0.1962

In [159]:
# Create a clustering model
y = genre_df['genre']
X = copy_df.drop(columns = ['key', 'mode', 'liveness', 'duration_s', 'time_signature', 'chorus_hit', 'sections'])
# 'decade_60s', 'decade_70s', 'decade_80s', 'decade_90s', 'decade_00s', 'decade_10s'

In [161]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [163]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [165]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [167]:
# Declare a logistic regression model.
# Apply a random_state of 9 to the model
logistic_regression_model = LogisticRegression(random_state=9)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [169]:
#Generate training predictions
training_predictions = lr_model.predict(X_train_scaled)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test_scaled)


In [171]:
# Import the model for sklearn confusion matrix
from sklearn.metrics import confusion_matrix

# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[1073   32   21    2    5   13]
 [   7 1517  126  484   13   11]
 [  60    0 9620 2575   22   75]
 [  33  774 3252 4089   40   69]
 [   3    1    4    5 1737   27]
 [  61   23   22    7    7 5014]]


In [173]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

         edm       0.87      0.94      0.90      1146
       latin       0.65      0.70      0.67      2158
         pop       0.74      0.78      0.76     12352
         r&b       0.57      0.50      0.53      8257
         rap       0.95      0.98      0.96      1777
        rock       0.96      0.98      0.97      5134

    accuracy                           0.75     30824
   macro avg       0.79      0.81      0.80     30824
weighted avg       0.74      0.75      0.74     30824



In [175]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

         edm       0.89      0.91      0.90       369
       latin       0.61      0.68      0.65       712
         pop       0.74      0.78      0.76      4108
         r&b       0.57      0.50      0.53      2783
         rap       0.95      0.97      0.96       587
        rock       0.96      0.99      0.97      1716

    accuracy                           0.75     10275
   macro avg       0.79      0.80      0.79     10275
weighted avg       0.74      0.75      0.74     10275

