In [62]:
import numpy as np
import pandas as pd
from path import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score

In [63]:
# set variable equal to loaded dataset
file = ("filtered_tracks.csv")
tracks_df = pd.read_csv(file)
tracks_df.shape

(155529, 20)

In [64]:
tracks_df.head()

Unnamed: 0,id,song,popularity,duration_ms,explicit,artist,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,The Toys,6lH5PpuiMa5SpfjoIOlwCS,2020-03-13,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,Frank Sinatra,1Mxqyy3pSjf8kZZL4QVxS0,2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,Frank Sinatra,1Mxqyy3pSjf8kZZL4QVxS0,2018-05-04,0.269,0.129,7,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,3
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,Joni Mitchell,5hW4L92KnC6dX9t7tYM4Ve,2020-10-30,0.644,0.212,11,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,3
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,Joni Mitchell,5hW4L92KnC6dX9t7tYM4Ve,2020-10-30,0.627,0.184,1,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,4


In [65]:
tracks_df.describe().transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
popularity,155529.0,40.473429,20.690985,0.0,30.0,44.0,55.0,97.0
duration_ms,155529.0,232191.074777,159297.947402,4937.0,185840.0,217800.0,256467.0,5403500.0
explicit,155529.0,0.159552,0.366191,0.0,0.0,0.0,0.0,1.0
danceability,155529.0,0.623606,0.160069,0.0,0.521,0.639,0.741,0.988
energy,155529.0,0.664439,0.212362,0.0,0.532,0.693,0.832,1.0
key,155529.0,5.339384,3.587823,0.0,2.0,6.0,8.0,11.0
loudness,155529.0,-7.252142,3.841879,-57.093,-8.484,-6.492,-4.947,1.933
mode,155529.0,0.584521,0.492806,0.0,0.0,1.0,1.0,1.0
speechiness,155529.0,0.102525,0.106938,0.0,0.0376,0.0563,0.12,0.962
acousticness,155529.0,0.275195,0.284772,0.0,0.0318,0.168,0.457,0.996


In [66]:
#get rid of white space before and after each column name
tracks_df.columns = tracks_df.columns.str.strip()
tracks_df.head()

Unnamed: 0,id,song,popularity,duration_ms,explicit,artist,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6Pkt6qVikqPBt9bEQy8iTz,A Lover's Concerto,41,159560,0,The Toys,6lH5PpuiMa5SpfjoIOlwCS,2020-03-13,0.671,0.867,2,-2.706,1,0.0571,0.436,0.0,0.139,0.839,120.689,4
1,1hx7X9cMXHWJjknb9O6Ava,The September Of My Years - Live At The Sands ...,26,187333,0,Frank Sinatra,1Mxqyy3pSjf8kZZL4QVxS0,2018-05-04,0.319,0.201,7,-17.796,1,0.0623,0.887,0.0,0.904,0.239,117.153,3
2,19oquvXf3bc65GSqtPYA5S,It Was A Very Good Year - Live At The Sands Ho...,25,236800,0,Frank Sinatra,1Mxqyy3pSjf8kZZL4QVxS0,2018-05-04,0.269,0.129,7,-18.168,0,0.0576,0.938,5e-06,0.683,0.16,82.332,3
3,55qyghODi24yaDgKBI6lx0,"The Circle Game - Live at The 2nd Fret, Philad...",18,313093,0,Joni Mitchell,5hW4L92KnC6dX9t7tYM4Ve,2020-10-30,0.644,0.212,11,-14.118,1,0.0347,0.881,2.2e-05,0.798,0.441,117.072,3
4,00xemFYjQNRpOlPhVaLAHa,"Urge For Going - Live at The 2nd Fret, Philade...",18,295093,0,Joni Mitchell,5hW4L92KnC6dX9t7tYM4Ve,2020-10-30,0.627,0.184,1,-15.533,1,0.045,0.955,0.000162,0.0986,0.299,115.864,4


In [71]:
new_tracks_df = tracks_df.drop(['id','song','artist','release_date','id_artists'], axis =1)

# Drop the null columns where all values are null
new_tracks_df = new_tracks_df.dropna(axis='columns', how='all')

# Drop the null rows
new_tracks_df = new_tracks_df.dropna()

new_tracks_df.tail()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
155524,4,106002,0,0.626,0.53,5,-13.117,0,0.0284,0.113,0.856,0.104,0.215,120.113,4
155525,50,258267,0,0.56,0.518,0,-7.471,0,0.0292,0.785,0.0,0.0648,0.211,131.896,4
155526,72,153293,0,0.765,0.663,0,-5.223,1,0.0652,0.141,0.000297,0.0924,0.686,150.091,4
155527,70,187601,0,0.535,0.314,7,-12.823,0,0.0408,0.895,0.00015,0.0874,0.0663,145.095,4
155528,38,214360,0,0.686,0.723,6,-7.067,1,0.0363,0.105,0.0,0.264,0.975,112.204,4


In [72]:
# #scale the data
# sc = StandardScaler
# new_tracks_df = sc.fit_transform(new_tracks_df[['popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']])
# new_tracks_df

# Create features and split into training and testing 

In [73]:
# Create our features
X = new_tracks_df.drop(columns = "explicit")
#X = pd.get_dummies(X)

# Create our target
y = new_tracks_df["explicit"]

In [74]:
X.describe()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0,155529.0
mean,40.473429,232191.1,0.623606,0.664439,5.339384,-7.252142,0.584521,0.102525,0.275195,0.094668,0.208225,0.511162,121.79286,3.936931
std,20.690985,159297.9,0.160069,0.212362,3.587823,3.841879,0.492806,0.106938,0.284772,0.251844,0.180872,0.245824,28.53585,0.370352
min,0.0,4937.0,0.0,0.0,0.0,-57.093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,185840.0,0.521,0.532,2.0,-8.484,0.0,0.0376,0.0318,0.0,0.0971,0.318,99.027,4.0
50%,44.0,217800.0,0.639,0.693,6.0,-6.492,1.0,0.0563,0.168,2e-06,0.131,0.51,123.63,4.0
75%,55.0,256467.0,0.741,0.832,8.0,-4.947,1.0,0.12,0.457,0.00117,0.268,0.708,138.096,4.0
max,97.0,5403500.0,0.988,1.0,11.0,1.933,1.0,0.962,0.996,1.0,0.998,1.0,229.862,5.0


In [75]:
# Check the balance of our target values
y.value_counts()

0    130714
1     24815
Name: explicit, dtype: int64

In [76]:
# train the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state = 1)

In [77]:
# def accuracy(confusion_matrix):
#     diagonal_sum = confusion_matrix.trace()
#     sum_of_all_elements = confusion_matrix.sum()
#     return diagonal_sum / sum_of_all_elements

X_train.shape

(116646, 14)

In [78]:
X_test.shape

(38883, 14)

# Standardization: StandardScaler

In [79]:
scaler = StandardScaler()

In [80]:
train_scaled = scaler.fit_transform(X_train)

In [81]:
test_scaled = scaler.transform(X_test)

In [82]:
#Initializing the MLPClassifier

model = MLPClassifier(solver='lbfgs', random_state=42)

In [83]:
#Fitting the training data to the network
model.fit(train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(random_state=42, solver='lbfgs')

In [84]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(test_scaled)

In [85]:
accuracy_score(y_train, model.predict(train_scaled))*100

87.63781012636525

In [86]:
accuracy_score(y_test, y_pred)*100

86.84772265514492

In [87]:
cm = confusion_matrix
cm(y_test, y_pred)

array([[31109,  1661],
       [ 3453,  2660]], dtype=int64)

# Below here is previous practice work

In [31]:
#comparing the predictions against the actual observations in y_test
# cm = confusion_matrix(y_test, y_pred)

# #Print accuracy score
# print("Accuracy of MLPClassifier :", (accuracy(cm)*100))

# Calculate Balanced Accuracy Score
bas = balanced_accuracy_score

y_pred = classifier.predict(X_test)
bas(y_test, y_pred)*100

1.0416666666666665

In [32]:
# Displays confusion matrix
cm = confusion_matrix

cm(y_test, y_pred)

array([[2071,    0,    0, ...,    0,    0,    0],
       [ 866,    0,    0, ...,    0,    0,    0],
       [ 712,    0,    0, ...,    0,    0,    0],
       ...,
       [   1,    0,    0, ...,    0,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0]], dtype=int64)

# Connect Data From SQL and Run ML Model

In [33]:
from sqlalchemy import create_engine
# Postgres username, password, and database name
POSTGRES_ADDRESS = 'music-kpop.cgfelcbkvk1j.us-east-2.rds.amazonaws.com' ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_PORT = '5432'
POSTGRES_USERNAME = 'kenneth' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = 'password3' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD 
POSTGRES_DBNAME = 'Popular Music' ## CHANGE THIS TO YOUR DATABASE NAME
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(username=POSTGRES_USERNAME, 
                                                                                        password=POSTGRES_PASSWORD, 
                                                                                        ipaddress=POSTGRES_ADDRESS, 
                                                                                        port=POSTGRES_PORT, 
                                                                                        dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

In [34]:
#db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/movie_data"

NameError: name 'db_password' is not defined