In [None]:
import numpy as np
import pandas as pd
import json
from path import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import matplotlib.pyplot as plt

In [None]:
# set variable equal to loaded dataset
tracks_df = pd.read_csv('popular_songs.csv', error_bad_lines=False)

#tracks_df = pd.read_csv(file)
#tracks_df.shape

In [None]:
tracks_df.head()

In [None]:
tracks_df.describe().transpose()

In [None]:
#get rid of white space before and after each column name
tracks_df.columns = tracks_df.columns.str.strip()
tracks_df.head()

In [None]:
new_tracks_df = tracks_df.drop(['song','artist','release_date','genres','popular_date'], axis =1)

# Drop the null columns where all values are null
new_tracks_df = new_tracks_df.dropna(axis='columns', how='all')

# Drop the null rows
new_tracks_df = new_tracks_df.dropna()

new_tracks_df.tail()

In [None]:
wob_df = pd.DataFrame(new_tracks_df['weeks_on_board'])
wob_df.describe()

In [None]:
# Create a box-and-whisker plot for the number of weeks on the board.
# Outliers are values that are above 36 weeks on the board.

x_labels = ["Weeks On Board"]
fig, ax = plt.subplots(figsize=(20,26))
ax.boxplot(wob_df, labels=x_labels)
# Add the title, y-axis label and grid
ax.set_title("Top 100 Billboard")
ax.set_ylabel("Number of weeks")
ax.set_yticks(np.arange(0, 100, step=1.0))
ax.grid()
plt.show()

In [None]:
header_list = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'ranking', 'weeks_on_board', 'target']
new_tracks_df = new_tracks_df.reindex(columns = header_list)
new_tracks_df.head()

In [None]:
# The average number of weeks that a song is on the billboard is 12 weeks.
# Create a for loop that determines if the row value in weeks_on_board column is greater than or equal to 12 and assign
# it a new value of 0 and add it to the 'target' column and everything else change to a 1.
for index in new_tracks_df.index:
    if new_tracks_df.loc[index,'weeks_on_board']>=12:
        new_tracks_df.loc[index,'target'] = 0
    else:
        new_tracks_df.loc[index,'target'] = 1

In [113]:
new_tracks_df.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,ranking,weeks_on_board,target
0,76.0,0.0906,0.499,0.8,0.0,0.147,-2.665,0.0502,139.919,70,19,0.0
1,78.0,0.0334,0.721,0.716,0.00084,0.237,-7.037,0.0657,104.994,67,8,1.0
2,71.0,0.188,0.846,0.748,0.0,0.093,-3.512,0.078,90.006,68,6,1.0
3,45.0,0.295,0.803,0.715,0.000134,0.0574,-3.28,0.298,101.085,69,7,1.0
4,71.0,0.036,0.445,0.492,0.0,0.122,-5.717,0.0294,148.013,75,3,1.0


In [115]:
target_tracks_df = new_tracks_df.drop(columns = "weeks_on_board")
target_tracks_df.head()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,ranking,target
0,76.0,0.0906,0.499,0.8,0.0,0.147,-2.665,0.0502,139.919,70,0.0
1,78.0,0.0334,0.721,0.716,0.00084,0.237,-7.037,0.0657,104.994,67,1.0
2,71.0,0.188,0.846,0.748,0.0,0.093,-3.512,0.078,90.006,68,1.0
3,45.0,0.295,0.803,0.715,0.000134,0.0574,-3.28,0.298,101.085,69,1.0
4,71.0,0.036,0.445,0.492,0.0,0.122,-5.717,0.0294,148.013,75,1.0


# Use the Nueral Network MLPClassifier ML Model for the target_tracks_df
    1. This dataframe excludes the number of weeks on the board and instead uses a binary outcome. This binary values was determined by if the weeks on board is greater than or equal to 12 then change to 0, else change to 1.
    

### Create features and split into training and testing 

In [116]:
# Create our features
X = target_tracks_df.drop(columns = "target")
#X = pd.get_dummies(X)

# Create our target
y = target_tracks_df["target"]

In [117]:
X.describe()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,ranking
count,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0
mean,63.208398,0.166375,0.646721,0.676498,0.010053,0.178256,-5.93654,0.10192,122.542894,50.471037
std,16.218658,0.209721,0.135772,0.166208,0.07609,0.135081,2.303629,0.102682,28.346675,28.863363
min,0.0,1.4e-05,0.0768,0.00231,0.0,0.02,-42.887,0.0225,48.718,1.0
25%,57.0,0.0219,0.557,0.567,0.0,0.0945,-7.031,0.0378,99.643,25.0
50%,66.0,0.0754,0.652,0.698,0.0,0.124,-5.603,0.0554,122.504,50.0
75%,73.0,0.228,0.742,0.805,1.1e-05,0.224,-4.421,0.121,140.309,75.0
max,97.0,0.996,0.981,0.996,0.973,0.989,0.175,0.951,208.067,100.0


In [118]:
# Check the balance of our target values
y.value_counts()

1.0    29045
0.0    23350
Name: target, dtype: int64

In [119]:
# train the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state = 1)

In [120]:
# def accuracy(confusion_matrix):
#     diagonal_sum = confusion_matrix.trace()
#     sum_of_all_elements = confusion_matrix.sum()
#     return diagonal_sum / sum_of_all_elements

X_train.shape

(39296, 10)

In [121]:
X_test.shape

(13099, 10)

### Standardization: StandardScaler

In [122]:
scaler = StandardScaler()

In [123]:
train_scaled = scaler.fit_transform(X_train)

In [124]:
test_scaled = scaler.transform(X_test)

In [125]:
#Initializing the MLPClassifier

model = MLPClassifier(solver='lbfgs', random_state=42)

In [126]:
#Fitting the training data to the network
model.fit(train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(random_state=42, solver='lbfgs')

In [127]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(test_scaled)

In [128]:
accuracy_score(y_train, model.predict(train_scaled))*100

72.12947882736157

In [129]:
accuracy_score(y_test, y_pred)*100

70.28017405908848

In [130]:
cm = confusion_matrix
cm(y_test, y_pred)

array([[3801, 2077],
       [1816, 5405]], dtype=int64)

# Use the Nueral Network MLPClassifier ML Model for the new_tracks_df
    1. This dataframe includes the number of weeks on the board and uses the binary values that were determined by if the weeks on board is greater than or equal to 12 then change to 0, else change to 1.
    

### Create features and split into training and testing 

In [131]:
# Create our features
X = new_tracks_df.drop(columns = "target")
#X = pd.get_dummies(X)

# Create our target
y = new_tracks_df["target"]

In [132]:
X.describe()

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,ranking,weeks_on_board
count,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0,52395.0
mean,63.208398,0.166375,0.646721,0.676498,0.010053,0.178256,-5.93654,0.10192,122.542894,50.471037,12.278118
std,16.218658,0.209721,0.135772,0.166208,0.07609,0.135081,2.303629,0.102682,28.346675,28.863363,10.130632
min,0.0,1.4e-05,0.0768,0.00231,0.0,0.02,-42.887,0.0225,48.718,1.0,1.0
25%,57.0,0.0219,0.557,0.567,0.0,0.0945,-7.031,0.0378,99.643,25.0,4.0
50%,66.0,0.0754,0.652,0.698,0.0,0.124,-5.603,0.0554,122.504,50.0,10.0
75%,73.0,0.228,0.742,0.805,1.1e-05,0.224,-4.421,0.121,140.309,75.0,17.0
max,97.0,0.996,0.981,0.996,0.973,0.989,0.175,0.951,208.067,100.0,87.0


In [133]:
# Check the balance of our target values
y.value_counts()

1.0    29045
0.0    23350
Name: target, dtype: int64

In [134]:
# train the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state = 1)

In [135]:
X_train.shape

(39296, 11)

In [136]:
X_test.shape

(13099, 11)

In [137]:
scaler = StandardScaler()

In [138]:
train_scaled = scaler.fit_transform(X_train)

In [139]:
test_scaled = scaler.transform(X_test)

In [140]:
#Initializing the MLPClassifier

model = MLPClassifier(solver='lbfgs', random_state=42)

In [141]:
#Fitting the training data to the network
model.fit(train_scaled, y_train)

MLPClassifier(random_state=42, solver='lbfgs')

In [142]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(test_scaled)

In [143]:
accuracy_score(y_train, model.predict(train_scaled))*100

100.0

In [144]:
accuracy_score(y_test, y_pred)*100

100.0

In [145]:
cm = confusion_matrix
cm(y_test, y_pred)

array([[5878,    0],
       [   0, 7221]], dtype=int64)