In [262]:
import numpy as np
import pandas as pd
import json
from path import Path
from math import sqrt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the popular-songs dataset into a DataFrame.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` is its replacement and keeps the previous behaviour
# (malformed rows are skipped and a warning is emitted for each one).
tracks_df = pd.read_csv('popular_songs.csv', on_bad_lines='warn')
tracks_df.head()

-------------------------------------------------------------------------------

# Look at the data and transform it as needed before training the Machine Learning models

In [None]:
# Summary statistics, transposed so that each described column becomes a row
tracks_df.describe().T

In [None]:
# Trim leading/trailing whitespace from every column name
stripped_columns = tracks_df.columns.str.strip()
tracks_df.columns = stripped_columns
tracks_df.head()

In [None]:
# Build the modelling frame in one chain:
#   1. drop the columns that are not needed for the ML models,
#   2. drop every column whose values are all null,
#   3. drop every row that still contains a null.
new_tracks_df = (
    tracks_df
    .drop(columns=['song', 'artist', 'release_date', 'genres', 'popular_date'])
    .dropna(axis='columns', how='all')
    .dropna()
)
new_tracks_df.tail()

In [None]:
# Single-column DataFrame holding only weeks_on_board
wob_df = new_tracks_df['weeks_on_board'].to_frame()
# Summary statistics show, among others, the mean number of weeks a song stays
# on the top 100 Billboard
wob_df.describe()

In [None]:
# Box-and-whisker plot of the number of weeks on the board.
# Outliers are values that are above 36 weeks on the board.

fig, ax = plt.subplots(figsize=(20, 26))
x_labels = ["Weeks On Board"]
ax.boxplot(wob_df, labels=x_labels)
# Title and y-axis label in one call, then one y tick per week plus a grid
ax.set(title="Top 100 Billboard", ylabel="Number of weeks")
ax.set_yticks(np.arange(0, 100, step=1.0))
ax.grid()
plt.show()

In [None]:
# Reindex the frame to the modelling columns listed below; 'target' is placed
# last so the new label column ends up at the end of the dataframe.
header_list = [
    'popularity',
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'ranking',
    'weeks_on_board',
    'target',
]
new_tracks_df = new_tracks_df.reindex(columns=header_list)
new_tracks_df.head()

In [None]:
# The average number of weeks that a song is on the billboard is 12 weeks.
# Label every song from its weeks_on_board value:
#   target = 0 when weeks_on_board >= 12, otherwise target = 1.
# A single vectorized np.where replaces the per-row .loc loop; it produces the
# same 0/1 labels without one indexed write per row and stores them as
# integer class labels.
new_tracks_df['target'] = np.where(new_tracks_df['weeks_on_board'] >= 12, 0, 1)

In [None]:
# Preview the first rows of the frame, including the 'target' column
new_tracks_df.head()

In [None]:
# weeks_on_board is removed because 'target' is used as the label instead;
# ranking is removed to test how the ML models perform without it.
target_tracks_df = new_tracks_df.drop(columns=["weeks_on_board", "ranking"])
target_tracks_df.head()

In [None]:
# Target: the 'target' label column
y = target_tracks_df["target"]

# Features: every column except the label
X = target_tracks_df.drop(columns=["target"])
#X = pd.get_dummies(X)

In [None]:
# Summary statistics for the feature columns
X.describe()

In [None]:
# Check the balance of our target values (number of rows per class)
y.value_counts()

### Testing

In [None]:
# Split features and target into train and test sets:
# 10% of the rows are held out for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=1
)

In [None]:
# Check number of examples (and feature columns) we have in our train data set
X_train.shape

In [None]:
# Check number of examples (and feature columns) we have in our test data set
X_test.shape

### Standardization: StandardScaler

In [None]:
# Create a StandardScaler with default parameters
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then scale those same
# features; the scaled result is kept in "train_scaled".
scaler = scaler.fit(X_train)
train_scaled = scaler.transform(X_train)

In [None]:
# Scale the test data with the already-fitted scaler (transform only, no refit)
test_scaled = scaler.transform(X_test)

-------------------------------------------------------------------------------

# Use the Neural Network MLPClassifier ML Model for the target_tracks_df WITHOUT ranking
    1. This dataframe excludes the number of weeks on the board and instead uses a binary outcome. The binary value is 0 if the number of weeks on the board is greater than or equal to 12, and 1 otherwise.

In [None]:
# Build the multi-layer perceptron classifier (L-BFGS solver, fixed seed,
# up to 2500 iterations)
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(
    solver='lbfgs',
    random_state=42,
    max_iter=2500,
)

In [None]:
# Fitting the scaled training data to the network
model.fit(train_scaled, y_train)

In [None]:
# Predict the labels of the scaled test set
y_pred = model.predict(test_scaled)

In [None]:
# Accuracy (in percent) on the training data.
# Recorded results per max_iter setting:
# 3000 = 73.03573322023115
# 2500 = 73.03573322023115
# 2000 = 72.9466652528894
# 1000 = 72.72611600042413

train_pred = model.predict(train_scaled)
train_acc = accuracy_score(y_train, train_pred) * 100
print(train_acc)

In [None]:
# Accuracy (in percent) on the test data.
# Recorded results per max_iter setting:
# 3000 = 70.24809160305342
# 2500 = 70.24809160305342
# 2000 = 70.38167938931298
# 1000 = 69.80916030534351

test_fraction_correct = accuracy_score(y_test, y_pred)
test_acc = test_fraction_correct * 100
print(test_acc)

In [None]:
# Confusion matrix of the MLP test predictions, wrapped in a labelled DataFrame
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

# Show the matrix, the test accuracy and the per-class report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {test_acc}")
print("Classification Report")
mlp_report = classification_report(y_test, y_pred)
print(mlp_report)

-------------------------------------------------------------------------------

# Use the Random Forest ML Model for the target_tracks_df WITHOUT ranking

In [None]:
# Random forest classifier with 4096 trees and a fixed seed
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(
    n_estimators=4096,
    random_state=78,
)

In [None]:
# Fitting the model on the scaled training data
rf_model = rf_model.fit(train_scaled, y_train)

In [None]:
# Making predictions using the scaled testing data.
predictions = rf_model.predict(test_scaled)

In [None]:
# Confusion matrix of the random forest test predictions
cm = confusion_matrix(y_test, predictions)

# Wrap it in a DataFrame with labelled rows (actual) and columns (predicted).
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=column_labels)
cm_df

In [None]:
# Accuracy (in percent) of the random forest on the training data
accuracy_score(y_train, rf_model.predict(train_scaled))*100

In [None]:
# Test accuracy of the random forest: stored as a fraction, printed in percent
acc_score = accuracy_score(y_test, predictions)
acc_percent = acc_score * 100
print(acc_percent)

In [None]:
# Displaying results for the random forest.
# acc_score holds a fraction, so it is multiplied by 100 here to report a
# percentage, matching the accuracy printed for the test data elsewhere in
# this notebook.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score * 100}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Read the per-feature importances from the fitted Random Forest model
importances = rf_model.feature_importances_
importances

In [None]:
# Pair each importance with its feature name and list them, largest first
importance_by_feature = list(zip(rf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

-------------------------------------------------------------------------------

# Use the KNeighborsRegressor ML Model for the target_tracks_df WITHOUT ranking

In [None]:
# k-nearest-neighbours regressor that uses 17 neighbours
from sklearn.neighbors import KNeighborsRegressor
knr_model = KNeighborsRegressor(n_neighbors=17)

In [None]:
# Fit the regressor on the scaled training data
knr_model.fit(train_scaled, y_train)

In [None]:
# Training-set errors of the k-NN regressor.
# The predictions are computed once and reused for both metrics instead of
# running the neighbour search twice on the same data.
knr_train_pred = knr_model.predict(train_scaled)
knr_mse = mean_squared_error(y_train, knr_train_pred)
knr_mae = mean_absolute_error(y_train, knr_train_pred)

In [None]:
# Report the training MSE and MAE, plus the RMSE derived from the MSE
print("mse = ",knr_mse," & mae = ",knr_mae," & rmse = ", sqrt(knr_mse))

In [None]:
# Test-set errors of the k-NN regressor.
# The predictions are computed once and reused for both metrics instead of
# running the neighbour search twice on the same data.
knr_test_pred = knr_model.predict(test_scaled)
test_knr_mse = mean_squared_error(y_test, knr_test_pred)
test_knr_mae = mean_absolute_error(y_test, knr_test_pred)
print("mse = ",test_knr_mse," & mae = ",test_knr_mae," & rmse = ", sqrt(test_knr_mse))

-------------------------------------------------------------------------------

# Use the KNeighborsClassifier ML Model for the target_tracks_df WITHOUT ranking

In [None]:
# k-nearest-neighbours classifier that uses 23 neighbours
from sklearn.neighbors import KNeighborsClassifier
knc_model = KNeighborsClassifier(n_neighbors=23)

In [None]:
# Fit the classifier on the scaled training data
knc_model.fit(train_scaled,y_train)

In [None]:
# Training-set errors of the k-NN classifier.
# The predictions are computed once and reused for both metrics instead of
# running the neighbour search twice on the same data.
knc_train_pred = knc_model.predict(train_scaled)
knc_mse = mean_squared_error(y_train, knc_train_pred)
knc_mae = mean_absolute_error(y_train, knc_train_pred)

In [None]:
# Report the training MSE and MAE, plus the RMSE derived from the MSE
print("mse = ",knc_mse," & mae = ",knc_mae," & rmse = ", sqrt(knc_mse))

In [None]:
# Test-set errors of the k-NN classifier.
# The predictions are computed once and reused for both metrics instead of
# running the neighbour search twice on the same data.
knc_test_pred = knc_model.predict(test_scaled)
test_mse = mean_squared_error(y_test, knc_test_pred)
test_mae = mean_absolute_error(y_test, knc_test_pred)
print("mse = ",test_mse," & mae = ",test_mae," & rmse = ", sqrt(test_mse))