<h1>Imports and API setup</h1>

In [3]:
from __future__ import print_function    # (at top of module)
import warnings
warnings.filterwarnings('always')
from spotipy.oauth2 import SpotifyClientCredentials
import json
import spotipy
import time
import sys
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
% matplotlib inline
import math
import seaborn as sns
import config


# Spotify API setup: build a client-credentials manager from the id/secret
# pair stored in the local config module and hand it to the API client.
client_credentials_manager = SpotifyClientCredentials(
    client_id=config.client_id,
    client_secret=config.client_secret,
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Verbose request tracing is left switched off; set to True to turn it on
sp.trace = False

## Read data from CSV

In [4]:
# Load the song dataset from the CSV file
data = pd.read_csv('file_path_big.csv')
print("Number of entries in original data: " + str(len(data.index)))

# Preview the first rows. Only the last expression of a cell is rendered, so
# the preview has to come last (mid-cell its result was silently discarded).
data.head()

Number of entries in original data: 441


## Clean the data

In [5]:
# Keep only the first row for every (Song Title, Artist) combination
data = data.drop_duplicates(
    subset=['Song Title', 'Artist'],
    keep='first',
)
print("Number of entries in original data after cleaning: " + str(data.shape[0]))

Number of entries in original data after cleaning: 441


## Analyzing the data

In [None]:
# Bar chart of the number of songs per "Mode" value
sns.countplot(data=data, x="Mode")

In [None]:
# Bar chart of the number of songs per "Time Signature" value
sns.countplot(data=data, x="Time Signature")

In [None]:
# Bar chart of the number of songs per "Key" value
sns.countplot(data=data, x="Key")

In [None]:
# Histogram of the Popularity column using 30 bins on a 10x8 inch figure.
# NOTE(review): rwidth is the bar width relative to the bin width, so 10 is
# outside the usual 0-1 range -- confirm the intended value.
ax = data['Popularity'].plot.hist(bins=30, rwidth=10, figsize=(10, 8))
ax.set_xlabel('Popularity')
ax.set_title("Popularity histogram")
ax.grid(alpha=0.3)

In [None]:
# Switch seaborn to the dark-grid theme, then draw the Popularity distribution.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot when the environment is upgraded.
sns.set_style(style='darkgrid')
sns.distplot(data.loc[:, "Popularity"])

## Label the data according to threshold for plotting

In [9]:
# Label data according to a threshold for nicer plotting
labeled_data = data.copy()
threshold = 70

# The label is 1 when Popularity is strictly above the threshold, 0 otherwise.
# The comparison is vectorised instead of looping over the column; a missing
# Popularity compares False and therefore gets 0, exactly as in a plain loop.
labels = (data['Popularity'] > threshold).astype(int).tolist()
labeled_data['Is_Popular'] = labels

In [None]:
# Number of songs per Is_Popular label, split by "Mode"
sns.countplot(data=labeled_data, x="Is_Popular", hue="Mode")

In [None]:
# Duration against popularity, coloured by the Is_Popular label
ax = sns.scatterplot(
    data=labeled_data,
    x="Popularity",
    y="Duration in ms",
    hue="Is_Popular",
)
ax.set_title("Duration and popularity")

In [None]:
# Instrumentalness against popularity, coloured by the Is_Popular label
ax = sns.scatterplot(
    data=labeled_data,
    x="Popularity",
    y="Instrumentalness",
    hue="Is_Popular",
)
ax.set_title("Instrumentalness and popularity")

In [None]:
# Loudness against popularity, coloured by the Is_Popular label
ax = sns.scatterplot(
    data=labeled_data,
    x="Popularity",
    y="Loudness",
    hue="Is_Popular",
)
ax.set_title("Loudness and popularity")

## Data wrangling

In [None]:
# Count the missing values in every column of the dataset
data.isnull().sum(axis=0)

## Setting popularity threshold and adding labels to data

In [9]:
# Make a copy of the data to which we will add labels and then remove any
# columns that we will not need.
# This is currently a duplicate of the functionality above - could maybe only do this in one place

final_data = data.copy()
threshold = 92

# The label is 1 when Popularity is strictly above the threshold, 0 otherwise.
# Vectorised comparison instead of a Python loop; a missing Popularity compares
# False and therefore counts as "not popular", exactly as in a plain loop.
labels = (data['Popularity'] > threshold).astype(int).tolist()
labeled_popular = sum(labels)
labeled_notpopular = len(labels) - labeled_popular
final_data['Is_Popular'] = labels

print('Number of popular examples after thresholding : ', labeled_popular)
print('Number of not popular examples after thresholding : ', labeled_notpopular)

# Drop the identifying columns and the raw Popularity score the label was
# derived from. `axis` is passed by keyword because pandas 2.0 no longer
# accepts it positionally, and the result is re-assigned instead of using
# inplace=True.
final_data = final_data.drop(['Song Title', 'Artist', 'Popularity'], axis=1)

Number of popular examples after thresholding :  41
Number of not popular examples after thresholding :  400


## Train data

In [10]:
# y holds the Is_Popular label; X holds every remaining column as features
y = final_data['Is_Popular']
X = final_data.drop(['Is_Popular'], axis=1)
# Sanity check: both must report the same number of rows
print("Number of entries in actual data: " + str(X.shape[0]))
print("Number of entries in label data: " + str(y.shape[0]))

Number of entries in actual data: 441
Number of entries in label data: 441


## Splitting data

In [12]:
# Hold out half of the rows for testing; the fixed random_state keeps the
# split the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0,
)
print("Items in training data set : ", str(X_train.shape[0]))
print("Items in testing data set: ", str(X_test.shape[0]))

Items in training data set :  220
Items in testing data set:  221


## Model training and prediction

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with the library's default settings
logmodel = LogisticRegression()
print(logmodel)

# Fit on the training half, then classify the held-out half
# (fit returns the estimator itself, so the calls can be chained)
predictions = logmodel.fit(X_train, y_train).predict(X_test)

# Fraction of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=predictions)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)




0.90497737556561086

In [16]:
# Per-class precision, recall and F1 for the logistic-regression predictions
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       200
           1       0.00      0.00      0.00        21

   micro avg       0.90      0.90      0.90       221
   macro avg       0.45      0.50      0.48       221
weighted avg       0.82      0.90      0.86       221



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [17]:
# Confusion matrix of the logistic-regression predictions
# (rows: true label, columns: predicted label)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true=y_test, y_pred=predictions))

[[200   0]
 [ 21   0]]


## K Nearest Neighbours

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows
nbrs = KNeighborsClassifier(n_neighbors=5)

# Fit on the training half, then classify the held-out half
# (fit returns the estimator itself, so the calls can be chained)
predictionsKNN = nbrs.fit(X_train, y_train).predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictionsKNN)

0.90497737556561086

In [25]:
# Per-class precision, recall and F1 for the k-nearest-neighbours predictions
print(classification_report(y_true=y_test, y_pred=predictionsKNN))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       200
           1       0.00      0.00      0.00        21

   micro avg       0.90      0.90      0.90       221
   macro avg       0.45      0.50      0.48       221
weighted avg       0.82      0.90      0.86       221



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [26]:
# Confusion matrix of the k-nearest-neighbours predictions
# (rows: true label, columns: predicted label)
confusion_matrix(y_true=y_test, y_pred=predictionsKNN)

array([[200,   0],
       [ 21,   0]], dtype=int64)