In [76]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, recall_score


In [77]:
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_s
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194.754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162.6
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176.616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169.093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189.052


In [78]:
data.isnull().sum()

track_id                    0
track_name                  0
track_artist                0
track_popularity            0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_s                  0
dtype: int64

In [79]:
def extract_year_from_date(date_str):
    # This function attempts to handle strings and extract the year directly
    if pd.isna(date_str):
        return None  # Return None or a default value if the date is missing
    parts = date_str.split('-')
    # Assume the year is the first part of the date string
    if len(parts[0]) == 4:
        return int(parts[0])
    return None

In [80]:
# Extract the year
data['release_year'] = data['track_album_release_date'].apply(extract_year_from_date)

In [81]:
data.isnull().sum()

track_id                    0
track_name                  0
track_artist                0
track_popularity            0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_s                  0
release_year                0
dtype: int64

In [82]:
# Features for the model 
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence', 'tempo', 'release_year']

# Target variable
target = 'track_popularity'

#Prepare the data
X = data[features]
y = data[target]



In [83]:
#Normalizing the features
scaler = MinMaxScaler()

X_scaled = scaler.fit_transform(X)

X_scaled

array([[0.76093591, 0.9159853 , 0.91808981, ..., 0.52270434, 0.50967257,
        0.98412698],
       [0.73855544, 0.81496762, 0.86916162, ..., 0.69929364, 0.41752422,
        0.98412698],
       [0.68667345, 0.93098792, 0.90136831, ..., 0.6185671 , 0.51790845,
        0.98412698],
       ...,
       [0.53814852, 0.82096867, 0.87062842, ..., 0.43995964, 0.53453475,
        0.9047619 ],
       [0.63682604, 0.8879804 , 0.90285607, ..., 0.31079717, 0.5346141 ,
        0.9047619 ],
       [0.61342828, 0.8839797 , 0.87750141, ..., 0.09021191, 0.53451387,
        0.9047619 ]])

In [84]:
# Create a binary target based on 'track_popularity'
popularity_threshold = data['track_popularity'].quantile(0.75)
y = (data['track_popularity'] >= popularity_threshold).astype(int)


In [85]:
#Split the dataset into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


In [86]:
print(y)

0        1
1        1
2        1
3        1
4        1
        ..
26224    0
26225    0
26226    0
26227    0
26228    0
Name: track_popularity, Length: 26229, dtype: int32


In [87]:
#Create a new classifier object c
decision_tree = DecisionTreeClassifier()

#Train the model
decision_tree.fit(X_train, y_train)

#Make predictions on the test set
y_predict = decision_tree.predict(X_test)

# Calculating metrics
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f_score = f1_score(y_test, y_predict)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", f_score)


Accuracy: 0.6667937476172322
Precision: 0.34827332843497427
Recall: 0.3550561797752809
F-Score: 0.3516320474777448


In [88]:
from sklearn.model_selection import GridSearchCV


#Define the parameter
param_grid = {
    'max_depth': [None,10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf':[1,2,4]
}
                    

grid_search = GridSearchCV(estimator=decision_tree,param_grid=param_grid,cv=5,verbose=2)

grid_search.fit(X_train,y_train)
print("Best parameters: ", grid_search.best_params_)  

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[C

In [89]:
best_params = grid_search.best_params_

#Train a decision tree classifier with the best parameters
best_decision_tree = DecisionTreeClassifier(**best_params)
best_decision_tree.fit(X_train,y_train)

y_pred_2 = best_decision_tree.predict(X_test)

In [90]:
# Calculating metrics
accuracy = accuracy_score(y_test, y_pred_2)
precision = precision_score(y_test, y_pred_2)
recall = recall_score(y_test, y_pred_2)
f_score = f1_score(y_test, y_pred_2)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", f_score)




Accuracy: 0.7310331681280976
Precision: 0.42338709677419356
Recall: 0.15730337078651685
F-Score: 0.22938285090114693


In [91]:
def recommendation(track_name, track_artist, data, features, model, top_n=10):
    # Searches the dataset for a track that matches the given track_name and track_artist.
    input_track = data[(data['track_name'] == track_name) & (data['track_artist'] == track_artist)]
    
    # Checks if the input_track DataFrame is empty (i.e., no track was found).
    if input_track.empty:
        return "Song is not found in our dataset."
    
    # Extracts the values of the specified features for the found input_track.
    input_features = input_track[features].values

    # Uses the decision tree model to predict popularity for each track in the dataset based on the specified features.
    popularity_scores = model.predict(data[features])

    # Calculate cosine similarity
    similarities = cosine_similarity(input_features, data[features].values).flatten()

    # Multiplies the cosine similarities by the predicted popularity scores.
    #
    scores = similarities * popularity_scores  

    # Determine top recommendations
    top_indices = np.argsort(scores)[-top_n-1:-1][::-1]
    similar_tracks = data.iloc[top_indices]

    results = similar_tracks[['track_name', 'track_artist']]


    return results


In [92]:
# Example usage of the function (we will use a real track name and artist from the dataset for demonstration)
example_track_name = "Memories"
example_track_artist = "Maroon 5"

# Get recommendations
recommended_songs = recommendation(example_track_name, example_track_artist, data, features,best_decision_tree)
recommended_songs

Unnamed: 0,track_name,track_artist
6077,Fekka,Zoop One
12343,Drive,Incubus
7173,"Ice Cream (feat. Ghostface Killah, Method Man ...",Raekwon
19012,Happy,Ashanti
15005,40%,Aya Nakamura
3408,Middle of Somewhere,The Neighbourhood
6219,I Got Love,Nate Dogg
8922,Gan-Ga - Remix,Bryant Myers
17984,Rich (feat. D Block Europe & Offset),The Plug
2379,Chlorine,Twenty One Pilots
