In [48]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, recall_score


In [49]:
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,6,-2.634,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,11,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,1,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,7,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,1,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052


In [50]:
data.isnull().sum()

track_id                    0
track_name                  0
track_artist                0
track_popularity            0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
dtype: int64

In [51]:
def extract_year_from_date(date_str):
    # This function attempts to handle strings and extract the year directly
    if pd.isna(date_str):
        return None  # Return None or a default value if the date is missing
    parts = date_str.split('-')
    # Assume the year is the first part of the date string
    if len(parts[0]) == 4:
        return int(parts[0])
    return None

In [52]:
# Extract the year
data['release_year'] = data['track_album_release_date'].apply(extract_year_from_date)

In [53]:
data.isnull().sum()

track_id                    0
track_name                  0
track_artist                0
track_popularity            0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
release_year                0
dtype: int64

In [90]:
# Features for the model 
features = ['danceability', 'energy', 'key','loudness','mode', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence', 'tempo','duration_ms', 'release_year']

# Target variable
target = 'track_popularity'

#Prepare the data
X = data[features]
y = data[target]



In [91]:
print(y)

0        66
1        67
2        70
3        60
4        69
         ..
26224    42
26225    20
26226    14
26227    15
26228    27
Name: track_popularity, Length: 26229, dtype: int64


In [74]:
# Create a binary target based on 'track_popularity'
# Set '0' is unpopular and '1' is popular tracks
popularity_threshold = data['track_popularity'].quantile(0.75)
y = (data['track_popularity'] >= popularity_threshold).astype(int)


In [75]:
#Split the dataset into training and testing sets (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)


In [58]:
print(y)

0        1
1        1
2        1
3        1
4        1
        ..
26224    0
26225    0
26226    0
26227    0
26228    0
Name: track_popularity, Length: 26229, dtype: int32


In [76]:
#Create a new classifier object 
decision_tree = DecisionTreeClassifier()

#Train the model
decision_tree.fit(X_train, y_train)

#Make predictions on the test set
y_predict = decision_tree.predict(X_test)

# Calculating metrics
accuracy = accuracy_score(y_test, y_predict)
precision = precision_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)
f_score = f1_score(y_test, y_predict)


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F-Score:", f_score)


Accuracy: 0.6545939763629431
Precision: 0.32779783393501805
Recall: 0.3400749063670412
F-Score: 0.3338235294117647


In [77]:
from sklearn.model_selection import GridSearchCV


#Define the parameter
param_grid = {
    'max_depth': [None,10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf':[1,2,4]
}
                    

grid_search = GridSearchCV(estimator=decision_tree,param_grid=param_grid,cv=5,verbose=2)

grid_search.fit(X_train,y_train)
print("Best parameters: ", grid_search.best_params_)  

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[C

In [78]:
best_params = grid_search.best_params_

#Train a decision tree classifier with the best parameters
best_decision_tree = DecisionTreeClassifier(**best_params)
best_decision_tree.fit(X_train,y_train)

new_y_pred = best_decision_tree.predict(X_test)

In [79]:
# Calculating metrics
new_accuracy = accuracy_score(y_test, new_y_pred)
new_precision = precision_score(y_test, new_y_pred)
new_recall = recall_score(y_test, new_y_pred)
new_f_score = f1_score(y_test, new_y_pred)

print("Accuracy:", new_accuracy)
print("Precision:", new_precision)
print("Recall:", new_recall)
print("F-Score:", new_f_score)




Accuracy: 0.7308425467022494
Precision: 0.425531914893617
Recall: 0.1647940074906367
F-Score: 0.2375809935205183


In [88]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommendation(data,model,X):

    # User input for song and artist
    input_song_title = input("Enter the title of a song: ")
    input_artist = input("Enter the artist of the song: ")

    # Find the user's song in the dataset
    user_song = data[(data['track_name'] == input_song_title) & (data['track_artist'] == input_artist)]

    if not user_song.empty:
        # Predict the likelihood of liking for the dataset used in model training
        probabilities = model.predict_proba(X)[:, 1]
        
        # Update the main DataFrame with predicted likelihood
        data['predicted_likelihood'] = probabilities

        # Exclude the user's input song from the recommendations
        indices_to_exclude = user_song.index
        # Set probabilities to negative infinity for exclusion
        probabilities[indices_to_exclude] = -np.inf  

        # Calculate cosine similarity for the user's song
        user_song_features = user_song[features].values
        similarities = cosine_similarity(user_song_features, X).flatten()
        
        # Filter and sort the DataFrame based on 'predicted_likelihood'
        # Indices of songs sorted by decreasing likelihood
        sorted_indices = np.argsort(-probabilities)  
        # Choose top 10 song recommendations based on predicted likelihood
        top_similarities_indices = sorted_indices[:10]   
        
        
        top_recommendations = data.iloc[top_similarities_indices]
        top_recommendations['similarity'] = similarities[top_similarities_indices]  # Assign similarities to the top recommendations
        
        # Display the top 10 recommendations
        print("Top 10 recommended songs based on predicted likelihood of being 'liked':")
        print(top_recommendations[['track_name', 'track_artist', 'predicted_likelihood', 'similarity']])
    else:
        print("Sorry, the input song is not found in the database.")

In [89]:
recommendation(data,best_decision_tree,X)

Top 10 recommended songs based on predicted likelihood of being 'liked':
                              track_name          track_artist  \
18057               Barefoot In The Park           James Blake   
17620                          Lights On             Izzy Bizu   
11286         Sharing The Night Together              Dr. Hook   
11285  I Just Want To Be Your Everything             Andy Gibb   
11277                        Do It Again            Steely Dan   
15938                     Mírame - Remix            Nio Garcia   
11248               She's Always a Woman            Billy Joel   
5591                           Engineers               Hp Boyz   
11247                 Let Your Love Flow  The Bellamy Brothers   
11240                   Make It with You                 Bread   

       predicted_likelihood  similarity  
18057                   1.0    0.999998  
17620                   1.0    0.999999  
11286                   1.0    1.000000  
11285                   1.0    0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_recommendations['similarity'] = similarities[top_similarities_indices]  # Assign similarities to the top recommendations


Data without Outlier

In [80]:
# Import necessary library
from scipy.stats.mstats import winsorize

#It applies winsorization to the dataset, replacing outliers in 
#specified columns with values at the 5th and 95th percentiles.
# Define a function to winsorize outliers

def handle_outliers_winsorize(df):
    # Apply winsorization to each column containing outliers
    columns_with_outliers = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'duration_ms']
    for col in columns_with_outliers:
        df.loc[:, col] = winsorize(df[col], limits=[0.05, 0.05])  # Winsorize at the 5th and 95th percentiles
    return df

# Apply winsorization to your dataset
data_no_outliner = handle_outliers_winsorize(data)
data_no_outliner.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,release_year
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,-3.004,1,0.0583,0.102,0.0,0.0653,0.518,122.036,194754,2019
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,-4.969,1,0.0373,0.0724,0.00421,0.357,0.693,99.972,162600,2019
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,-3.432,0,0.0742,0.0794,2.3e-05,0.11,0.613,124.008,176616,2019
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,-3.778,1,0.102,0.0287,9e-06,0.204,0.277,121.956,169093,2019
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,dance pop,...,-4.672,1,0.0359,0.0803,0.0,0.0833,0.725,123.976,189052,2019


In [81]:
#Prepare the data
X2 = data_no_outliner[features]
y2 = data_no_outliner[target]


In [82]:
# Create a binary target based on 'track_popularity'
popularity_threshold = data_no_outliner['track_popularity'].quantile(0.75)
y2 = (data['track_popularity'] >= popularity_threshold).astype(int)

In [83]:
#Split the dataset into training and testing sets (80% for training, 20% for testing)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X2,y2, test_size=0.2, random_state=42)

In [84]:
#Create a new classifier object c
decision_tree_2 = DecisionTreeClassifier()

#Train the model
decision_tree_2.fit(X_train_2, y_train_2)

#Make predictions on the test set
y_predict_2 = decision_tree_2.predict(X_test_2)

# Calculating metrics
accuracy_2 = accuracy_score(y_test_2, y_predict_2)
precision_2 = precision_score(y_test_2, y_predict_2)
recall_2 = recall_score(y_test_2, y_predict_2)
f_score_2 = f1_score(y_test_2, y_predict_2)


print("Accuracy:", accuracy_2)
print("Precision:", precision_2)
print("Recall:", recall_2)
print("F-Score:", f_score_2)


Accuracy: 0.6591688905833015
Precision: 0.3380986418870622
Recall: 0.3543071161048689
F-Score: 0.346013167520117


In [85]:
#Define the parameter
param_grid = {
    'max_depth': [None,10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf':[1,2,4]
}
                    

grid_search_2 = GridSearchCV(estimator=decision_tree_2,param_grid=param_grid,cv=5,verbose=2)

grid_search_2.fit(X_train_2,y_train_2)
print("Best parameters: ", grid_search_2.best_params_)  

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5; total time=   0.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[C

In [86]:
best_params_2 = grid_search_2.best_params_

#Train a decision tree classifier with the best parameters
best_decision_tree_2 = DecisionTreeClassifier(**best_params_2)
best_decision_tree_2.fit(X_train_2,y_train_2)

new_y_pred_2 = best_decision_tree_2.predict(X_test_2)

In [87]:
# Calculating metrics
new_accuracy_2 = accuracy_score(y_test_2, new_y_pred_2)
new_precision_2 = precision_score(y_test_2, new_y_pred_2)
new_recall_2 = recall_score(y_test_2, new_y_pred_2)
new_f_score_2 = f1_score(y_test_2, new_y_pred_2)

print("Accuracy:", new_accuracy_2)
print("Precision:", new_precision_2)
print("Recall:", new_recall_2)
print("F-Score:", new_f_score_2)

Accuracy: 0.7310331681280976
Precision: 0.4263565891472868
Recall: 0.1647940074906367
F-Score: 0.23770934629929766
