# 0. Libraries

In [14]:
import pandas as pd
import csv
import json
import ml_library as ml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn's ConvergenceWarning. The filter is installed
# process-wide, so it applies to every cell run after this one.
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# 1. Loading the data

In [15]:
# Convert the source CSV to JSON, then load that JSON into a DataFrame.
csv_file_path = 'data/spotify-2024.csv'
json_file_path = 'data/sample_input.json'

# newline='' hands line-ending handling to the csv module, as its
# documentation requires, so quoted fields that contain embedded newlines
# are parsed correctly.
with open(csv_file_path, 'r', encoding='latin-1', newline='') as csv_file:
    # DictReader maps each row to a dict keyed by the header row.
    rows = list(csv.DictReader(csv_file))

# Write the rows out as a JSON array of objects.
with open(json_file_path, 'w') as json_file:
    json.dump(rows, json_file, indent=4)

print(f"CSV converted to JSON and saved to {json_file_path}")

# Read back the file that was just written, reusing the same path variable so
# the writer and the reader cannot drift apart. read_json already returns a
# DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
data = pd.read_json(json_file_path)

print("json loaded")

CSV converted to JSON and saved to data/sample_input.json
json loaded


In [16]:
# Display the loaded DataFrame (the rendered output elides the middle rows and columns).
data

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,...,684,62,17598718,114,18004655,22931,4818457,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,...,3,67,10422430,111,7780028,28444,6623075,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,...,536,136,36321847,172,5022621,5639,7208651,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,...,2182,264,24684248,210,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,...,1,82,17660624,105,4493884,7006,207179,457017,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,For the Last Time,For the Last Time,$uicideboy$,9/5/2017,QM8DG1703420,4585,19.4,305049963,65770,5103054,...,,2,14217,,20104066,13184,50633006,656337,,1
4596,Dil Meri Na Sune,"Dil Meri Na Sune (From ""Genius"")",Atif Aslam,7/27/2018,INT101800122,4575,19.4,52282360,4602,1449767,...,,1,927,,,,,193590,,0
4597,Grace (feat. 42 Dugg),My Turn,Lil Baby,2/28/2020,USUG12000043,4571,19.4,189972685,72066,6704802,...,,1,74,6,84426740,28999,,1135998,,1
4598,Nashe Si Chadh Gayi,November Top 10 Songs,Arijit Singh,11/8/2016,INY091600067,4591,19.4,145467020,14037,7387064,...,,,,7,6817840,,,448292,,0


In [17]:
# Select the 'TikTok Posts' entry of the row(s) whose ISRC matches the given code,
# using one boolean-mask .loc lookup instead of two chained selections.
data.loc[data['ISRC'] == 'FRX282477628', 'TikTok Posts']

385    
Name: TikTok Posts, dtype: object

In [18]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4600 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4600 non-null   object 
 8   Spotify Playlist Count      4600 non-null   object 
 9   Spotify Playlist Reach      4600 non-null   object 
 10  Spotify Popularity          4600 non-null   object 
 11  YouTube Views               4600 non-null   object 
 12  YouTube Likes               4600 non-null   object 
 13  TikTok Posts                4600 

# 2. Data splitting
Here, we split the data as an example for the hyperparameter tuning and prediction sections below.

In [19]:
# Label column; every other column of `data` is passed on as a feature.
target = 'Explicit Track'
target_data = data[target]
features = data.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_data,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Display the training features (the target column was dropped before the split).
X_train

Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,...,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity
1898,"90-90 Nabbe Nabbe (From ""Jatt Nuu Chudail Takri"")","90-90 Nabbe Nabbe (From ""Jatt Nuu Chudail Takri"")",Gippy Grewal,2/20/2024,INM432400019,1892,33.4,6576855,307,62869,...,91,,,,,,,,43588,
1370,For My Hand (feat. Ed Sheeran),"Love, Damini",Burna Boy,7/1/2022,USAT22204903,1366,40.1,264048274,62386,29271052,...,30156,,13,83804,27,5182492,1615,,2763102,
3038,"She Don't Know (From ""Blessed"")","She Don't Know (From ""Blessed"")",Millind Gaba,1/9/2019,INS181900027,3030,25.2,32433183,3830,877033,...,1,,,,,,,,200459,
2361,Je M'appelle,Je M'appelle,Benzz,4/21/2022,GBUM72202219,2358,29.4,231745345,83611,6994071,...,576,,6,18881,7,12900,99,,1068455,
156,Last Night,3 Songs At A Time Sampler,Morgan Wallen,1/31/2023,USUG12300802,157,121.9,922010059,107189,61174155,...,18364,1846,7,33712,38,307115852,200650,13022144,2665347,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,ýýýH+3+ýýýýýýý7luCJIo0T6...,ýýýH+3+ýýýýýýý7luCJIo0T6...,vyrval,2/7/2024,US3DF2408834,4404,19.9,27480893,5435,4137392,...,,,,,,8215,44,,452466,
466,HUMBLE.,HUMBLE.,Kendrick Lamar,3/31/2017,USUM71703085,466,71.4,2226869580,311766,77191099,...,84979,692,111,657065,94,562540114,476433,,7692756,
3092,Chasing That Feeling,The Name Chapter: FREEFALL,TOMORROW X TOGETHER,10/13/2023,USA2P2342340,3089,24.9,76642740,6264,8512117,...,907,31,8,237408,19,410616,131,,142567,
3772,All By Myself,All By Myself,Alok,10/7/2022,DEE862201655,3764,22.1,153478699,23576,35626223,...,157225,84,74,2068935,18,106577,65,45258,1187709,


# 3. Randomized search CV and prediction with optimal hyperparameters
The preprocessing is included in each cross-validation fold that is created for each hyperparameter combination.

In [21]:
# Create the individual preprocessing steps
nan_remover = ml.NanRemover()
integer_transformer = ml.IntegerTransformer()
standardizer = ml.Standardizer()

# Stored under its own name so it no longer overwrites the `features`
# DataFrame created in the data-splitting cell.
# NOTE(review): this step is instantiated but is not in the step list
# below — confirm whether it should be part of the preprocessing.
feature_engineering = ml.FeatureEngineering()

# Chain the preprocessing steps in the order they are listed
preprocessing_pipeline = ml.PreprocessingPipeline(
    [nan_remover, integer_transformer, standardizer]
)

# Create model (can be any model that supports fit and predict)
logreg = LogisticRegression(max_iter=5000,
                            penalty='l2',
                            fit_intercept=True)

# Combine preprocessing and model; `pipeline` is bound only once, to the
# final object that the later cells fit, tune and predict with.
pipeline = ml.MyPipeline(model=logreg, preprocessing=preprocessing_pipeline)

In [22]:
# Baseline: fit with the initial hyperparameters, then report the test-set
# predictions and the model parameters that were in effect.
pipeline.fit(X_train, y_train)

print('\nPrediction before hyperparameter tuning:\n')
baseline_predictions = pipeline.predict(X_test)
print(baseline_predictions)

print('\n Hyperparameters used for the prediction:\n')
baseline_params = pipeline.model.get_params()
print(baseline_params)


Prediction before hyperparameter tuning:

[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1
 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1
 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0
 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [23]:
# Hyperparameter tuning (could be implemented with RandomizedSearch or GridSearch).
# Ten candidate values of C are drawn; scipy's uniform(loc, scale) spans
# [loc, loc + scale], i.e. [0.01, 10.01] here.
search_strategy = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions={'model__C': uniform(loc=0.01, scale=10)},
    n_iter=10,
    random_state=0,
)
pipeline.tune(X=X_train, y=y_train, strategy=search_strategy)

The model has been tuned with the optimal hyperparameters.


{'C': 8.927730007820797}

In [24]:
# Prediction AFTER tuning: refit on the training data and report the
# test-set predictions together with the model's current hyperparameters.
pipeline.fit(X_train, y_train)
# The header previously said "before", copied from the pre-tuning cell.
print('\nPrediction after hyperparameter tuning:\n')
print(pipeline.predict(X_test))
print('\n (Optimal) Hyperparameters used for the prediction:\n')
print(pipeline.model.get_params())


Prediction before hyperparameter tuning:

[0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0
 0 0 0 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0
 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1
 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1
 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0
 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 

This notebook has no separate step that evaluates the model's performance on a set of metrics, because that evaluation is already implemented inside the tuning step.