In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [7]:
#load the dataset
df=pd.read_csv('spotify.csv')
df.head()

Unnamed: 0,id,name,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability,timestamp
0,4ZtFanR9U6ndgddUvNcjcG,Good 4 U Olivia Rodrigo,2.97,0.664,9,-5.044,1,0.154,0.335,0.0,0.0849,0.688,166.928,0.563,08:00:00
1,5fxyZf6m2xHeSrOzUfcJrq,Stay The Kid LAROI & Justin Bieber,2.3,0.506,8,-11.275,1,0.0589,0.379,0.868,0.11,0.454,170.054,0.564,08:00:00
2,5nujrmhLynf4yMoMtj8AQF,Levitating Dua Lipa feat. DaBaby,3.38,0.825,6,-3.787,0,0.0601,0.00883,0.0,0.0674,0.915,102.977,0.702,09:20:00
3,4iJyoBOLtHqaGxP12qzhQI,Peaches Justin Bieber feat. Daniel Caesar & Gi...,3.3,0.696,0,-6.181,1,0.119,0.321,0.0,0.42,0.464,90.03,0.677,10:20:00
4,1SC5rEoYDGUK4NfG82494W,Montero (Call Me By Your Name) Lil Nas X,2.3,0.503,8,-6.725,0,0.22,0.293,0.0,0.405,0.71,178.781,0.593,11:20:00


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                100 non-null    object 
 1   name              100 non-null    object 
 2   duration          100 non-null    float64
 3   energy            100 non-null    float64
 4   key               100 non-null    int64  
 5   loudness          100 non-null    float64
 6   mode              100 non-null    int64  
 7   speechiness       100 non-null    float64
 8   acousticness      100 non-null    float64
 9   instrumentalness  100 non-null    float64
 10  liveness          100 non-null    float64
 11  valence           100 non-null    float64
 12  tempo             100 non-null    float64
 13  danceability      100 non-null    float64
 14  timestamp         100 non-null    object 
dtypes: float64(10), int64(2), object(3)
memory usage: 11.8+ KB


In [9]:
df.isnull().sum()

Unnamed: 0,0
id,0
name,0
duration,0
energy,0
key,0
loudness,0
mode,0
speechiness,0
acousticness,0
instrumentalness,0


In [10]:
l_id = LabelEncoder()
df['n_id'] = l_id.fit_transform(df['id'])

In [11]:
l_name = LabelEncoder()
df['n_name'] = l_name.fit_transform(df['name'])

In [12]:
data = df.drop(['id', 'name'], axis='columns')

In [13]:
data.head()

Unnamed: 0,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability,timestamp,n_id,n_name
0,2.97,0.664,9,-5.044,1,0.154,0.335,0.0,0.0849,0.688,166.928,0.563,08:00:00,47,27
1,2.3,0.506,8,-11.275,1,0.0589,0.379,0.868,0.11,0.454,170.054,0.564,08:00:00,60,67
2,3.38,0.825,6,-3.787,0,0.0601,0.00883,0.0,0.0674,0.915,102.977,0.702,09:20:00,62,44
3,3.3,0.696,0,-6.181,1,0.119,0.321,0.0,0.42,0.464,90.03,0.677,10:20:00,51,55
4,2.3,0.503,8,-6.725,0,0.22,0.293,0.0,0.405,0.71,178.781,0.593,11:20:00,16,47


In [15]:
#covert timestamp into datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])
#extract time-based features
data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['month'] = data['timestamp'].dt.month

In [16]:
#calculate playcount
data['play_count'] = data.groupby(['n_id', 'n_name'])['timestamp'].transform('count')

In [17]:
#calculate repeated plays
data['repeated_plays'] = (data['play_count'] > 1).astype(int)

In [18]:
data.head(2)

Unnamed: 0,duration,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,danceability,timestamp,n_id,n_name,hour,day_of_week,month,play_count,repeated_plays
0,2.97,0.664,9,-5.044,1,0.154,0.335,0.0,0.0849,0.688,166.928,0.563,2025-01-23 08:00:00,47,27,8,3,1,1,0
1,2.3,0.506,8,-11.275,1,0.0589,0.379,0.868,0.11,0.454,170.054,0.564,2025-01-23 08:00:00,60,67,8,3,1,1,0


In [19]:
#select features and target
x = data[['n_id', 'n_name', 'duration', 'energy', 'key', 'loudness', 'mode', 'speechiness',
'instrumentalness', 'liveness', 'valence', 'tempo', 'danceability', 'hour', 'day_of_week',
'month', 'play_count']]
y = data['repeated_plays']

In [20]:
#scale numerical features
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [21]:
#seperate the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

In [22]:
#create and train the LOGISTIC REGRESSION MODEL
model = LogisticRegression()
model.fit(x_train, y_train)

In [23]:
y_pred = model.predict(x_test)
print("Model predictions:", y_pred)

Model predictions: [0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0]


In [24]:
accuracy=accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
