In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import mlflow
from mlflow import log_metric, log_param, log_artifact
import mlflow.keras
    

POPULARITY PREDICTION SYSTEM

In [None]:

# Group every run from this notebook under one named MLflow experiment
mlflow.set_experiment('Spotify Popularity Prediction')

# Read the track table from the CSV next to the notebook
spotify_data_path = 'sorted_spotify_data.csv'
spotify_data = pd.read_csv(spotify_data_path)

# Build the binary classification target:
# 1 when 'popularity' is strictly greater than 50, otherwise 0
popularity_mask = spotify_data['popularity'].gt(50)
spotify_data['high_popularity'] = popularity_mask.astype(int)
    

In [None]:

# Model inputs: two numeric audio descriptors, kept deliberately small
features = ['danceability', 'energy']

# Feature matrix (all rows, only the chosen columns) and the binary target
X = spotify_data.loc[:, features]
y = spotify_data.loc[:, 'high_popularity']
    

In [None]:

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/variance of the held-out rows leak into the training features and make
# the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any cell that still
# refers to it; it is produced with the train-fitted scaler.
X_scaled = scaler.transform(X)

# Convert the 0/1 labels to two-column one-hot vectors for the softmax output
y_train_categorical = to_categorical(y_train, num_classes=2)
y_test_categorical = to_categorical(y_test, num_classes=2)
    

In [None]:

# Hyperparameters are declared once so that the network that gets built and
# the values recorded in MLflow cannot drift apart when one of them is edited.
HIDDEN_UNITS = (64, 32)
OPTIMIZER = 'adam'
EPOCHS = 50
BATCH_SIZE = 32

# Start an MLflow run; everything logged below is attached to this run
with mlflow.start_run():
    # Two hidden ReLU layers followed by a 2-way softmax (one unit per class)
    model = Sequential([
        Dense(HIDDEN_UNITS[0], input_shape=(X_train.shape[1],), activation='relu'),
        Dense(HIDDEN_UNITS[1], activation='relu'),
        Dense(2, activation='softmax')
    ])

    # categorical_crossentropy matches the one-hot encoded labels
    model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])

    # Log model parameters ("num_layers" counts the hidden layers)
    log_param("num_layers", len(HIDDEN_UNITS))
    log_param("num_units_layer1", HIDDEN_UNITS[0])
    log_param("num_units_layer2", HIDDEN_UNITS[1])
    log_param("optimizer", OPTIMIZER)
    # Training-loop settings are needed to reproduce the run
    log_param("epochs", EPOCHS)
    log_param("batch_size", BATCH_SIZE)

    # Train the model; `history.history` holds one value per epoch
    history = model.fit(
        X_train, y_train_categorical, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1
    )

    # Log training metrics. NOTE: these are the best values seen over all
    # epochs (max accuracy / min loss), not the final-epoch values.
    log_metric("train_accuracy", max(history.history['accuracy']))
    log_metric("train_loss", min(history.history['loss']))

    # Evaluate on the held-out split; results is [loss, accuracy] in the
    # order given to model.compile
    results = model.evaluate(X_test, y_test_categorical)
    log_metric("test_loss", results[0])
    log_metric("test_accuracy", results[1])

    # Store the trained model as a run artifact under the path "model"
    mlflow.keras.log_model(model, "model")
    print(results)

RECOMMENDATION SYSTEM

In [2]:
import pandas as pd

# CSV holding per-track user feedback together with the song metadata
file_path = 'recommended_tracks.csv'  # Update this path
data = pd.read_csv(file_path)

# Preview the leading rows to check which columns are available
preview = data.head()
print(preview)

                                          track_name  popularity  \
0                                       Cruel Summer         100   
1                Seven (feat. Latto) (Explicit Ver.)          99   
2                                               LALA          99   
3  What Was I Made For? [From The Motion Picture ...          98   
4            Dance The Night - From Barbie The Album          97   

     artists_names                        artist_genres  danceability  energy  \
0     Taylor Swift                                  pop         0.552  0.7020   
1  Jung Kook;Latto                                k-pop         0.802  0.8320   
2      Myke Towers  reggaeton;trap latino;urbano latino         0.708  0.7370   
3    Billie Eilish               art pop;electropop;pop         0.444  0.0911   
4         Dua Lipa                 dance pop;pop;uk pop         0.671  0.8450   

     liked  
0     like  
1  dislike  
2     like  
3     like  
4     like  


In [None]:

def get_disliked_artists(df):
    """Return the unique artist entries of every track the user disliked.

    The 'liked' column is accepted in either of two encodings:

    * numeric, where ``0`` marks a dislike, or
    * textual, where the label ``'dislike'`` marks a dislike (compared
      case-insensitively, ignoring surrounding whitespace). This is the
      encoding visible in the preview of 'recommended_tracks.csv'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'liked' and 'artists_names'.

    Returns
    -------
    list
        Distinct 'artists_names' values of the disliked rows, in order of
        first appearance. Empty when nothing is disliked. Values are returned
        exactly as stored, so a multi-artist cell such as 'Jung Kook;Latto'
        is one entry.
    """
    liked = df['liked']

    # Numeric encoding: 0 means dislike
    disliked_mask = liked == 0

    # Textual encoding: normalise the label before comparing so that
    # ' Dislike' or 'DISLIKE' are recognised as well
    labels = liked.astype(str).str.strip().str.lower()
    disliked_mask = disliked_mask | (labels == 'dislike')

    return df.loc[disliked_mask, 'artists_names'].unique().tolist()


In [None]:

def recommend_songs(song_index, cosine_sim, df_preprocessed, disliked_artists, num_recommendations=5):
    """Return the names of the tracks most similar to a seed track.

    Candidates are ranked by descending similarity to the seed. The seed
    track itself and every track whose 'artists_names' value is in
    ``disliked_artists`` are skipped.

    The seed is excluded by its index rather than by assuming it is ranked
    first. That assumption fails when the seed's own artist is disliked (the
    seed is already filtered out, so the best real match would be dropped)
    and when another track ties with the seed at the top of the ranking.

    Parameters
    ----------
    song_index : int
        Positional index of the seed track; used both to pick the row of
        ``cosine_sim`` and as a position in ``df_preprocessed``.
    cosine_sim : indexable of indexables
        ``cosine_sim[song_index]`` must yield one similarity score per row of
        ``df_preprocessed``, in the same positional order.
    df_preprocessed : pandas.DataFrame
        Must contain the columns 'artists_names' and 'track_name'.
    disliked_artists : iterable
        Hashable 'artists_names' values to exclude. Matching is exact on the
        whole cell value.
    num_recommendations : int, default 5
        Maximum number of track names to return; values <= 0 give ``[]``.

    Returns
    -------
    list
        Up to ``num_recommendations`` track names, most similar first.
    """
    # Set membership is O(1); the original list lookup was O(len(list)) per row
    blocked_artists = set(disliked_artists)

    # Pull the artist column out once instead of building a row Series via
    # .iloc for every candidate
    artists_by_position = df_preprocessed['artists_names'].tolist()

    # (position, score) pairs, best match first; sorted() is stable, so tied
    # scores keep their original positional order
    ranked = sorted(enumerate(cosine_sim[song_index]), key=lambda pair: pair[1], reverse=True)

    # Skip the seed track explicitly and anything by a disliked artist
    candidate_positions = [
        position
        for position, _score in ranked
        if position != song_index and artists_by_position[position] not in blocked_artists
    ]

    top_positions = candidate_positions[:max(num_recommendations, 0)]

    return df_preprocessed['track_name'].iloc[top_positions].tolist()
