In [12]:
import pandas as pd
import pickle
from fpgrowth_py import fpgrowth
from pathlib import Path
from datetime import datetime
import os

# Runtime configuration. Every value can be overridden through an environment
# variable so the notebook can be re-run against another dataset or output path.
DATASET_NAME = os.getenv("DATASET_NAME", "ds1")
DATASET_FILE_PATH = os.getenv("DATASET_FILE_PATH", "data/2023_spotify_ds1.csv")
MODEL_PATH = os.getenv("MODEL_PATH", "models/model.pkl")
# os.getenv returns a str whenever the variable is set, so the numeric
# thresholds are converted explicitly; without the cast an override would be
# passed on to fpgrowth as a string instead of a number.
MIN_SUP_RATIO = float(os.getenv("MIN_SUP_RATIO", "0.04"))
MIN_CONF = float(os.getenv("MIN_CONF", "0.01"))

# Resolve the output location and create its parent directory before training.
MODEL_FILE = Path(MODEL_PATH)
MODEL_FILE.parent.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATASET_FILE_PATH)

# One transaction per playlist id ("pid"): the distinct track names it contains.
playlists_songs = (
    df.groupby("pid")["track_name"]
    .apply(lambda tracks: [*set(tracks)])
    .tolist()
)

# Mine frequent itemsets and association rules from the playlist transactions.
freq_items_set, rules = fpgrowth(
    playlists_songs,
    minSupRatio=MIN_SUP_RATIO,
    minConf=MIN_CONF,
)

# Persist the mined model together with a creation timestamp and dataset name.
with MODEL_FILE.open("wb") as f:
    pickle.dump(
        {
            "freq": freq_items_set,
            "rules": rules,
            "datetime": datetime.now().isoformat(),
            "dataset": DATASET_NAME,
        },
        f,
    )

print(f"Model saved in {MODEL_FILE}")

Model saved in models/model.pkl


In [2]:
# Number of association rules returned by the fpgrowth call above.
len(rules)

55880

In [13]:
def recommend(rules: list, songs: list[str], k=10) -> list[str]:
    """Rank songs suggested by the association rules that match ``songs``.

    A rule applies when every item of its antecedent is present in ``songs``.
    Each consequent item of an applicable rule that is not already in
    ``songs`` accumulates that rule's confidence. Candidates are returned in
    descending order of accumulated confidence.

    Args:
        rules: Iterable of ``(antecedent, consequent, confidence)`` triples.
        songs: Track names already known for the listener.
        k: Maximum number of recommendations to return.

    Returns:
        Up to ``k`` track names, highest accumulated confidence first. The
        list is empty when no rule applies.
    """
    # Build the lookup set once: the rule loop then does O(1) membership tests
    # instead of re-scanning ``songs`` for every rule, and a one-shot iterable
    # passed as ``songs`` is not exhausted by the first rule.
    known = set(songs)
    scores: dict[str, float] = {}

    for antecedent, consequent, confidence in rules:
        if known.issuperset(antecedent):
            for candidate in consequent:
                if candidate not in known:
                    scores[candidate] = scores.get(candidate, 0) + confidence

    # sorted() is stable, so equally scored candidates keep insertion order.
    return sorted(scores, key=scores.get, reverse=True)[:k]

In [14]:
# Example query: recommendations for a listener seeded with a single track.
recommend(rules, ["Back To Back"])

['Jumpman',
 'No Role Modelz',
 'Hotline Bling',
 'Energy',
 'White Iverson',
 'One Dance',
 'Antidote',
 'Broccoli (feat. Lil Yachty)',
 'Big Rings',
 '679 (feat. Remy Boyz)']