In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Build per-track genre indicator columns from the most frequent genre terms.
#
# Reads <data_path>/tracks.pq, counts 1- to 3-word terms over every artist
# genre string, keeps the highest-ranked terms, and writes one 0/1 column per
# kept term plus a "selected_genre" label for each track.
data_path = Path("../../data").resolve()
os.makedirs(data_path, exist_ok=True)

# One row per (track, genre) pair; tracks without any genre drop out here.
tracks_df = pd.read_parquet(data_path / "tracks.pq")
genres = tracks_df["artist_genres"].explode().rename("genre").dropna()

# Count every unigram, bigram and trigram across the genre strings.
# fit_transform already returns the document-term matrix, so a separate
# transform pass over the same corpus is unnecessary.
vectorizer = CountVectorizer(ngram_range=(1, 3))
bag_of_words = vectorizer.fit_transform(genres)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

# Frequency table, most common term first; "freq" is each term's share of
# the total term count (computed before the hip/hop rows are removed).
words_freq_df = pd.DataFrame(words_freq, columns=["genre", "count"])
words_freq_df["freq"] = words_freq_df["count"] / words_freq_df["count"].sum()

# Drop the standalone "hip" and "hop" terms. The explicit copy makes the
# column assignment below operate on an independent frame instead of a
# filtered view (avoids SettingWithCopyWarning).
words_freq_df = words_freq_df[~words_freq_df["genre"].isin(["hop", "hip"])].copy()

# The index is still the rank from the full sorted table (it is not reset
# after filtering), so this flags the terms ranked 0..23 that survived the
# filter above.
words_freq_df["pop_genre"] = np.where(words_freq_df.index <= 23, 1, 0)
top_word = words_freq_df.loc[words_freq_df["pop_genre"] == 1, "genre"].tolist()

# Work on a copy so the source frame is never touched and the assignments
# below are unambiguous.
df_ = tracks_df[["id", "artist_genres"]].copy()
# Flatten each genre list into a single "; "-separated string; tracks with a
# missing genre list get the placeholder "other".
df_["artist_genres"] = df_["artist_genres"].apply(
    lambda g: "; ".join(g) if g is not None else "other"
)
df_["selected_genre"] = "other"

for word in top_word:
    # Literal substring match, computed once per term. The terms come from
    # CountVectorizer's default tokenizer (word characters joined by single
    # spaces), so disabling regex does not change which rows match.
    mask = df_["artist_genres"].str.contains(word, regex=False)
    df_[word] = np.where(mask, 1, 0)
    # top_word is ordered most frequent first, so a later (less frequent)
    # matching term overwrites an earlier one.
    # NOTE(review): confirm that "last match wins" is the intended precedence.
    df_["selected_genre"] = np.where(mask, word, df_["selected_genre"])

# NOTE(review): the file is written in Parquet format although its name ends
# in ".csv". The name is kept unchanged because other code may read this exact
# path; rename it (or switch to to_csv) together with its consumers.
df_.to_parquet(data_path / "track_genres.csv")