In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

data_path = Path("../../data").resolve()
os.makedirs(data_path, exist_ok=True)

tracks_df = pd.read_parquet(data_path / "tracks_v3.pq")
genres = tracks_df["artist_genres"].explode().rename("genre").dropna()

vectorizer = CountVectorizer(ngram_range=(1, 3))
vectorizer.fit_transform(genres)
bag_of_words = vectorizer.transform(genres)
sum_words = bag_of_words.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
words_freq_df = pd.DataFrame(words_freq, columns=["genre", "count"])
words_freq_df["freq"] = words_freq_df["count"] / words_freq_df["count"].sum()
words_freq_df = words_freq_df[
    (words_freq_df["genre"] != "hop")
    & (words_freq_df["genre"] != "hip")
]
words_freq_df["pop_genre"] = np.where(words_freq_df.index <= 23, 1, 0)
top_word = words_freq_df[words_freq_df["pop_genre"] == 1]["genre"].tolist()

df_ = tracks_df[["id", "artist_genres"]]
df_["artist_genres"] = df_["artist_genres"].apply(lambda g: "; ".join(list(g)) if g is not None else "other")
df_["selected_genre"] = "other"

for word in top_word:
    mask = df_["artist_genres"].str.contains(word)
    df_[word] = np.where(mask, 1, 0)
    df_["selected_genre"] = np.where(df_["artist_genres"].str.contains(word), word, df_["selected_genre"])

df_.to_csv(data_path / "track_genres.csv", index=False)
df_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["artist_genres"] = df_["artist_genres"].apply(lambda g: "; ".join(list(g)) if g is not None else "other")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["selected_genre"] = "other"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_[word] = np.where(mask, 1, 0)
A value is trying to be set on

Unnamed: 0,id,artist_genres,selected_genre,pop,rap,hip hop,dance,trap,rock,dance pop,...,contemporary,indie,latino,trap latino,post,alternative,edm,modern,teen,teen pop
0,10iesoGb4mCYTcur1QfdO9,francoton; rap marseille,rap,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0DlHQxrVijRqVUfuS6PKaY,funk carioca; funk das antigas; sertanejo pop;...,trap,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2QtM6ZVLx13TBOO1iwmXYy,other,other,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,27hFQQS3cVUmIK3ser5bpu,acoustic cover; viral pop,pop,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1YcqYmviKagJ68DOmjSWQW,czsk hip hop; czsk hip hop; slovak hip hop,hip hop,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14962,23CsjGOYDQiJpX44BSn9TM,latin; latin arena pop; latin pop; mexican pop...,trap latino,1,1,0,0,1,0,0,...,0,0,1,1,0,0,0,0,0,0
14963,6I6NX6tjGsxFAsIfGzY9lJ,conscious hip hop; hip hop; north carolina hip...,hip hop,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14964,41BzKiCrKpUQHS3TP0jmdr,latin; reggaeton; reggaeton colombiano,reggaeton,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14965,5V7mTIcXVU9k2JNu0mE6vy,neo mellow; piano rock; pixie; pop punk; pop rock,rock,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
