In [3]:
import typing
import pandas as pd
import numpy as np

from gensim.parsing.preprocessing import remove_stopwords
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from matplotlib import pyplot as plt
import seaborn as sns

from typing import Tuple, List, Dict, Union

### Helper Functions

In [4]:
def convert_tuple_to_dict(tup: Tuple, di: Dict) -> Dict:
    for a, b in tup:
        di[a] = b
    return di

In [102]:
def group_similar_genres(
    choices: Union[List, pd.Series], 
    most_popular_list: Union[List, pd.Series], 
    treshold: float = 70
) -> pd.DataFrame:
    """
    Calculates fuzzy ratio between elements in lists
    """
    values = []
    
    for element in choices:
        if element == '':
            values.append('')
        else: 
            fuzzy_result = process.extract(element, most_popular_list, scorer=fuzz.ratio)
            
            _dict = {}
            fuzzy_dict = convert_tuple_to_dict(fuzzy_result, _dict)
            
            max_value_key = max(fuzzy_dict, key=fuzzy_dict.get)
            max_value_pair = fuzzy_dict[max_value_key]
            
            if max_value_pair >= treshold:
                values.append(max_value_key)
            else:
                values.append(element)
        
    similar_genre_df = pd.DataFrame(data = {'fuzzy_genre': values})
    return similar_genre_df



### Analysis

In [51]:
df = pd.read_csv('../complete_data.csv')
df.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,instrumentalness,liveness,valence,tempo,time_signature,followers,genre_artist,name_artist,popularity_artist,duration_mins
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],45tIt06XoI0Iio4LBEVpls,1922-02-22,0.645,0.445,...,0.744,0.151,0.127,104.851,3,91.0,[''],Uli,4.0,2.11505
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],14jtPCOoNZwquk5wd9DxrY,1922-06-01,0.695,0.263,...,0.0,0.148,0.655,102.009,1,3.0,[''],Fernando Pessoa,0.0,1.636667
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.434,0.177,...,0.0218,0.212,0.457,130.418,5,3528.0,"['tango', 'vintage tango']",Ignacio Corsini,23.0,3.027333
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],5LiOoJbxVSAMkBS2fUm3X2,1922-03-21,0.321,0.0946,...,0.918,0.104,0.397,169.98,3,3528.0,"['tango', 'vintage tango']",Ignacio Corsini,23.0,2.94845
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.402,0.158,...,0.13,0.311,0.196,103.22,4,11327.0,"['adult standards', 'big band', 'easy listenin...",Dick Haymes,35.0,2.718


In [52]:
def str_to_list(row):
    """convert a string List into a List"""
    row = str(row).strip("[]").replace("'","").split(", ")
    return row

In [53]:
genres = df[['id', 'genre_artist']]

In [54]:
genres['genre_artist'] = genres['genre_artist'].apply(str_to_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [55]:
genres_list = genres['genre_artist'].to_list()

In [56]:
flat_list = [item for sublist in genres_list for item in sublist]

In [57]:
exploded_genre = genres.explode('genre_artist')
exploded_genre['genre_artist'].value_counts()

                                 49168
rock                             31819
adult standards                  25755
classic rock                     23657
mellow gold                      19224
                                 ...  
deep neo-synthpop                    1
danish folk                          1
experimental jazz                    1
danish experimental                  1
chinese classical performance        1
Name: genre_artist, Length: 4516, dtype: int64

In [58]:
exploded_genre['genre_artist'].value_counts().nlargest(5000)


                                 49168
rock                             31819
adult standards                  25755
classic rock                     23657
mellow gold                      19224
                                 ...  
christian afrobeat                   1
georgian folk                        1
cornwall indie                       1
togolese pop                         1
chinese classical performance        1
Name: genre_artist, Length: 4516, dtype: int64

In [59]:
exploded_genre.describe()

Unnamed: 0,id,genre_artist
count,1937354,1937354.0
unique,586672,4516.0
top,2SpHd4lGMrJMIQDf92V6VP,
freq,21,49168.0


In [60]:
text_analysis = genres
text_analysis['genres'] = text_analysis['genre_artist'].apply(' '.join)
text_analysis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,genre_artist,genres
0,35iwgR4jXetI318WEWsa1Q,[],
1,021ht4sdgPcrDgSk7JTbKY,[],
2,07A5yehtSnoedViJAZkNnc,"[tango, vintage tango]",tango vintage tango
3,08FmqUhxtyLTn6pAh6bk45,"[tango, vintage tango]",tango vintage tango
4,08y9GfoqCWfOGsKdwojr5e,"[adult standards, big band, easy listening, lo...",adult standards big band easy listening lounge...
...,...,...,...
586667,5rgu12WBIHQtvej2MdHSH0,[chinese viral pop],chinese viral pop
586668,0NuWgxEp51CutD2pJoF4OM,"[alt z, alternative r&b, bedroom pop, indie ca...",alt z alternative r&b bedroom pop indie cafe p...
586669,27Y1N4Q4U3EfDU5Ubw8ws2,"[alt z, electropop, indie pop, la indie, pop, ...",alt z electropop indie pop la indie pop post-t...
586670,45XJsGpFTyzbzeWK8VzR8S,"[chill r&b, indie cafe pop, singaporean pop]",chill r&b indie cafe pop singaporean pop


In [61]:
# text_analysis = text_analysis.replace(r'', np.NaN)
# text_analysis


### Kmeans  approach

In [15]:
raw_text = text_analysis['genres']

In [16]:
stop_words_removed = [remove_stopwords(x) \
        .translate(str.maketrans('','',string.punctuation)) \
        .translate(str.maketrans('','',string.digits)) \
        for x in raw_text]
stop_words_removed_series = pd.DataFrame(data={'cleaned_genre': stop_words_removed})

In [17]:
vectorizer_ntf = TfidfVectorizer(ngram_range=(1,2), lowercase=True)
X_ntf = vectorizer_ntf.fit_transform(stop_words_removed_series['cleaned_genre'])


In [18]:
tfidf_df = pd.DataFrame(X_ntf.toarray(), columns=[i for i in vectorizer_ntf.get_feature_names_out()])
tfidf_df


Unnamed: 0,abc,abc paulista,abstract,abstract ambient,abstract bass,abstract beats,abstract big,abstract deep,abstract hip,abstract idm,...,zouk,zouk riddim,zouk zouk,zuliana,zuliana latin,zuliana musica,zuliana pop,zurich,zurich indie,zydeco
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
586670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# #Test increments of 100 clusters using elbow method
# sse={}
# for k in np.arange(100,300,50):
#     kmeans = KMeans(n_clusters=k, max_iter=500).fit(tfidf_df)
#     sse[k] = kmeans.inertia_
# plt.plot(list(sse.keys()),list(sse.values()))
# plt.xlabel('Values for K')
# plt.ylabel('SSE')
# plt.show();

In [20]:
# X_fuzz = pd.crosstab(
#    [stop_words_removed_series['cleaned_genre'].index, stop_words_removed_series['cleaned_genre']], 
#    stop_words_removed_series['cleaned_genre']
#    ).apply(lambda col: [fuzz.token_sort_ratio(col.name, x) for x in col.index.get_level_values(1)])

### Fuzzy approach

In [103]:
exploded_genre

Unnamed: 0,id,genre_artist
0,35iwgR4jXetI318WEWsa1Q,
1,021ht4sdgPcrDgSk7JTbKY,
2,07A5yehtSnoedViJAZkNnc,tango
2,07A5yehtSnoedViJAZkNnc,vintage tango
3,08FmqUhxtyLTn6pAh6bk45,tango
...,...,...
586670,45XJsGpFTyzbzeWK8VzR8S,indie cafe pop
586670,45XJsGpFTyzbzeWK8VzR8S,singaporean pop
586671,5Ocn6dZ3BJFPWh4ylwFXtn,chicha
586671,5Ocn6dZ3BJFPWh4ylwFXtn,cumbia


In [122]:
exploded_genre['genre_artist'].value_counts()

                                 49168
rock                             31819
adult standards                  25755
classic rock                     23657
mellow gold                      19224
                                 ...  
deep neo-synthpop                    1
danish folk                          1
experimental jazz                    1
danish experimental                  1
chinese classical performance        1
Name: genre_artist, Length: 4516, dtype: int64

In [148]:
most_listened_genres = exploded_genre['genre_artist'].value_counts().nlargest(1000)
most_listened_genres

                    49168
rock                31819
adult standards     25755
classic rock        23657
mellow gold         19224
                    ...  
minimalism            266
argentine punk        264
swedish idol pop      263
happy hardcore        263
dabke                 262
Name: genre_artist, Length: 1000, dtype: int64

In [42]:
text_analysis[text_analysis['genres'] == '']

Unnamed: 0,id,genre_artist,genres
0,35iwgR4jXetI318WEWsa1Q,[],
1,021ht4sdgPcrDgSk7JTbKY,[],
6,0Dd9ImXtAtGwsmsAD69KZT,[],
8,0IgI1UCz84pYeVetnl1lGP,[],
10,0OYGe21oScKJfanLyM7daU,[],
...,...,...,...
586584,4xQeWGPpOBConZoRCihn2t,[],
586607,1zmMf0f4YchxURhvY30L8g,[],
586617,3dVKdRbGbQ9KyuMpFsvrlu,[],
586624,4qDxTUIjmefJIQEJkd05cc,[],


In [104]:
text_analysis

Unnamed: 0,id,genre_artist,genres
0,35iwgR4jXetI318WEWsa1Q,[],
1,021ht4sdgPcrDgSk7JTbKY,[],
2,07A5yehtSnoedViJAZkNnc,"[tango, vintage tango]",tango vintage tango
3,08FmqUhxtyLTn6pAh6bk45,"[tango, vintage tango]",tango vintage tango
4,08y9GfoqCWfOGsKdwojr5e,"[adult standards, big band, easy listening, lo...",adult standards big band easy listening lounge...
...,...,...,...
586667,5rgu12WBIHQtvej2MdHSH0,[chinese viral pop],chinese viral pop
586668,0NuWgxEp51CutD2pJoF4OM,"[alt z, alternative r&b, bedroom pop, indie ca...",alt z alternative r&b bedroom pop indie cafe p...
586669,27Y1N4Q4U3EfDU5Ubw8ws2,"[alt z, electropop, indie pop, la indie, pop, ...",alt z electropop indie pop la indie pop post-t...
586670,45XJsGpFTyzbzeWK8VzR8S,"[chill r&b, indie cafe pop, singaporean pop]",chill r&b indie cafe pop singaporean pop


In [201]:
testing_rows = text_analysis['genre_artist']
most_popular_genres = most_listened_genres.index.to_list()[1:]

                    49168
rock                31819
adult standards     25755
classic rock        23657
mellow gold         19224
                    ...  
minimalism            266
argentine punk        264
swedish idol pop      263
happy hardcore        263
dabke                 262
Name: genre_artist, Length: 1000, dtype: int64

In [194]:
# testing_choices = text_analysis['genre_artist'][5]
testing_choices = text_analysis['genre_artist'][5]

most_popular_genres = most_listened_genres.index.to_list()[1:]
testing_choices

['adult standards', 'big band', 'easy listening', 'lounge', 'swing']

In [204]:
def group_similar_genres(
    choices: Union[List, pd.Series], 
    most_popular_list: Union[List, pd.Series], 
    treshold: float = 70
) -> pd.DataFrame:
    """
    Calculates fuzzy ratio between elements in lists
    """
    values = []
    
    for element in choices:
        
        if element == [''] or element == '':
            values.append([])
        else: 
            # print(element)
            fuzzy_result = process.extractBests(' '.join(element), most_popular_list, scorer=fuzz.ratio, limit=1)
            # print('fuzzy_result: ',fuzzy_result)
            # _dict = {}
            # fuzzy_dict = convert_tuple_to_dict(fuzzy_result, _dict)
            # max_value_key = max(fuzzy_dict, key=fuzzy_dict.get)
            # max_value_pair = fuzzy_dict[max_value_key]
            
            max_value_key = fuzzy_result[0][0]
            max_value_pair = fuzzy_result[0][1]
            
            if max_value_pair >= treshold:
                values.append(max_value_key)
            else:
                values.append(', '.join(element))
        # print(values)
        
    similar_genre_df = pd.DataFrame(data = {'fuzzy_genre': values})
    return similar_genre_df

# lol = [['adult standards', 'big band'], ['adult standards', 'big band', 'easy listening', 'lounge', 'swing']]
# group_similar_genres(lol, most_popular_genres, 60)

In [205]:
grouped_genred_df = group_similar_genres(testing_rows, most_popular_genres, treshold=50)

In [233]:
exploded_na_fuzzy = grouped_genred_df.explode('fuzzy_genre')
exploded_na_fuzzy.isna().sum()

fuzzy_genre    49168
dtype: int64

In [241]:
exploded_na_fuzzy.value_counts()

fuzzy_genre                                                                              
nan                                                                                          11875
hoerspiel                                                                                     8198
adult standards                                                                               6420
classic bollywood                                                                             6092
classic italian pop                                                                           4907
                                                                                             ...  
dirty south rap, gangster rap, hip hop, houston rap, pop rap, rap, southern hip hop, trap        1
german post-rock, instrumental post-rock, post-metal, post-rock                                  1
latin alternative, mexican rock, reggae en espanol, reggae mexicano                              1
latin alternative, 

In [239]:
exploded_fuzy = exploded_na_fuzzy[exploded_na_fuzzy.isna() == True]
exploded_fuzy

Unnamed: 0,fuzzy_genre
0,
1,
2,
3,
4,
...,...
586667,
586668,
586669,
586670,


In [95]:
# grouped_genred_df.to_csv('fuzzy_genre_test.csv', index=False)

In [116]:
fuzzy_list = grouped_genred_df['fuzzy_genre'].to_list()

In [117]:
flat_fuzzy_list = [item for sublist in genres_list for item in sublist]

In [119]:
flat_fuzzy_df = pd.DataFrame(data={'flat_fuzzy_genres': flat_fuzzy_list})

In [124]:
flat_fuzzy_df.value_counts()

flat_fuzzy_genres   
                        49168
rock                    31819
adult standards         25755
classic rock            23657
mellow gold             19224
                        ...  
dub brasileiro              1
bern indie                  1
bible                       1
classical percussion        1
singing bowl                1
Length: 4516, dtype: int64