In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
from string import punctuation

In [3]:
pd.set_option('display.max_rows', 100)

In [4]:
wd = pd.read_csv('./WikiData/musicalgroup_genre.csv',usecols=[1,3], header=0, names=['artist','genre'])

wd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55139 entries, 0 to 55138
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  55139 non-null  object
 1   genre   55139 non-null  object
dtypes: object(2)
memory usage: 861.7+ KB


In [5]:
ts = pd.read_csv('./trebi/csv/songs.csv',
                 sep=';',
                 usecols=[0,2,4],
                 header=0,
                 names=['songID','artist','genre'])
ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579878 entries, 0 to 579877
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   songID  579878 non-null  object
 1   artist  579878 non-null  object
 2   genre   579878 non-null  object
dtypes: object(3)
memory usage: 13.3+ MB


In [6]:
song_to_artist = dict(ts[['songID','artist']].values)

In [7]:
tt = pd.read_csv('./trebi/csv/tags.csv', sep=';', usecols=[0,1], names=['songID','genre'])
tt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1979394 entries, 0 to 1979393
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   songID  object
 1   genre   object
dtypes: object(2)
memory usage: 30.2+ MB


## combine & tidy

In [203]:
gg = pd.concat([wd,ts,tt])
gg['genre'] = gg['genre'].str.strip().str.casefold()
gg = gg.drop_duplicates()

In [204]:
gg.nunique()

artist    193411
genre       2308
songID    203844
dtype: int64

In [205]:
demonyms = ['anatolian', 'argentine', 'basque',
            'boston', 'brazilian', 'bulgarian', 'catalan',
            'chilean', 'colombian', 'czech', 'dutch',
            'french', 'hungarian', 'indian',
            'kiwi', 'kraut', 'norwegian', 'peruvian', 'portuguese',
            'romanian', 'russian', 'scottish', 'slovenian', 'spanish',
            'suomi', 'swiss', 'swedish', 'turkish', 'ukrainian', 'venezuelan',
            'welsh', 'yugoslav','slovak','estonian','icelandic',
            'indonesian', 'albuquerque', 'canadian', 'la',
            'vancouver', 'australian', 'perth', 'german', 'brooklyn',
            'vienna', 'portland', 'vegas', 'mexican', 'sheffield',
            'ok', 'rva', 'athens', 'belgian',
            'slc', 'finnish', 'chicago', 'polish', 'seattle',
            'leeds', 'nz', 'thai', 'louisville', 'danish',
            'denver', 'michigan', 'stl', 'kc', 'triangle',
            'greek', 'irish', 'bay area', 'thai', 'pakistani',
            'polynesian', 'latvian', 'arab', 'korean', 'brazilian',
            'taiwanese', 'faroese', 'austrian', 'persian', 'chinese',
            'italian', 'american', 'puerto rican', 'detroit', 'louisiana',
            'new orleans', 'uk'
           ]

In [206]:
for nym in demonyms:
    mask = gg['genre'].str.contains(nym,na=False)
    match = gg.loc[mask,'genre'].str.extract('(.*)('+nym+'[ -])(.*)')
    gg.loc[mask,'genre'] = (match[0]+match[2]).str.strip()

In [207]:
gg.nunique()

artist    193411
genre       1835
songID    203844
dtype: int64

In [208]:
mask = gg['genre'].str.contains("music$",na=False)

gg.loc[mask,'genre'] = gg.loc[mask,'genre'].str.extract("(.*)([ -]music$)")[0]

In [209]:
gg.nunique()

artist    193411
genre       1777
songID    203844
dtype: int64

In [210]:
hyphens = gg['genre'].str.contains("-", na=False)
nohy = gg.loc[~hyphens,'genre'].values
mask = gg['genre'].str.replace("-"," ").isin(nohy)
gg.loc[hyphens&mask,'genre'] = gg.loc[hyphens&mask,'genre'].str.replace("-"," ")

In [211]:
gg['genre'] = gg['genre'].str.replace("hip hop", "hip-hop")
gg['genre'] = gg['genre'].str.replace("rock steady", "rocksteady")


In [212]:
gg.nunique()

artist    193411
genre       1762
songID    203844
dtype: int64

In [213]:
gg.loc[gg['genre'].str.contains('(?<!deep)[ ]indie$',na=False),'genre'] = 'indie'

gg.loc[gg['genre'].notna()&gg['genre'].str.contains(r'electronic dance'), 'genre'] = 'edm'

gg.loc[gg['genre'].str.contains('[ -]singer[- ]songwriter',na=False),'genre'] = 'singer-songwriter'

In [214]:
gg.nunique()

artist    193411
genre       1755
songID    203844
dtype: int64

In [215]:
letters = '[^\s'+punctuation.replace('&','')+']+'
word = letters+'-?'+letters

In [216]:
tlg_mask = gg['genre'].str.fullmatch(letters+'-?'+letters,na=False)
tlgs = gg.loc[tlg_mask,'genre'].unique()
tlg_counts = gg.loc[tlg_mask,'genre'].value_counts()

In [217]:
sg1_mask = gg['genre'].str.fullmatch(word+' '+word,na=False)
sg1s = gg.loc[sg1_mask,'genre'].unique()
sg1_counts = gg.loc[sg1_mask,'genre'].value_counts()

In [218]:
sg2_mask = gg['genre'].str.fullmatch(word+' '+word+' '+word,na=False)
sg2s = gg.loc[sg2_mask,'genre'].unique()
sg2_counts = gg.loc[sg2_mask,'genre'].value_counts()

In [219]:
tlg_counts

rock          13161
pop           10533
indie          9049
hip-hop        6307
jazz           3500
              ...  
percussion        1
ragamuffin        1
plena             1
q4444866          1
p-pop             1
Name: genre, Length: 718, dtype: int64

In [220]:
sg1_counts

alternative rock    4176
punk rock           3729
indie rock          3250
pop rock            2088
indie pop           1882
                    ... 
free tekno             1
cumbia amazónica       1
ritual ambient         1
moroccan chaabi        1
heavy psych            1
Name: genre, Length: 803, dtype: int64

In [221]:
sg2_counts

melodic death metal      576
deep indie pop           493
rhythm and blues         474
technical death metal    338
deep acoustic pop        324
                        ... 
awa dance festival         1
canti del maggio           1
florida death metal        1
music of morocco           1
scrumpy and western        1
Name: genre, Length: 198, dtype: int64

In [222]:
gg.loc[
    gg['artist'].isna()
    &gg['songID'].isin(song_to_artist.keys()),
    'artist'
] = gg.loc[
    gg['artist'].isna()
    &gg['songID'].isin(song_to_artist.keys()),
    'songID'].replace(song_to_artist)

gg = gg.drop_duplicates(['artist','genre'])

In [230]:
gg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 299259 entries, 0 to 989697
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   artist  299257 non-null  object
 1   genre   277549 non-null  object
 2   songID  244313 non-null  object
dtypes: object(3)
memory usage: 9.1+ MB


In [244]:
wd.loc[wd['artist'].str.contains('Brown')]

Unnamed: 0,artist,genre
459,Zac Brown Band,country music
831,The Crazy World of Arthur Brown,psychedelic rock
1241,"Ray, Goodman & Brown",rhythm and blues
4267,Buster Brown,pop
4268,Buster Brown,hard rock
5059,Blue King Brown,reggae
5948,Charlie Brown Jr.,reggae
5949,Charlie Brown Jr.,hardcore punk
5950,Charlie Brown Jr.,alternative rock
5951,Charlie Brown Jr.,ska


In [224]:
gc = gg.melt(['genre'],
             value_name='song/artist'
            ).drop('variable',
                   axis=1
                  ).value_counts('genre'
                                ).reset_index().rename(columns={0:'count'})

gc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1755 entries, 0 to 1754
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genre   1755 non-null   object
 1   count   1755 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.5+ KB


In [235]:
gc.head(100)

Unnamed: 0,genre,count
0,rock,22606
1,pop,17356
2,indie,14796
3,hip-hop,11434
4,alternative rock,7834
5,punk rock,7458
6,jazz,6370
7,indie rock,6014
8,folk,5352
9,punk,4180
