In [2]:
import pandas as pd
import json
from collections import defaultdict

In [18]:
# MusicBrainz entity types summarised in this notebook.
# 'release' stands for an album; 'release_group' is not included.
MUSIC_BRAINZ_ENTITY_TYPE = [
    'artist',
    'release',
    'genre',
    'instrument',
]

# Corpus identifiers; each one is interpolated into the corpus and
# frequency-table file paths used further down.
CORPUS_TYPE = [
    'wiki',
    'music',
]

In [4]:
# Build one summary row per MusicBrainz entity type. For every corpus we read
# the per-entity frequency table and record:
#   <corpus>_num  - number of rows whose 'frequency' is greater than zero
#   <corpus>_freq - sum of the 'frequency' column
# plus a single 'musicbrainz_vocab' column holding the row count of the table.
summary_rows = []
for entity in MUSIC_BRAINZ_ENTITY_TYPE:
    row = {}
    # Tracked explicitly per entity instead of reading `freq_df` after the
    # inner loop, so a stale frame from a previous entity can never be used.
    vocab_size = None
    for corpus_type in CORPUS_TYPE:
        # entity frequency
        freq_csv_path = f'{corpus_type}_corpus/musicbrainz_{entity}_{corpus_type}_corpus.csv'
        freq_df = pd.read_csv(freq_csv_path)

        row[f'{corpus_type}_num'] = int((freq_df['frequency'] > 0).sum())
        row[f'{corpus_type}_freq'] = freq_df['frequency'].sum()

        # The value from the last corpus wins, as before.
        # NOTE(review): this assumes every per-corpus CSV of an entity lists
        # the same set of entities (same row count) - confirm.
        vocab_size = len(freq_df['num_of_docs_occured'])

    row['musicbrainz_vocab'] = vocab_size
    summary_rows.append(row)

# Construct the frame once from the collected records rather than enlarging
# it cell by cell with `.loc`.
sum_df = pd.DataFrame(
    summary_rows,
    index=pd.Index(MUSIC_BRAINZ_ENTITY_TYPE, name='entity_type'),
)

In [5]:
# Render numbers with thousands separators and no decimals, then show the
# summary table with every column cast to float.
pd.set_option('display.float_format', '{:,.0f}'.format)
sum_df = sum_df.astype(float)
sum_df

Unnamed: 0_level_0,wiki_num,wiki_freq,music_num,music_freq,musicbrainz_vocab
entity_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
artist,937167,17241299062,781830,5425560169,2008511
release,745390,23814193888,680993,7495287114,2348437
genre,1556,42676598,1581,21110527,1808
instrument,942,50004721,931,17595725,1019


In [6]:
# sum_df.astype(int).to_csv('entity_sum.csv')

In [14]:
class Corpus:
    """One JSONL corpus loaded fully into memory, plus its token count.

    The corpus is read from ``../dataset/{corpus_type}_corpus.jsonl``; every
    non-blank line is parsed as one JSON record. Each record must provide a
    string under the ``'text'`` key.

    Attributes set by ``__init__``:
        corpus_type: the identifier passed in by the caller.
        corpus: list of parsed records, in file order.
        corpus_token_size / token_size: total token count (same value under
            both names).
    """

    def __init__(self, corpus_type):
        self.corpus_type = corpus_type
        self.corpus = self._get_corpus()
        self.corpus_token_size = self._get_corpus_token_size()

    def _get_corpus(self):
        """Read the JSONL file, store the records on ``self.corpus`` and return them.

        Raises whatever ``open`` / ``json.loads`` raise for a missing file or
        a malformed line.
        """
        # Explicit UTF-8 so decoding does not depend on the platform's locale
        # default; blank lines are skipped because json.loads rejects them.
        with open(f'../dataset/{self.corpus_type}_corpus.jsonl', encoding='utf-8') as f:
            self.corpus = [json.loads(line) for line in f if line.strip()]
        return self.corpus

    def _get_corpus_token_size(self):
        """Count tokens over all records, store the total on ``self.token_size`` and return it.

        A token is a piece obtained by splitting ``record['text']`` on a single
        space character. Consecutive spaces therefore contribute empty pieces
        that are counted too, and other whitespace (tabs, newlines) does not
        separate tokens. This is kept as-is so previously reported totals stay
        comparable.
        """
        self.token_size = sum(len(entity['text'].split(' ')) for entity in self.corpus)
        return self.token_size

In [19]:
# Load every corpus once, keyed by its corpus type. A plain dict is used:
# a defaultdict without a default factory behaves exactly like one (missing
# keys still raise KeyError), so the plain type states the intent directly.
# The explicit loop keeps already-loaded corpora available in `corpus_dict`
# if a later load fails.
corpus_dict = {}
for corpus_type in CORPUS_TYPE:
    corpus_dict[corpus_type] = Corpus(corpus_type)

In [21]:
# Report, for each loaded corpus, how many records it holds (printed under
# the 'vocab size' label) and its total token count.
for corpus_type in CORPUS_TYPE:
    loaded_corpus = corpus_dict[corpus_type]
    record_count = len(loaded_corpus.corpus)
    token_count = loaded_corpus.token_size

    print(corpus_type)
    print('vocab size:', record_count)
    print('Token size:', token_count)
    print('-' * 30)

wiki
vocab size: 6458670
Token size: 2999251565
------------------------------
music
vocab size: 12465805
Token size: 1087101288
------------------------------


In [29]:
GTZAN_ENTITY_TYPE = ['artist', 'album']

# Build one summary row per GTZAN entity type. For every corpus we read the
# per-entity frequency table and record:
#   <corpus>_num  - number of rows whose 'frequency' is greater than zero
#   <corpus>_freq - sum of the 'frequency' column
# plus a single 'gtzan_vocab' column holding the row count of the table.
# The frequency-per-token ratio is printed by a separate cell.
gtzan_rows = []
for entity in GTZAN_ENTITY_TYPE:
    row = {}
    # Tracked explicitly per entity instead of reading `freq_df` after the
    # inner loop, so a stale frame from a previous entity can never be used.
    vocab_size = None
    for corpus_type in CORPUS_TYPE:
        # entity frequency
        freq_csv_path = f'{corpus_type}_corpus/gtzan_{entity}_{corpus_type}_corpus.csv'
        freq_df = pd.read_csv(freq_csv_path)

        row[f'{corpus_type}_num'] = int((freq_df['frequency'] > 0).sum())
        row[f'{corpus_type}_freq'] = freq_df['frequency'].sum()

        # The value from the last corpus wins, as before.
        # NOTE(review): this assumes every per-corpus CSV of an entity lists
        # the same set of entities (same row count) - confirm.
        vocab_size = len(freq_df['num_of_docs_occured'])

    row['gtzan_vocab'] = vocab_size
    gtzan_rows.append(row)

# Construct the frame once from the collected records rather than enlarging
# it cell by cell with `.loc`, then cast to float for display.
gtzan_sum_df = pd.DataFrame(
    gtzan_rows,
    index=pd.Index(GTZAN_ENTITY_TYPE, name='entity_type'),
).astype(float)
pd.options.display.float_format = '{:,.0f}'.format
gtzan_sum_df

Unnamed: 0_level_0,wiki_num,wiki_freq,music_num,music_freq,gtzan_vocab
entity_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
artist,249,2492119,246,1686833,313
album,271,5689815,287,4996432,489


In [28]:
# For every (entity type, corpus) pair, print the summed entity frequency
# divided by that corpus's total token count.
for entity in GTZAN_ENTITY_TYPE:
    for corpus_type in CORPUS_TYPE:
        total_freq = gtzan_sum_df.loc[entity, f'{corpus_type}_freq']
        token_count = corpus_dict[corpus_type].token_size
        print(f'{entity}\t{corpus_type}/token:\t\t', total_freq / token_count)

artist	wiki/token:		 0.0008309136282804607
artist	music/token:		 0.0015516796996012758
album	wiki/token:		 0.0018970782799274792
album	music/token:		 0.004596105307898412
