In [2]:
import pandas as pd
import json
from collections import defaultdict

In [3]:
MUSIC_BRAINZ_ENTITY_TYPE = ['artist', 'release', 'genre', 'instrument'] # release is for album, release_group is not included
CORPUS_TYPE = ['wiki', 'music']

In [4]:
sum_df = pd.DataFrame(index = MUSIC_BRAINZ_ENTITY_TYPE)
sum_df.index.name = 'entity_type'

for entity in MUSIC_BRAINZ_ENTITY_TYPE:
    for corpus_type in ['wiki', 'music']:
        # entity frequency
        freq_csv_path = f'{corpus_type}_corpus/musicbrainz_{entity}_{corpus_type}_corpus.csv'
        freq_df = pd.read_csv(freq_csv_path)

        sum_df.loc[entity, f'{corpus_type}_num'] = len(freq_df[freq_df['frequency'] > 0])
        sum_df.loc[entity, f'{corpus_type}_freq'] = freq_df['frequency'].sum()

    sum_df.loc[entity, 'musicbrainz_vocab'] = len(freq_df['num_of_docs_occured'])

In [5]:
sum_df = sum_df.astype(float)
pd.options.display.float_format = '{:,.0f}'.format
sum_df

Unnamed: 0_level_0,wiki_num,wiki_freq,music_num,music_freq,musicbrainz_vocab
entity_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
artist,937167,17241299062,781830,5425560169,2008511
release,745390,23814193888,680993,7495287114,2348437
genre,1556,42676598,1581,21110527,1808
instrument,942,50004721,931,17595725,1019


In [6]:
# sum_df.astype(int).to_csv('entity_sum.csv')

In [7]:
def get_corpus(corpus_type):
    with open(f'../dataset/{corpus_type}_corpus.jsonl') as f:
        corpus = [json.loads(line) for line in f]
    return corpus

def get_corpus_token_size(corpus):
    token_size = sum(len(entity['text'].split(' ')) for entity in corpus)
    return token_size

In [8]:
wiki_corpus = get_corpus('wiki')
music_corpus = get_corpus('music')

In [10]:
wiki_token_size = get_corpus_token_size(wiki_corpus)
music_token_size = get_corpus_token_size(music_corpus)

In [12]:
print('Vocab size')
print(f'wiki: {len(wiki_corpus):,}, music: {len(music_corpus):,}')
print('-' * 30)

print('Token size')
print(f'wiki: {wiki_token_size:,}, music: {music_token_size:,}')

Vocab size
wiki: 6,458,670, music: 12,465,805
------------------------------
Token size
wiki: 2,999,251,565, music: 1,087,101,288
