# Computing the frequency stats of each instrument



In [41]:
# Dependencies for reading the statistics table and describing data locations
from pathlib import Path

import pandas as pd

# Statistics file computed by the midi_stats function
path = '../data/music_picks/model_statistics.csv'
stats = pd.read_csv(path)

# Paths to the zip files containing tokenized data; the output path is
# deliberately set to the same directory as the input path.
# NOTE(review): this is an absolute, machine-specific location — adjust it
# before running the notebook on another machine.
zip_input_path = Path('/Users/louis.demetz/Documents/Code/the-jam-machine/data/dsr_mmmtrack_8bars_d-2048/train')
zip_output_path = zip_input_path

# Preview the first five rows of the statistics table
stats.head()

Unnamed: 0,md5,n_instruments,n_unique_instruments,instrument_names,instrument_families,number_of_instrument_families,n_notes,n_unique_notes,average_n_unique_notes_per_instrument,average_note_duration,...,four_to_the_floor,n_time_signature_changes,track_length_in_seconds,lyrics_nb_words,lyrics_unique_words,lyrics_bool,artist,title,genre,source
0,83834e5dbc0490e99c560c28325ececf,9,5,"[['Acoustic Guitar (steel)', 'Pad 1 (new age)'...","[['Guitar', 'Reed', 'Synth Effects', 'Synth Pa...",4,3134,30,5.666667,0.307965,...,True,1.0,215.062748,,,False,Depeche Mode,Dreaming of Me.1,alternative-indie,electronic
1,993add9109fc380a30214a2753a78773,6,4,"[['Acoustic Grand Piano', 'Distortion Guitar',...","[['Synth Lead', 'Piano', 'Guitar', 'Bass']]",4,1319,22,5.333333,0.243039,...,True,1.0,99.4,,,False,Gorillaz,Punk,pop,electronic
2,8172e8b45ddd610f5dadf533ce057582,7,4,"[['Lead 5 (charang)', 'Acoustic Grand Piano', ...","[['Ensemble', 'Bass', 'Piano', 'Synth Lead']]",4,3320,21,5.142857,0.293524,...,True,1.0,256.0,,,False,Depeche Mode,Photographic,alternative-indie,electronic
3,e4ac8a50c8d65b94604e0c81778004a9,14,12,"[['Pad 3 (polysynth)', 'Synth Drum', 'Pan Flut...","[['Pipe', 'Bass', 'Sound Effects', 'Percussive...",9,5035,35,7.785714,0.279565,...,True,1.0,210.055485,,,False,David Guetta,Without You,dance-eletric,electronic
4,7e3bdfb46bdc79afc639e1560506c27f,14,13,"[['Lead 8 (bass + lead)', 'Reverse Cymbal', 'C...","[['Bass', 'Piano', 'Guitar', 'Synth Lead', 'Or...",9,7433,52,13.428571,0.280949,...,,,314.484621,,,False,Moby,Moby Dick.1,rock,electronic


## MIDI statistics

### Instruments

#### Individual instruments

In [42]:
import ast

# Each cell of `instrument_names` is the text of a nested list. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute any expression stored in the CSV. Keep the first inner list.
inst = stats.instrument_names.map(lambda x: ast.literal_eval(x)[0])
# Flatten the per-track lists into a single list of instrument names
inst_list = [item for sublist in inst for item in sublist]
# Number of occurrences of each instrument name
pd.Series(inst_list).value_counts()

Acoustic Grand Piano       4289
Electric Bass (finger)     2191
Distortion Guitar          1838
Overdriven Guitar          1588
Electric Guitar (clean)    1430
                           ... 
Bagpipe                      26
Dulcimer                     26
Tinkle Bell                  20
Shanai                       20
Fiddle                       13
Length: 128, dtype: int64

#### Combinations of instruments

In [43]:
import ast

# Parse each stringified nested list with ast.literal_eval (literals only; eval
# would execute any expression stored in the CSV), keep the first inner list and
# sort it so the same set of instruments always appears in the same order.
inst = [sorted(ast.literal_eval(x)[0]) for x in stats.instrument_names]
# NOTE(review): the counted values are lists, which are unhashable; this relies
# on pandas tolerating such values in value_counts — confirm on the pandas
# version in use (converting each combination to a tuple would avoid the issue).
# Show the 20 most frequent instrument combinations
pd.Series(inst).value_counts().head(20)

[Acoustic Grand Piano]                                                                        103
[Acoustic Guitar (steel)]                                                                      63
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (finger)]                              60
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (pick)]                                57
[Distortion Guitar]                                                                            55
[Acoustic Grand Piano, Electric Bass (finger), Overdriven Guitar]                              54
[Acoustic Guitar (nylon)]                                                                      53
[Electric Guitar (clean)]                                                                      42
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (pick), Overdriven Guitar]             31
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (finger), Overdriven Guitar]           31
[Overdriven Guitar] 

### Families

#### Individual families

In [44]:
import ast

# Each cell of `instrument_families` is the text of a nested list. Parse it with
# ast.literal_eval, which only accepts Python literals, instead of eval, which
# would execute any expression stored in the CSV. Keep the first inner list.
fam = stats.instrument_families.map(lambda x: ast.literal_eval(x)[0])
# Flatten the per-track lists into a single list of family names
fam_list = [item for sublist in fam for item in sublist]
# Number of occurrences of each instrument family
pd.Series(fam_list).value_counts()

Guitar                  4772
Piano                   4668
Bass                    4560
Ensemble                2640
Reed                    1395
Synth Lead              1380
Organ                   1158
Brass                   1116
Synth Pad                956
Pipe                     829
Chromatic Percussion     702
Strings                  697
Percussive               691
Sound Effects            540
Synth Effects            532
Ethnic                   284
dtype: int64

#### Combinations of families

In [45]:
import ast

# Parse each stringified nested list with ast.literal_eval (literals only; eval
# would execute any expression stored in the CSV), keep the first inner list and
# sort it so the same set of families always appears in the same order.
fam = [sorted(ast.literal_eval(x)[0]) for x in stats.instrument_families]
# NOTE(review): the counted values are lists, which are unhashable; this relies
# on pandas tolerating such values in value_counts — confirm on the pandas
# version in use (converting each combination to a tuple would avoid the issue).
# Show the 20 most frequent family combinations
pd.Series(fam).value_counts().head(20)

[Bass, Guitar, Piano]                           633
[Guitar]                                        378
[Bass, Guitar]                                  149
[Bass, Ensemble, Guitar, Piano]                 143
[Bass, Guitar, Piano, Reed]                     125
[Guitar, Piano]                                 124
[Piano]                                         115
[Bass, Ensemble, Guitar, Piano, Reed]           111
[Bass, Guitar, Organ, Piano]                     85
[Bass, Guitar, Piano, Synth Lead]                77
[Bass, Ensemble, Guitar, Piano, Synth Lead]      61
[Bass, Ensemble, Guitar, Organ, Piano]           52
[Bass, Guitar, Piano, Pipe]                      50
[Bass, Brass, Guitar, Piano]                     47
[Guitar, Organ]                                  42
[Bass, Brass, Ensemble, Guitar, Piano]           41
[Bass, Ensemble, Guitar, Piano, Pipe]            41
[Bass, Brass, Ensemble, Guitar, Piano, Reed]     39
[Bass, Guitar, Organ, Piano, Reed]               39
[Bass, Guita

## Token statistics

In [46]:
# load tokenized data

# Add the repository directory to sys.path so that `source.*` imports resolve
import sys
sys.path.append('/Users/louis.demetz/Documents/Code/the-jam-machine')

from source.utils import FileCompressor, get_files

# unzip files
fc = FileCompressor(zip_input_path, zip_output_path)
fc.unzip()

# Keep only the .txt files whose name starts with 'token'
files = [
    candidate
    for candidate in get_files(zip_input_path, extension='txt')
    if str(candidate.stem).startswith('token')
]
files

# Load all tokenized text files into a single list of strings
# (one entry per line, with trailing whitespace stripped)
data = []
for token_file in files:
    # Path and handle get separate names so the loop variable is never
    # rebound to a file object.
    with open(token_file, 'r') as handle:
        data.extend(line.rstrip() for line in handle)

# Extract individual instruments and grouped instruments
# Grouped: for every sequence, the list of its whitespace-separated tokens
# that start with 'INST'
grouped_instruments = [
    [token for token in line.split() if token.startswith('INST')]
    for line in data
]
# Individual: the same tokens, flattened into one list in sequence order
single_instruments = [token for group in grouped_instruments for token in group]

unzip took 7.58 seconds to run.


### Instruments

#### Individual instruments

In [47]:
# Frequency of each individual INST token across all sequences
import pandas as pd

instrument_token_counts = pd.Series(single_instruments).value_counts()
# Display the first 20 entries of the count table
instrument_token_counts.iloc[:20]

INST=DRUMS    397289
INST=30       169668
INST=33       136460
INST=29       130221
INST=0         92314
INST=27        90248
INST=25        74041
INST=34        56479
INST=48        48395
INST=26        47042
INST=24        39913
INST=35        39003
INST=38        34643
INST=28        33594
INST=52        27536
INST=50        27424
INST=81        27421
INST=1         24202
INST=39        22320
INST=32        22252
dtype: int64

#### Combinations of instruments

In [48]:
# Sort every group in place so that identical combinations compare equal
# regardless of the order in which their tokens appeared
for group in grouped_instruments:
    group.sort()

# Count each distinct combination and display the first 20 entries
instrument_combination_counts = pd.Series(grouped_instruments).value_counts()
instrument_combination_counts.head(20)

[INST=30, INST=34, INST=DRUMS]             2893
[INST=30, INST=33, INST=DRUMS]             2260
[INST=30, INST=30]                         2234
[INST=30, INST=30, INST=33, INST=DRUMS]    2203
[INST=29, INST=33, INST=DRUMS]             2157
[INST=25]                                  2051
[INST=0]                                   2042
[INST=24, INST=24]                         1747
[INST=30, INST=30, INST=34, INST=DRUMS]    1744
[INST=29, INST=29]                         1564
[INST=27, INST=27]                         1496
[INST=29, INST=30, INST=34, INST=DRUMS]    1459
[INST=29, INST=29, INST=33, INST=DRUMS]    1436
[INST=24]                                  1399
[INST=0, INST=0]                           1259
[INST=27, INST=29, INST=33, INST=DRUMS]    1213
[INST=29, INST=30, INST=33, INST=DRUMS]    1210
[INST=27]                                  1134
[INST=30, INST=DRUMS]                      1054
[INST=DRUMS]                               1050
dtype: int64

### Families

In [49]:
# Turn the instrument numbers in the tokenized text into family numbers
import sys
sys.path.append('/Users/louis.demetz/Documents/Code/the-jam-machine/source')

from source.familizer import Familizer

# Initialize the familizer and configure it for the 'family' operation
new_data = Familizer()
new_data.operation = 'family'

# Familize the dataset: run every sequence through the familizer
familized = [new_data.replace_instrument_in_text(sequence) for sequence in data]

# Prepare datasets for statistics: collect the INST tokens of each familized
# sequence, both flattened and grouped per sequence
single_families, grouped_families = [], []
for sequence in familized:
    split = [token for token in sequence.split() if token.startswith('INST')]
    single_families.extend(split)
    grouped_families.append(split)

#### Individual families

In [50]:
# Frequency of each individual family token across all familized sequences
family_token_counts = pd.Series(single_families).value_counts()
family_token_counts

INST=3        589054
INST=DRUMS    397289
INST=4        328748
INST=0        185044
INST=6        177908
INST=10        92432
INST=8         63744
INST=7         60848
INST=11        52854
INST=2         47314
INST=9         34104
INST=5         31049
INST=12        29414
INST=1         27205
INST=14        23596
INST=13        15709
INST=15        14897
dtype: int64

#### Combinations of families

In [51]:
# Sort every group in place so that identical combinations compare equal
# regardless of the order in which their tokens appeared
for group in grouped_families:
    group.sort()

# Count each distinct combination and display the first 20 entries
family_combination_counts = pd.Series(grouped_families).value_counts()
family_combination_counts.head(20)

[INST=3, INST=3, INST=4, INST=DRUMS]                            15219
[INST=3, INST=4, INST=DRUMS]                                    11549
[INST=3, INST=3]                                                 9698
[INST=3]                                                         6611
[INST=3, INST=3, INST=3, INST=4, INST=DRUMS]                     6598
[INST=3, INST=3, INST=4, INST=4, INST=DRUMS]                     4595
[INST=3, INST=3, INST=3, INST=3]                                 3988
[INST=3, INST=3, INST=3]                                         3864
[INST=3, INST=3, INST=DRUMS]                                     3423
[INST=0]                                                         2835
[INST=3, INST=3, INST=4]                                         2694
[INST=3, INST=4]                                                 2681
[INST=3, INST=3, INST=3, INST=3, INST=4, INST=DRUMS]             2659
[INST=3, INST=DRUMS]                                             2524
[INST=3, INST=4, INS