# Computing the frequency stats of each instrument



In [1]:
import ast

import pandas as pd

# Load the per-track statistics table.
# The first CSV column has no header and holds 0, 1, 2, ... (it appears as
# "Unnamed: 0" when read as data), which looks like a row index written out
# when the file was saved. Read it back as the index so it does not show up as
# a spurious data column.
path = '../data/music_picks/model_statistics.csv'
stats = pd.read_csv(path, index_col=0)

stats.head(5)

Unnamed: 0,md5,n_instruments,n_unique_instruments,instrument_names,instrument_families,number_of_instrument_families,n_notes,n_unique_notes,average_n_unique_notes_per_instrument,average_note_duration,...,four_to_the_floor,n_time_signature_changes,track_length_in_seconds,lyrics_nb_words,lyrics_unique_words,lyrics_bool,artist,title,genre,source
0,83834e5dbc0490e99c560c28325ececf,9,5,"[['Acoustic Guitar (steel)', 'Pad 1 (new age)'...","[['Guitar', 'Reed', 'Synth Effects', 'Synth Pa...",4,3134,30,5.666667,0.307965,...,True,1.0,215.062748,,,False,Depeche Mode,Dreaming of Me.1,alternative-indie,electronic
1,993add9109fc380a30214a2753a78773,6,4,"[['Acoustic Grand Piano', 'Distortion Guitar',...","[['Synth Lead', 'Piano', 'Guitar', 'Bass']]",4,1319,22,5.333333,0.243039,...,True,1.0,99.4,,,False,Gorillaz,Punk,pop,electronic
2,8172e8b45ddd610f5dadf533ce057582,7,4,"[['Lead 5 (charang)', 'Acoustic Grand Piano', ...","[['Ensemble', 'Bass', 'Piano', 'Synth Lead']]",4,3320,21,5.142857,0.293524,...,True,1.0,256.0,,,False,Depeche Mode,Photographic,alternative-indie,electronic
3,e4ac8a50c8d65b94604e0c81778004a9,14,12,"[['Pad 3 (polysynth)', 'Synth Drum', 'Pan Flut...","[['Pipe', 'Bass', 'Sound Effects', 'Percussive...",9,5035,35,7.785714,0.279565,...,True,1.0,210.055485,,,False,David Guetta,Without You,dance-eletric,electronic
4,7e3bdfb46bdc79afc639e1560506c27f,14,13,"[['Lead 8 (bass + lead)', 'Reverse Cymbal', 'C...","[['Bass', 'Piano', 'Guitar', 'Synth Lead', 'Or...",9,7433,52,13.428571,0.280949,...,,,314.484621,,,False,Moby,Moby Dick.1,rock,electronic


## MIDI statistics

### Instruments

#### Individual instruments

In [2]:
# Each cell of `instrument_names` is the string form of a nested list; parse it
# and keep the inner list. ast.literal_eval only accepts Python literals, so,
# unlike eval(), it cannot execute code embedded in the data file.
inst = stats.instrument_names.map(lambda x: ast.literal_eval(x)[0])
# Flatten the per-track lists into a single list of names and count each name.
inst_list = [item for sublist in inst for item in sublist]
pd.Series(inst_list).value_counts()

Acoustic Grand Piano       4289
Electric Bass (finger)     2191
Distortion Guitar          1838
Overdriven Guitar          1588
Electric Guitar (clean)    1430
                           ... 
Bagpipe                      26
Dulcimer                     26
Tinkle Bell                  20
Shanai                       20
Fiddle                       13
Length: 128, dtype: int64

#### Combinations of instruments

In [3]:
# Parse each track's instrument list (literal_eval instead of eval, so the data
# file cannot run code) and sort it, so the same instruments listed in a
# different order are treated as one combination.
inst = [sorted(ast.literal_eval(x)[0]) for x in stats.instrument_names]
# Count the combinations as tuples: lists are unhashable, and the hash-based
# value_counts is not guaranteed to accept them (TypeError on some pandas
# versions).
pd.Series([tuple(names) for names in inst]).value_counts()

[Acoustic Grand Piano]                                                                                                                    103
[Acoustic Guitar (steel)]                                                                                                                  63
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (finger)]                                                                          60
[Acoustic Grand Piano, Distortion Guitar, Electric Bass (pick)]                                                                            57
[Distortion Guitar]                                                                                                                        55
                                                                                                                                         ... 
[Acoustic Guitar (steel), Distortion Guitar, Electric Bass (pick), Glockenspiel]                                                            1
[Acous

### Families

#### Individual families

In [4]:
# Each cell of `instrument_families` is the string form of a nested list; parse
# it and keep the inner list. ast.literal_eval only accepts Python literals, so,
# unlike eval(), it cannot execute code embedded in the data file.
fam = stats.instrument_families.map(lambda x: ast.literal_eval(x)[0])
# Flatten the per-track lists into a single list of families and count each one.
fam_list = [item for sublist in fam for item in sublist]
pd.Series(fam_list).value_counts()

Guitar                  4772
Piano                   4668
Bass                    4560
Ensemble                2640
Reed                    1395
Synth Lead              1380
Organ                   1158
Brass                   1116
Synth Pad                956
Pipe                     829
Chromatic Percussion     702
Strings                  697
Percussive               691
Sound Effects            540
Synth Effects            532
Ethnic                   284
dtype: int64

#### Combinations of families

In [5]:
# Parse each track's family list (literal_eval instead of eval, so the data
# file cannot run code) and sort it, so the same families listed in a different
# order are treated as one combination.
fam = [sorted(ast.literal_eval(x)[0]) for x in stats.instrument_families]
# Count the combinations as tuples: lists are unhashable, and the hash-based
# value_counts is not guaranteed to accept them (TypeError on some pandas
# versions).
pd.Series([tuple(families) for families in fam]).value_counts()

[Bass, Guitar, Piano]                                                                                              633
[Guitar]                                                                                                           378
[Bass, Guitar]                                                                                                     149
[Bass, Ensemble, Guitar, Piano]                                                                                    143
[Bass, Guitar, Piano, Reed]                                                                                        125
                                                                                                                  ... 
[Bass, Chromatic Percussion, Ensemble, Percussive, Piano, Synth Effects, Synth Lead, Synth Pad]                      1
[Ensemble, Guitar, Percussive, Piano, Pipe, Reed]                                                                    1
[Bass, Brass, Chromatic Percussion, Organ, Piano

## Token statistics

In [6]:
# load tokenized data

# define paths
# NOTE(review): these are absolute paths on one developer's machine; they must
# be adjusted before the notebook can run anywhere else.
from pathlib import Path
input_path = Path('/Users/louis.demetz/Documents/Code/the-jam-machine/data/dsr_mmmtrack_8bars_d-2048/validate')
output_path = input_path

# change sys path so that the `source` package can be imported; the membership
# check keeps sys.path free of duplicate entries when this cell is re-run
import sys
repo_root = '/Users/louis.demetz/Documents/Code/the-jam-machine'
if repo_root not in sys.path:
    sys.path.append(repo_root)

# unzip files (input and output directory are the same)
from source.utils import FileCompressor
fc = FileCompressor(input_path, output_path)
fc.unzip()

# load text files, keeping only those whose stem starts with 'token'
from source.utils import get_files
files = get_files(input_path, extension='txt')
files = [f for f in files if str(f.stem).startswith('token')]

# Read every token file and gather its lines, stripped of trailing whitespace,
# into one flat list of strings
data = []
for token_file in files:
    with open(token_file, 'r') as handle:
        data.extend([line.rstrip() for line in handle])

# Pick the tokens starting with 'INST' out of every sequence: once pooled over
# all sequences, once kept together per sequence
single_instruments = []
grouped_instruments = []
for sequence in data:
    instrument_tokens = [token for token in sequence.split() if token.startswith('INST')]
    single_instruments.extend(instrument_tokens)
    grouped_instruments.append(instrument_tokens)

unzip took 2.35 seconds to run.


### Instruments

#### Individual instruments

In [7]:
# tally how often each instrument token occurs across all sequences
import pandas as pd
instrument_token_series = pd.Series(single_instruments)
instrument_token_series.value_counts()

INST=DRUMS    3504
INST=30       1480
INST=0        1438
INST=27       1364
INST=33       1311
              ... 
INST=50          6
INST=22          6
INST=10          6
INST=125         6
INST=88          5
Length: 76, dtype: int64

#### Combinations of instruments

In [8]:
# Sort each group in place so the same tokens in a different order count as one
# combination. The sort is lexicographic on the token strings (e.g. 'INST=4'
# comes after 'INST=33'), which is enough to make the order canonical.
for group in grouped_instruments:
    group.sort()
# Count the combinations as tuples: lists are unhashable, and the hash-based
# value_counts is not guaranteed to accept them (TypeError on some pandas
# versions).
pd.Series([tuple(group) for group in grouped_instruments]).value_counts()

[INST=24, INST=24, INST=DRUMS]                                                 197
[INST=27, INST=27, INST=DRUMS]                                                 187
[INST=27, INST=27]                                                             145
[INST=33, INST=61, INST=67, INST=DRUMS]                                         93
[INST=0, INST=0, INST=DRUMS]                                                    93
                                                                              ... 
[INST=0, INST=18, INST=29, INST=30, INST=35, INST=73, INST=DRUMS]                1
[INST=1, INST=27, INST=27, INST=33, INST=4, INST=53, INST=73, INST=DRUMS]        1
[INST=25, INST=25, INST=29, INST=30, INST=30, INST=33, INST=54, INST=DRUMS]      1
[INST=29, INST=30, INST=33, INST=54, INST=DRUMS]                                 1
[INST=18, INST=29, INST=49, INST=DRUMS]                                          1
Length: 374, dtype: int64

### Families

In [32]:
# Convert the instrument numbers in the tokenized text into family numbers
import sys
sys.path.append('/Users/louis.demetz/Documents/Code/the-jam-machine/source')
# create the familizer and set its operation to 'family'
from source.familizer import Familizer
new_data = Familizer()
new_data.operation = 'family'

# familize dataset: one converted string per original sequence
familized = [new_data.replace_instrument_in_text(sequence) for sequence in data]

# prepare datasets for statistics: 'INST' tokens pooled over all sequences, and
# the same tokens kept together per sequence
single_families, grouped_families = [], []
for text in familized:
    family_tokens = [token for token in text.split() if token.startswith('INST')]
    single_families.extend(family_tokens)
    grouped_families.append(family_tokens)

#### Individual families

In [34]:
# tally how often each family token occurs across all sequences
family_token_series = pd.Series(single_families)
family_token_series.value_counts()

INST=3        5390
INST=DRUMS    3504
INST=4        2498
INST=0        1845
INST=6        1348
INST=8         880
INST=7         827
INST=9         752
INST=10        718
INST=11        471
INST=5         452
INST=14        297
INST=2         237
INST=12        236
INST=1         182
INST=15        158
dtype: int64

#### Combinations of families

In [35]:
# Sort each group in place so the same family tokens in a different order count
# as one combination. The sort is lexicographic on the token strings (e.g.
# 'INST=15' comes before 'INST=3'), which is enough to make the order canonical.
for group in grouped_families:
    group.sort()
# Count the combinations as tuples: lists are unhashable, and the hash-based
# value_counts is not guaranteed to accept them (TypeError on some pandas
# versions).
pd.Series([tuple(group) for group in grouped_families]).value_counts()

[INST=3, INST=3, INST=DRUMS]                             397
[INST=3, INST=3]                                         145
[INST=0, INST=0, INST=DRUMS]                              94
[INST=4, INST=7, INST=8, INST=DRUMS]                      93
[INST=3, INST=4, INST=9, INST=DRUMS]                      84
                                                        ... 
[INST=0, INST=0, INST=0, INST=0, INST=0]                   1
[INST=0, INST=0, INST=0, INST=0, INST=0, INST=0]           1
[INST=0, INST=3, INST=4, INST=9, INST=DRUMS]               1
[INST=0, INST=2, INST=3, INST=3, INST=4, INST=DRUMS]       1
[INST=1, INST=15, INST=3, INST=6, INST=9, INST=DRUMS]      1
Length: 327, dtype: int64