In [21]:
import pickle
import re
import pandas as pd

# Load the two pickled note files and join them into one collection, then
# load the pickled links. Files are opened with `with` so the handles are
# closed deterministically instead of being left to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# from a trusted source.
with open("pickle/notes1-100000.p", "rb") as fh:
    notes1 = pickle.load(fh)
with open("pickle/notes100000-200000.p", "rb") as fh:
    notes2 = pickle.load(fh)
notes = notes1 + notes2

with open("pickle/links.p", "rb") as fh:
    links = pickle.load(fh)

print(len(notes))
print(len(links))

137917
285557


In [36]:
# A barline token: optional ':' + one or more '|' + optional ':' + optional ']'.
# The capturing group makes re.split() return the barlines between the texts.
_BARLINE_RE = re.compile(r"(:?\|+:?\]?)")


def _first_header_value(lines, prefix):
    """Return the stripped text after `prefix` on the first line starting with it, else None."""
    for line in lines:
        if line.startswith(prefix):
            # Slice off the prefix instead of split(":")[1] so that values
            # which themselves contain ':' are not truncated.
            return line[len(prefix):].strip()
    return None


def process_abc_note(note):
    """Extract header fields and bars from one ABC-notation tune string.

    Parameters
    ----------
    note : str
        Full text of a single ABC tune.

    Returns
    -------
    dict
        Keys:
        - "title", "time", "length", "tune_type", "key": text after the first
          line starting with "T:", "M:", "L:", "R:", "K:" respectively (taken
          verbatim and stripped; trailing "%" comments are not removed), or
          None when no such line exists.
        - "tune": concatenation of every line at or after the first "K:" line
          (or from the start when there is none) that contains "|", or None.
        - "bars": list of bar strings, each being the text before a barline
          plus that barline, or None when "tune" is None.
        - "opening", "middle", "ending": bars[:4], bars[4:-4], bars[-4:], or
          None when "bars" is None.
        - "full_text": the unmodified input.
        - "complete": True when none of the values above is None.
    """
    lines = note.strip().splitlines()

    title = _first_header_value(lines, "T:")
    time = _first_header_value(lines, "M:")
    length = _first_header_value(lines, "L:")
    tune_type = _first_header_value(lines, "R:")

    # The key line also marks where the tune body is searched from.
    key_index = next(
        (i for i, line in enumerate(lines) if line.startswith("K:")), None
    )
    key = lines[key_index][len("K:"):].strip() if key_index is not None else None
    body_start = key_index if key_index is not None else 0

    tune_lines = [line for line in lines[body_start:] if "|" in line]
    tune = "".join(tune_lines) if tune_lines else None

    bars = None
    if tune is not None:
        # pieces alternates text, barline, text, barline, ..., trailing text.
        pieces = _BARLINE_RE.split(tune)
        # Pair each text with the barline that FOLLOWS it. Pairing before
        # filtering keeps the alignment correct when the tune starts with a
        # barline or has two adjacent barlines; such content-less barlines are
        # skipped. Trailing text with no closing barline is dropped.
        bars = [
            text + barline
            for text, barline in zip(pieces[0::2], pieces[1::2])
            if text != ""
        ]

    features = {
        "title": title,
        "time": time,
        "length": length,
        "tune_type": tune_type,
        "key": key,
        "tune": tune,
        "bars": bars,
        "opening": None,
        "middle": None,
        "ending": None,
        "full_text": note
    }

    if bars is not None:
        features["opening"] = bars[:4]
        features["middle"] = bars[4:-4]
        features["ending"] = bars[-4:]

    features["complete"] = None not in features.values()
    return features

In [37]:
# Extract features from every note and persist the result.
features = [process_abc_note(note) for note in notes]
# Use a context manager so the file is flushed and closed once the dump finishes.
with open("pickle/features.p", "wb") as fh:
    pickle.dump(features, fh)
len(features)

137917

In [38]:
# Keep only the records whose "complete" flag is truthy and persist them.
complete_features = [x for x in features if x['complete']]

# Use a context manager so the file is flushed and closed once the dump finishes.
with open("pickle/complete_features.p", "wb") as fh:
    pickle.dump(complete_features, fh)
len(complete_features)

71468

In [39]:
# One row per complete feature record; the dict keys become the columns.
df = pd.DataFrame(data=complete_features)

In [43]:
# Opening bars of the first record (row label 0).
df.loc[0, 'opening']

['[P:A] G |', '"C"czc |', ' czd |', ' eze |']

In [45]:
# Ending bars of the first record (row label 0).
df.loc[0, 'ending']

['1 b3 |', ' g3 :|', '2 fdb |', ' "C"c\'3 |]']

In [50]:
# Show the first row; `iloc` is an indexer and is subscripted directly, not called.
df.iloc[0]

bars         [[P:A] G |, "C"czc |,  czd |,  eze |,  ezg |, ...
complete                                                  True
ending                   [1 b3 |,  g3 :|, 2 fdb |,  "C"c'3 |]]
full_text    \nX:1\nT:01 Und wenn's im Tal scho nachte tuet...
key                                          C %%MIDI gchordon
length                                                     1/4
middle       [ ezg |,  gzc |,  azg |,  ("G7"d3 |,  d)zg |, ...
opening                  [[P:A] G |, "C"czc |,  czd |,  eze |]
time                                                       3/4
title        01 Und wenn's im Tal scho nachte tuet. Walzer ...
tune         [P:A] G |"C"czc | czd | eze | ezg | gzc | azg ...
tune_type                                               Walzer
Name: 0, dtype: object

In [52]:
# print() so the newlines of the raw ABC text of the first record are rendered.
print(df.loc[0, 'full_text'])


X:1
T:01 Und wenn's im Tal scho nachte tuet. Walzer  C  (wz07869)
S:Hanny Christen. Bd.V  Basel II, Jura. S.208  (wz07869) orig. C
T: Basel II. "Mundwilermusig", Tenniken. S.183-219.
S:Überliefert durch Hermann und Johannes Mundwiler und Ernst Otter.
%P:ABC
R:Walzer
M:3/4
L:1/4
K:C %%MIDI gchordon
[P:A] G |"C"czc | czd | eze | ezg | gzc | azg | ("G7"d3 | d)zg | bza | gzf | 
"C"egc' | gze | "G7"dza | gzf | ("C"e3 | e)zz [P:B] |: "C"G3 | EGc | e3 | c3 | "G7"dd/d/d | 
e2d |1 "C"cBA | G3 :|2 "C"c>cc | czG [P:C] |: "C"G2E | G2e | (c3 | c)zE | G2G | A2G | 
("G7"F3 | F)zd | d2d | d2f | (d3 | d)zc | BzA |1 BzA | ("C"G3 | G)zG :|2 GzB | 
"C"czz |: [P:D] "C"gag | ece | "G7"gdd/d/ | dBd | g2a | fed |1 b3 | g3 :|2 fdb | "C"c'3 |] 
W:
%W:Parts: ABCD
%W:Anmerkung H.C.:
%W:Anmerkung Hrsg,:
% wz07869
% Aug 18, 2018

