# Structure segmentation task

In [67]:
from choco_utils.dataloader import ChoCoHarteAnnotationsSectionCorpus
from segmentation.metrics import under_over_segmentation, pairwise_metrics

In [68]:
data = ChoCoHarteAnnotationsSectionCorpus("/home/n28div/university/thesis/choco/")
dataset = list(data)



In [69]:
sections = [ann.section for doc in dataset for ann in doc.annotations]
print(f"Total of {len(sections)} annotations - {len(set(sections))} unique")

Total of 21847 annotations - 168 unique


Unify most sections as possible.

In [70]:
import re
import functools
import string

SYMBOLS_RE = re.compile("[" + re.escape(string.punctuation) + "]")
NUMBERS_RE = re.compile("[" + re.escape(string.digits) + "]")
CONSECUTIVE_SPACES_RE = re.compile(r"\s+")
INTRO_RE = re.compile(r".*(intro|beginning).*")
VERSE_RE = re.compile(r".*(verse).*")
REFRAIN_RE = re.compile(r".*(refrain).*")
BRIDGE_RE = re.compile(r".*(bridge).*")
CHORUS_RE = re.compile(r".*(chorus|c).*")
INSTRUMENTAL_RE = re.compile(r".*(instumental|instrumental|ad\s?lib).*")
SOLO_RE = re.compile(r".*(solo|vocal).*")
OUTRO_RE = re.compile(r".*(outro|ending|fade\s?out|closing|end|coda).*")

def preprocess_section(section):
  # remove symbols, numbers and spaces
  section = SYMBOLS_RE.sub(" ", section)
  section = NUMBERS_RE.sub(" ", section)
  section = CONSECUTIVE_SPACES_RE.sub(" ", section)
  # remove word half, break
  section = section.replace("half", "")
  section = section.replace("break", "")
  # if [*beginning*, *intro*] -> intro
  section = "intro" if INTRO_RE.match(section) else section
  # if *verse* -> verse
  section = "verse" if VERSE_RE.match(section) else section
  # if *refrain* -> refrain
  section = "refrain" if REFRAIN_RE.match(section) else section
  # if *bridge* -> bridge
  section = "bridge" if BRIDGE_RE.match(section) else section
  # if *bridge* -> bridge
  section = "chorus" if CHORUS_RE.match(section) else section
  # if [*instrumental*, *ad lib*] -> instrumental
  section = "instrumental" if INSTRUMENTAL_RE.match(section) else section
  # if [*solo*, *vocal*] -> solo
  section = "solo" if SOLO_RE.match(section) else section
  # if [*outro*, *ending*, *fade out*, *closing*, *end*, *coda*] -> outro
  section = "outro" if OUTRO_RE.match(section) else section
  # everything not in [bridge, chorus, instrumental, interlude, intro, outro, refrain, solo, verse]
  # -> part
  section = section if section in ["bridge", "chorus", "instrumental", "interlude", "intro", "outro", "refrain", "solo", "verse"] else "part"
  # strip whitespaces
  section = section.strip()
  return section

In [73]:
from choco_utils.dataloader import HarteAnnotation, HarteSectionAnnotation
dataset = [[HarteSectionAnnotation(HarteAnnotation(*ann.chord), preprocess_section(ann.section)) for ann in doc.annotations] for doc in dataset]

In [74]:
sections = [ann.section for sample in dataset for ann in sample]
print(f"Total of {len(sections)} annotations - {len(set(sections))} unique")

Total of 21847 annotations - 10 unique


In [75]:
sections_frequency = collections.Counter(sections)
section_p = { s: c / sum(sections_frequency.values()) for s, c in sections_frequency.items() }

for section, freq in sorted(sections_frequency.items(), key=lambda t: t[1], reverse=True):
  print(f"p({section})".ljust(15, " "), f"= {freq / sum(sections_frequency.values()):0.4f} ({str(freq).rjust(5, ' ')} apparences)")

p(verse)        = 0.4529 ( 9894 apparences)
p(refrain)      = 0.1621 ( 3541 apparences)
p(bridge)       = 0.1210 ( 2644 apparences)
p(outro)        = 0.0850 ( 1856 apparences)
p(intro)        = 0.0674 ( 1473 apparences)
p(chorus)       = 0.0570 ( 1246 apparences)
p(instrumental) = 0.0271 (  592 apparences)
p(part)         = 0.0225 (  492 apparences)
p(solo)         = 0.0032 (   71 apparences)
p(interlude)    = 0.0017 (   38 apparences)


### Baseline

In [76]:
from sklearn.preprocessing import LabelEncoder

def contigous_clusters(segmentation):
  _, seg = np.unique(segmentation, return_inverse=True)
  return seg


# assign a number (its index) to each section and repeat that number for the constituents of that
# category
def extract_labels_from_sample(sample):
  annotated_seg = np.array(list(itertools.chain(*[itertools.repeat(name, len(content)) 
                                                  for name, _, _, content in sample["structure"]])))

  return contigous_clusters(annotated_seg)

le = LabelEncoder().fit(sections)

In [77]:
from sklearn.model_selection import train_test_split

# select 20% of data as testing
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
len(train), len(test)

(180, 45)

In [78]:
from segmentation.form import FORM

pairwise = list()
under_over = list()
for sample in test:
  target = contigous_clusters(le.transform([ann.section.lower() for ann in sample if ann is not None]))
  pred = contigous_clusters(FORM([ann.chord.symbol for ann in sample if ann is not None]))
  pairwise.append(pairwise_metrics(target, pred))
  under_over.append(under_over_segmentation(target, pred))
  
precision, recall, f1 = np.array(pairwise).mean(axis=0)
under, over = np.array(under_over).mean(axis=0)

In [79]:
precision, recall, f1, under, over

(0.5977847975050611,
 0.4668899718385443,
 0.5073480807364139,
 0.47410357150806065,
 0.6019579515576962)

In [80]:
def window_sample(sample, context_size = 3):
  if context_size == -1:
    context_size = len(sample) - 1
  
  offset = tuple(range(-1 * context_size, context_size + 1))
  window =  list(mitertools.stagger(sample, offset))
  return window

In [81]:
def context_embedding(model, window, weighted_mean = False, weighted_distance = False, log_weight_distance = False):
  chords = map(lambda w: w.chord.symbol if w is not None else None, window)
  
  # map chords to embedding
  context = np.stack(list(map(lambda c: model[c] if c is not None else np.zeros(model.embedding_dim), chords)))
  
  weights = np.ones(len(window))
  
  if weighted_mean:
    weights *= np.array([w.chord.duration if w is not None else 0 for w in window])
    
  if weighted_distance:
    seq_len = len(window) // 2
    dist_weight = np.abs(np.arange(-1.0 * seq_len, seq_len + 1.0)) + 1
    if log_weight_distance:
      dist_weight = np.log(dist_weight + 1)
    weights *= 1 / dist_weight
  
  context = np.average(context, axis=0, weights=weights)
  
  return context

In [82]:
def embed_sample(model, sample, context_size = 4, weighted_mean = False, weighted_distance = False, log_weight_distance=False):
  windows = window_sample(sample, context_size=4)
  for annotated_chord, window in zip(sample, windows):
    context = context_embedding(model, window, weighted_mean=weighted_mean, weighted_distance=weighted_distance, log_weight_distance=log_weight_distance)
    chord_embedding = model[annotated_chord.chord.symbol]

    X = np.concatenate([chord_embedding, context])
    y = annotated_chord.section.lower()
    yield X, y

In [83]:
def embed_data(model, data, context_size = 4, weighted_mean = False, weighted_distance = False, log_weight_distance=False):
  embed_fn = partial(embed_sample, model, context_size=context_size, weighted_mean=weighted_mean, weighted_distance=weighted_distance, log_weight_distance=log_weight_distance)
  # map to all data sample the embed function, concatenate all those results and split
  # the arrays of 2-tuples into 2-tuple of two arrays
  return zip(*itertools.chain(*map(embed_fn, data)))

In [84]:
def segment_from_classifier(clf, embedding_model, sample, embed_params = {}):
  embedded_sample, _ = zip(*embed_sample(embedding_model, sample, **embed_params))
  embedded_sample = np.stack(embedded_sample)
  pred = clf.predict(embedded_sample)
  return pred

In [85]:
def postprocess_prediction(le, section_p, pred):
  for i, (p, c, n) in enumerate(mitertools.stagger(pred)):  
    if p != None and p != c != n:
      p_p = section_p[le.inverse_transform([p])[0]]
      p_n = section_p[le.inverse_transform([n])[0]]
      pred[i] = p if p_p > p_n else n
  return pred

### Using word2vec

#### As a classification problem: Decision Trees

In [86]:
from sklearn.preprocessing import LabelEncoder
from functools import partial

X, y = embed_data(word2vec_model, train, context_size=5)

X = np.stack(X)
le = LabelEncoder().fit(sections)
y = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)
len(X_train), len(X_test)

AttributeError: 'Word2Vec' object has no attribute 'embedding_dim'

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


dt = DecisionTreeClassifier()
parameters = { }

clf = GridSearchCV(dt, parameters, cv=3, scoring=["f1_micro", "f1_macro"], refit="f1_macro")
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.68      0.70      0.69       200
           1       0.59      0.52      0.55        81
           2       0.40      0.50      0.45        46
           3       0.00      0.00      0.00         4
           4       0.46      0.42      0.44       115
           5       0.53      0.56      0.55       119
           6       0.59      0.55      0.57        47
           7       0.69      0.73      0.71       285
           8       0.25      0.17      0.20         6
           9       0.83      0.82      0.82       775

    accuracy                           0.71      1678
   macro avg       0.50      0.50      0.50      1678
weighted avg       0.71      0.71      0.71      1678



In [28]:
f1 = list()
for sample in test:
  target = le.transform([ann.section.lower() for ann in sample])
  pred = segment_from_classifier(clf, word2vec_model, sample) 
  postprocess_prediction(le, section_p, pred)
  f1.append(pairwise_metrics(target, pred))
  
np.mean(f1)

0.47860203087723957

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier()
parameters = { "max_depth": np.arange(20, 50, 10), "n_estimators": [50, 100, 200] }

clf_rf = GridSearchCV(rf, parameters, cv=3, scoring=["f1_micro", "f1_macro"], refit="f1_macro", verbose=3)
clf_rf.fit(X_train, y_train)

pred = clf_rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.545) f1_micro: (test=0.724) total time=   9.5s
[CV 2/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.534) f1_micro: (test=0.716) total time=   9.0s
[CV 3/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.556) f1_micro: (test=0.723) total time=   8.9s
[CV 1/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.562) f1_micro: (test=0.726) total time=  18.2s
[CV 2/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.547) f1_micro: (test=0.719) total time=  17.9s
[CV 3/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.556) f1_micro: (test=0.726) total time=  18.0s
[CV 1/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.554) f1_micro: (test=0.731) total time=  35.9s
[CV 2/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.541) f1_micro: (test=0.721) total time=  35.3s
[CV 3/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.

In [48]:
pairwise = list()
under_over = list()
for sample in test:
  target = contigous_clusters(le.transform([ann.section.lower() for ann in sample]))
  pred = segment_from_classifier(clf_rf, word2vec_model, sample)
  pred = contigous_clusters(postprocess_prediction(le, section_p, pred))
  pairwise.append(pairwise_metrics(target, pred))
  under_over.append(under_over_segmentation(target, pred))
  
precision, recall, f1 = np.array(pairwise).mean(axis=0)
under, over = np.array(under_over).mean(axis=0)

  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))
  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))


In [50]:
precision, recall, f1, under, over

(0.502198154978753,
 0.78065718640416,
 0.588264810859806,
 0.7035475779025055,
 0.47162620834015473)

### Using fasttext

#### As a classification problem: Decision Trees

In [51]:
from sklearn.preprocessing import LabelEncoder
from functools import partial

X, y = embed_data(fasttext_model, train, context_size=5)

X = np.stack(X)
le = LabelEncoder().fit(sections)
y = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)
len(X_train), len(X_test)

(15093, 1678)

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier()
parameters = { "max_depth": np.arange(20, 50, 10), "n_estimators": [50, 100, 200] }

clf_rf = GridSearchCV(rf, parameters, cv=3, scoring=["f1_micro", "f1_macro"], refit="f1_macro", verbose=3)
clf_rf.fit(X_train, y_train)

pred = clf_rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.522) f1_micro: (test=0.714) total time=  10.7s
[CV 2/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.513) f1_micro: (test=0.714) total time=  10.4s
[CV 3/3] END max_depth=20, n_estimators=50; f1_macro: (test=0.534) f1_micro: (test=0.713) total time=  10.4s
[CV 1/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.541) f1_micro: (test=0.719) total time=  19.9s
[CV 2/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.513) f1_micro: (test=0.706) total time=  26.0s
[CV 3/3] END max_depth=20, n_estimators=100; f1_macro: (test=0.554) f1_micro: (test=0.712) total time=  21.1s
[CV 1/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.527) f1_micro: (test=0.717) total time=  39.1s
[CV 2/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.489) f1_micro: (test=0.707) total time=  39.7s
[CV 3/3] END max_depth=20, n_estimators=200; f1_macro: (test=0.

In [53]:
pairwise = list()
under_over = list()
for sample in test:
  target = contigous_clusters(le.transform([ann.section.lower() for ann in sample]))
  pred = segment_from_classifier(clf_rf, word2vec_model, sample)
  pred = contigous_clusters(postprocess_prediction(le, section_p, pred))
  pairwise.append(pairwise_metrics(target, pred))
  under_over.append(under_over_segmentation(target, pred))
  
precision, recall, f1 = np.array(pairwise).mean(axis=0)
under, over = np.array(under_over).mean(axis=0)

  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))
  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))
  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))
  over_segmentation = max(0, 1 - (h_EA / (np.log2(N_P + 1e-50))))


In [54]:
precision, recall, f1, under, over

(0.48690790879971463,
 0.7605732692154131,
 0.5725045982339887,
 0.480847521621387,
 0.4142533483699257)

### Using harte2vec

#### As a classification problem: Decision Trees

In [55]:
from sklearn.preprocessing import LabelEncoder
from functools import partial

X, y = embed_data(harte2vec, train, context_size=5)

X = np.stack(X)
le = LabelEncoder().fit(sections)
y = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)
len(X_train), len(X_test)

(15093, 1678)

In [86]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(class_weight={le.transform([k])[0]:v for k, v in sections_frequency.items()})
parameters = { "n_estimators": [100, 200] }

clf_rf = GridSearchCV(rf, parameters, cv=3, scoring=["f1_micro", "f1_macro"], refit="f1_macro", verbose=3)
clf_rf.fit(X_train, y_train)

pred = clf_rf.predict(X_test)
print(classification_report(y_test, pred, zero_division=0))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END n_estimators=100; f1_macro: (test=0.543) f1_micro: (test=0.734) total time=  37.6s
[CV 2/3] END n_estimators=100; f1_macro: (test=0.523) f1_micro: (test=0.720) total time=  38.8s
[CV 3/3] END n_estimators=100; f1_macro: (test=0.546) f1_micro: (test=0.730) total time=  38.3s
[CV 1/3] END n_estimators=200; f1_macro: (test=0.533) f1_micro: (test=0.732) total time= 1.3min
[CV 2/3] END n_estimators=200; f1_macro: (test=0.511) f1_micro: (test=0.719) total time= 1.3min
[CV 3/3] END n_estimators=200; f1_macro: (test=0.562) f1_micro: (test=0.738) total time= 1.3min
              precision    recall  f1-score   support

           0       0.83      0.72      0.77       200
           1       0.71      0.56      0.63        81
           2       0.73      0.35      0.47        46
           3       0.50      0.25      0.33         4
           4       0.60      0.43      0.50       115
           5       0.82      0.57      0

In [87]:
pairwise = list()
under_over = list()
for sample in test:
  target = contigous_clusters(le.transform([ann.section.lower() for ann in sample]))
  pred = segment_from_classifier(clf_rf, harte2vec, sample)
  pred = contigous_clusters(postprocess_prediction(le, section_p, pred))
  pairwise.append(pairwise_metrics(target, pred))
  under_over.append(under_over_segmentation(target, pred))
  
precision, recall, f1 = np.array(pairwise).mean(axis=0)
under, over = np.array(under_over).mean(axis=0)

In [88]:
precision, recall, f1, under, over

(0.5189973525998562,
 0.7165302706138055,
 0.5805084127648278,
 0.6539111094267887,
 0.49847541141217294)