# Some imports

In [None]:
import sys
import operator
import shutil
import os

In [None]:
# Location of the local amdtk checkout; it is added to sys.path (only once,
# so re-running this cell does not duplicate the entry) so that the
# `from amdtk import ...` below can be resolved.
amdtk_path = '/scratch/owb/Downloads/amdtk'
if amdtk_path not in sys.path:
    sys.path.append(amdtk_path)

In [None]:
from amdtk import readMlf

# Paths and parameters

In [None]:
# Reference file handed to readMlf below.
mlf_path = '/scratch/owb/Downloads/amdtk/recipes/wsj/data/score.ref'
# Classes file handed to readClasses below (the two adjacent string literals
# are concatenated into a single path).
class_file = ( '/scratch/owb/Downloads/amdtk/recipes/wsj_mllt_fmllr_lda/'
               'ploop_mfcc_c1_T100_sil0_s3_g2_a3_b3/bigram_ws/6_0.5_10/KnownN_1_UnkN_2/TimedSentences_Iter_150.classes' )

# Read MLF file

In [None]:
# The result is indexed by utterance id further down, and each entry of an
# utterance is unpacked there into five fields (unit, start, end, score, aux).
mlf = readMlf(mlf_path)

# Read classes file

In [None]:
def readClasses(class_file):
    """Parse a classes file into a mapping from class label to its entries.

    The file is read line by line and split on whitespace:

    * a line with two fields starts a new class; its label is the two
      fields joined by an underscore,
    * a line with three fields is an entry of the current class: an
      identifier followed by two numbers, which are multiplied by 100 and
      rounded,
    * any other line is ignored.

    Parameters
    ----------
    class_file : str
        Path of the classes file to read.

    Returns
    -------
    dict
        Maps each class label to the list of ``(identifier, start, end)``
        tuples found under it, in file order. A class header that is not
        followed by any entry does not appear in the result.

    Raises
    ------
    ValueError
        If an entry line appears before the first class header, or if a
        number in an entry line cannot be parsed.
    """
    classes = dict()
    current_class = None
    with open(class_file) as fid:
        # Iterate the file object directly so the whole file is never held
        # in memory at once.
        for line_number, line in enumerate(fid, start=1):
            fields = line.split()
            if len(fields) == 2:
                current_class = '_'.join(fields)
            elif len(fields) == 3:
                if current_class is None:
                    # Without this check the lookup below would fail with an
                    # opaque unbound-variable error.
                    raise ValueError(
                        '{}:{}: entry found before any class header'.format(
                            class_file, line_number))
                identifier, start, end = fields
                entry = (identifier,
                         round(float(start) * 100),
                         round(float(end) * 100))
                classes.setdefault(current_class, []).append(entry)
    return classes

In [None]:
# Maps each class label to its list of (identifier, start, end) entries, with
# both numbers multiplied by 100 and rounded by readClasses.
classes = readClasses(class_file)

# Find max overlap

In [None]:
def add_overlapping_sequence(mlf, clusters):
    """Attach to every cluster element the units it covers.

    Parameters
    ----------
    mlf : mapping
        Maps an utterance id to an iterable of 5-tuples
        ``(unit, unit_start, unit_end, unit_score, unit_aux)``.
    clusters : dict
        Maps a cluster label to a list of
        ``(utterance_id, element_start, element_end)`` tuples.

    Returns
    -------
    dict
        Same keys as ``clusters``; every element is extended to
        ``(utterance_id, element_start, element_end, units)`` where
        ``units`` is the tuple of unit names whose midpoint lies strictly
        between ``element_start`` and ``element_end``.
    """
    annotated = {}
    for label, members in clusters.items():
        enriched = []
        for utt, seg_start, seg_end in members:
            covered = []
            for name, begin, finish, _score, _aux in mlf[utt]:
                # A unit belongs to the element when its centre falls
                # strictly inside the element's time span.
                midpoint = begin + (finish - begin)/2
                if seg_start < midpoint < seg_end:
                    covered.append(name)
            enriched.append((utt, seg_start, seg_end, tuple(covered)))
        annotated[label] = enriched
    return annotated

In [None]:
def get_max_overlapping_sequence(clusters):
    """Pick, for each cluster, the unit sequence with the largest total span.

    Every unit that ends in ``'0'``, ``'1'`` or ``'2'`` has that last
    character stripped before sequences are compared. The span
    ``end - start`` of each element is then summed per normalised
    sequence.

    Parameters
    ----------
    clusters : dict
        Maps a cluster label to a list of
        ``(element, start, end, sequence)`` tuples.

    Returns
    -------
    dict
        Maps each cluster label to ``(sequence, total_span)`` for the
        sequence with the greatest accumulated span; on a tie the sequence
        seen first wins.
    """
    trailing_digits = ('0', '1', '2')
    best = {}
    for label, members in clusters.items():
        durations = {}
        for _element, begin, finish, units in members:
            key = tuple(
                u[:-1] if u[-1] in trailing_digits else u
                for u in units
            )
            durations[key] = durations.get(key, 0) + (finish - begin)
        best[label] = max(durations.items(), key=operator.itemgetter(1))
    return best

In [None]:
# Extend every class entry with the tuple of MLF units whose midpoint lies
# inside the entry's start/end span.
classes_overlap = add_overlapping_sequence(mlf, classes)

In [None]:
# Reduce each class to a single (sequence, total_span) pair: the unit sequence
# with the largest accumulated span among the class's entries.
classes_max_overlap = get_max_overlapping_sequence(classes_overlap)

# Rename directories

In [None]:
# Directory that holds one sub-directory per cluster label.
directory = ('/scratch/owb/Downloads/amdtk/recipes/wsj_mllt_fmllr_lda/'
             'ploop_mfcc_c1_T100_sil0_s3_g2_a3_b3/bigram_ws/'
             '6_0.5_10/KnownN_1_UnkN_2/TimedSentences_Iter_150.classes_wavs')
# Rename every cluster directory from '<label>' to '<label>_<units>', where
# <units> is the cluster's selected unit sequence joined with '-'. An empty
# sequence therefore yields a name that ends in '_'.
for cluster_label, (sequence, sequence_length) in classes_max_overlap.items():
    src = os.path.join(directory, cluster_label)
    # The original joined an undefined name here, which raised NameError on
    # the first iteration; the unit sequence is what belongs in the name.
    dst = os.path.join(directory, '{}_{}'.format(cluster_label, '-'.join(sequence)))
    shutil.move(src, dst)

Cleanup via shell:

    # Move every Class_* directory that contains exactly one entry into Single/.
    mkdir -p Single
    for dir in Class_*
    do
      # Expansions are quoted so directory names survive word splitting and
      # globbing; the count is compared numerically so padding in the
      # wc output does not matter.
      if [ "$(ls "$dir" | wc -l)" -eq 1 ]
      then
        mv "$dir" Single
      fi
    done
    
    # Move every Class_* directory whose third '_'-separated name field is
    # empty, SIL, SPN or NSN into SIL/.
    mkdir -p SIL
    for dir in Class_*
    do
      sequence=$(echo "$dir" | cut -d '_' -f 3)
      # A case statement replaces the chain of -o tests, which POSIX marks
      # as obsolescent for the [ utility.
      case "$sequence" in
        ""|SIL|SPN|NSN) mv "$dir" SIL ;;
      esac
    done