In [1]:
# Set of vocal-sound label names.
vocalsound_label = {
    'cough',
    'laugh',
    'sigh',
    'sneeze',
    'sniff',
    'throatclearing',
}

In [2]:
from glob import glob
from bs4 import BeautifulSoup
import soundfile as sf
from collections import defaultdict
import os
from multiprocess import Pool
import itertools
from tqdm import tqdm

def chunks(l, n):
    """Yield ``(batch, batch_index)`` pairs, each batch being a slice of ``l`` with at most ``n`` items."""
    batch_index = 0
    for offset in range(0, len(l), n):
        yield l[offset: offset + n], batch_index
        batch_index += 1

def multiprocessing(strings, function, cores=6, returned=True):
    """Run ``function`` over batches of ``strings`` in a process pool.

    ``strings`` is split with ``chunks`` into batches of roughly
    ``len(strings) // cores`` items; ``function`` is called once per batch with
    the ``(batch, batch_index)`` tuple that ``chunks`` yields.

    Args:
        strings: Sequence of work items (must support ``len`` and slicing).
        function: Callable taking one ``(batch, batch_index)`` tuple. When
            ``returned`` is true it must return an iterable.
        cores: Number of worker processes.
        returned: When true, return the per-batch results flattened into one
            list; otherwise return ``None``.

    Returns:
        A flat list of results when ``returned`` is true, else ``None``.
    """
    # Nothing to do: skip pool creation entirely for empty input.
    if not len(strings):
        return [] if returned else None

    # ``len(strings) // cores`` is 0 when there are fewer items than cores,
    # which would make ``range`` inside ``chunks`` fail with a zero step.
    batch_size = max(1, len(strings) // cores)
    df_split = chunks(strings, batch_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a batch raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

# Candidate timestamp values: multiples of 0.02 from 0 up to 1500 * 0.02 (1501 entries).
timestamps = list(map(lambda step: step * 0.02, range(1500 + 1)))

In [3]:
# Recreate the chunk output directory from scratch, discarding anything left from earlier runs.
!rm -rf ami-array-chunk
!mkdir ami-array-chunk

In [4]:
# Map each group key (the XML file name up to its first '.') to the word-level XML files sharing it.
groups = defaultdict(list)
for xml_path in glob('words/*.xml'):
    group_key = os.path.basename(xml_path).split('.')[0]
    groups[group_key].append(xml_path)

In [17]:
import random

def _read_timed_elements(xml_paths):
    """Return every timed ``<w>``/``<vocalsound>`` element from ``xml_paths``, sorted by start time.

    Each item is a dict holding the parsed ``element`` plus its ``starttime``
    and ``endtime`` attributes converted to floats as ``start`` and ``end``.
    """
    timed = []
    for xml_path in xml_paths:
        with open(xml_path) as fopen:
            soup = BeautifulSoup(fopen.read(), "xml")
        for element in soup.find_all(['vocalsound', 'w']):
            try:
                start = float(element.attrs['starttime'])
                end = float(element.attrs['endtime'])
            except (KeyError, TypeError, ValueError):
                # No usable timing on this element, so it cannot be aligned to the audio.
                continue
            timed.append({'element': element, 'start': start, 'end': end})
    timed.sort(key=lambda item: item['start'])
    return timed


def _split_into_windows(timed, max_duration=30):
    """Greedily group start-sorted items into windows spanning at most ``max_duration`` seconds."""
    windows, current = [], []
    window_start = None
    for item in timed:
        # Close the window once this element would end past the duration limit.
        if current and item['end'] - window_start > max_duration:
            windows.append(current)
            current = []
        if not current:
            # Track "window is empty" explicitly: a start time of 0.0 is a
            # legitimate value and must not be mistaken for "no window yet".
            window_start = item['start']
        current.append(item)
    if current:
        windows.append(current)
    return windows


def _build_segments(window, offset):
    """Convert one window of timed elements into text segments with times relative to ``offset``."""
    segments = []
    for item in window:
        element = item['element']
        start_time = item['start'] - offset
        end_time = item['end'] - offset
        if element.name == 'vocalsound':
            sound_type = element.attrs.get('type')
            # Skip vocal sounds without a type or with an "other" type.
            if sound_type is None or 'other' in sound_type:
                continue
            segments.append({
                'text': f'<|{sound_type}|>',
                'start': start_time,
                'end': end_time,
            })
        elif element.name == 'w':
            text = element.get_text()
            if 'punc' in element.attrs:
                # Punctuation is glued onto the preceding segment; with no
                # preceding segment in this window it is dropped.
                if segments:
                    segments[-1]['text'] += text
                    segments[-1]['end'] = end_time
            else:
                segments.append({
                    'text': text,
                    'start': start_time,
                    'end': end_time,
                })
    return segments


def loop(groups_name):
    """Cut the audio of each group into windows of at most 30 s with aligned text segments.

    Args:
        groups_name: ``(names, batch_index)`` tuple as produced by ``chunks``;
            only ``names`` is used. Each name is both a key of the module-level
            ``groups`` mapping and a directory containing ``audio/*Array*`` files.

    Returns:
        A list of dicts with ``audio_filename`` (path of the written clip) and
        ``segments`` (list of ``{'text', 'start', 'end'}`` dicts with times in
        seconds relative to the start of the clip).

    Side effects:
        Writes one audio file per window into ``ami-array-chunk``.
    """
    groups_name, _ = groups_name

    data = []
    for g in tqdm(groups_name):
        audio_files = glob(os.path.join(g, 'audio/*Array*'))
        if not audio_files:
            continue

        # One randomly chosen array recording per group (unseeded, so the
        # choice differs between runs).
        ys = [sf.read(audio_path) for audio_path in random.sample(audio_files, 1)]

        timed = _read_timed_elements(groups[g])

        for no, window in enumerate(_split_into_windows(timed)):
            start = window[0]['start']
            # Cut at the latest end time in the window so every transcribed
            # element is fully contained in the clip (cutting at the last
            # element's start time would drop its audio).
            end = max(item['end'] for item in window)
            segments = _build_segments(window, start)

            for n, (y_, sr_) in enumerate(ys):
                clip = y_[int(start * sr_): int(end * sr_)]
                new_f = os.path.join('ami-array-chunk', f'{g}-{no}-{n}.mp3')
                # Write with the rate the samples were read at, matching the
                # rate used for slicing above.
                sf.write(new_f, clip, sr_)
                data.append({
                    'audio_filename': new_f,
                    'segments': segments,
                })

    return data

In [13]:
# Group keys in the insertion order of the `groups` mapping.
groups_name = list(groups)

In [14]:
# Smoke test: process only the first group in-process; `data` is reassigned by the full run below.
data = loop((groups_name[:1], 0))

  0%|                                                                                                                     | 0/1 [00:07<?, ?it/s]


In [18]:
# Full run: process every group in parallel and collect the flattened chunk records.
data = multiprocessing(groups_name, loop, cores = 10)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:08<00:00,  7.54s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:08<00:00,  7.55s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:11<00:00,  7.75s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.96s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:21<00:00,  8.33s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:22<00:00,  8.37s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [02:23<00:

In [19]:
# Number of chunk records collected.
len(data)

11555

In [25]:
import json

# Persist the chunk records (audio path + segments) as JSON.
with open('ami.json', 'w') as json_file:
    json_file.write(json.dumps(data))

In [45]:
from unidecode import unidecode

In [46]:
# Turn each chunk's segments into a single timestamped transcript string.
formatted = []
for record in data:
    words = record['segments']
    if len(words) == 0:
        continue

    # Group consecutive segments into utterances; a gap of more than 0.25 s
    # between one segment's end and the next one's start opens a new utterance.
    utterances = [[words[0]]]
    previous_end = words[0]['end']
    for word in words[1:]:
        if word['start'] - previous_end > 0.25:
            utterances.append([])
        utterances[-1].append(word)
        previous_end = word['end']

    pieces = []
    for utterance in utterances:
        first_start = utterance[0]['start']
        last_end = utterance[-1]['end']
        # Snap both boundaries to the nearest value in `timestamps`.
        start = min(timestamps, key=lambda t: abs(t - first_start))
        end = min(timestamps, key=lambda t: abs(t - last_end))
        joined = unidecode(' '.join(word['text'] for word in utterance))
        pieces.append(f"<|{start:.2f}|> {joined}<|{end:.2f}|>")

    body = ''.join(pieces)
    new_text = text = f"<|startoftranscript|><|en|><|transcribenonverbal|>{body}<|endoftext|>"
    formatted.append({
        'audio_filename': record['audio_filename'],
        'new_text': new_text,
        'source': 'AMI'
    })

In [49]:
import pandas as pd

# One row per chunk: audio_filename, new_text, source.
formatted_df = pd.DataFrame(formatted)
formatted_df.to_parquet('ami.parquet')

In [1]:
from huggingface_hub import HfApi
# Hugging Face Hub client used by the upload cells below.
api = HfApi()

In [50]:
# Upload the transcript parquet file to the dataset repository.
api.upload_file(
    path_or_fileobj="ami.parquet",
    path_in_repo="data/ami-00000-of-00001.parquet",
    repo_id="mesolitica/Speech-Nonverbal-Whisper",
    repo_type="dataset",
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Speech-Nonverbal-Whisper/commit/295f3445439fd408771185dc2e63c2c851a28ba4', commit_message='Upload data/ami-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='295f3445439fd408771185dc2e63c2c851a28ba4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Speech-Nonverbal-Whisper', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Speech-Nonverbal-Whisper'), pr_revision=None, pr_num=None)

In [43]:
# Archive the chunk directory (recursively, quiet mode) for upload.
!zip -rq ami-array-chunk.zip ami-array-chunk

In [2]:
# Upload the zipped audio chunks to the same dataset repository.
api.upload_file(
    path_or_fileobj="ami-array-chunk.zip",
    path_in_repo="ami-array-chunk.zip",
    repo_id="mesolitica/Speech-Nonverbal-Whisper",
    repo_type="dataset",
)

ami-array-chunk.zip:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Speech-Nonverbal-Whisper/commit/1ffceb86209606c7b7a7641ad2bb392398581e56', commit_message='Upload ami-array-chunk.zip with huggingface_hub', commit_description='', oid='1ffceb86209606c7b7a7641ad2bb392398581e56', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Speech-Nonverbal-Whisper', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Speech-Nonverbal-Whisper'), pr_revision=None, pr_num=None)