In [1]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/extra.zip
# !unzip extra.zip

In [2]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/data/extra-00000-of-00001.parquet
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/resolve/main/data/science_context-00000-of-00001.parquet

In [3]:
# !wget https://huggingface.co/datasets/mesolitica/pseudolabel-science-large-v3-timestamp/resolve/main/check-same-science-context.jsonl

In [4]:
word = [
    'audio to Whisper ASR format word timestamp',
    'transcribe the audio into Whisper format in word timestamp'
]

srt = [
    'audio to SRT format',
    'transcribe the audio into srt format',
]

ttml = [
    'audio to TTML format',
    'transcribe the audio into ttml format',
]

segment = [
    'audio to Whisper ASR format',
    'transcribe the audio into Whisper format'
]

transcribe = ['transcribe the audio']

In [5]:
import re
import pandas as pd
import random
import os
import json
from tqdm import tqdm
from glob import glob
from xml.etree.ElementTree import Element, SubElement, tostring
import xml.dom.minidom

pattern = re.compile(r'<\|([\d.]+)\|>([^<]*)')

def format_srt_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)
    millis = int((seconds - int(seconds)) * 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

def get_subtitles(input_text):
    entries = pattern.findall(input_text)

    subtitles = []
    for i in range(len(entries) - 1):
        start_time = float(entries[i][0])
        end_time = float(entries[i + 1][0])
        text = entries[i][1].strip()
        if text:
            subtitles.append((start_time, end_time, text))
            
    return subtitles

def get_srt(subtitles):
    srt_output = ""
    for idx, (start, end, text) in enumerate(subtitles, start=1):
        srt_output += f"{idx}\n"
        srt_output += f"{format_srt_time(start)} --> {format_srt_time(end)}\n"
        srt_output += f"{text}\n\n"
    return srt_output

def format_ttml_time(seconds):
    return f"{int(seconds // 3600):02}:{int((seconds % 3600) // 60):02}:{int(seconds % 60):02}.{int((seconds % 1) * 1000):03}"

def get_tt(subtitles):
    tt = Element('tt', xmlns="http://www.w3.org/ns/ttml")
    body = SubElement(tt, 'body')
    div = SubElement(body, 'div')

    for start, end, text in subtitles:
        SubElement(div, 'p', begin=format_ttml_time(start), end=format_ttml_time(end)).text = text

    xml_str = xml.dom.minidom.parseString(tostring(tt)).toprettyxml(indent="  ")
    return str(xml_str)

In [6]:
df = pd.read_parquet('extra-00000-of-00001.parquet')

In [7]:
extra = []

for i in range(len(df)):
    if not os.path.exists(df['audio_filename'].iloc[i]):
        continue
    t = df['segment_timestamp'].iloc[i]
    extra.append({
        'question': random.choice(segment),
        'answer': t,
        'audio_filename': df['audio_filename'].iloc[i],
    })
    
    extra.append({
        'question': random.choice(transcribe),
        'answer': re.sub(r"<\|.*?\|>", "", t).strip(),
        'audio_filename': df['audio_filename'].iloc[i],
    })
    
    try:
        input_text = t.split('<|transcribe|>')[1]
        subtitles = get_subtitles(input_text)
    except:
        continue
        
    try:
        extra.append({
            'question': random.choice(srt),
            'answer': get_srt(subtitles),
            'audio_filename': df['audio_filename'].iloc[i],
        })
    except:
        pass
    
    try:
        extra.append({
            'question': random.choice(ttml),
            'answer': get_tt(subtitles),
            'audio_filename': df['audio_filename'].iloc[i],
        })
    except Exception as e:
        print(e)
        pass
    
len(extra)

30984

In [8]:
extra[-1]

{'question': 'transcribe the audio into ttml format',
 'answer': '<?xml version="1.0" ?>\n<tt xmlns="http://www.w3.org/ns/ttml">\n  <body>\n    <div>\n      <p begin="00:00:01.219" end="00:00:05.839">Pada pendapat kitak nak, seronok sik permainan congkak tok?</p>\n      <p begin="00:00:06.839" end="00:00:12.519">Kamek rasa nya best sebab aura persaingan nya sangatlah kuat.</p>\n    </div>\n  </body>\n</tt>\n',
 'audio_filename': 'audio/SM_FF_CONGKAK_001-6.mp3'}

In [9]:
df = pd.read_parquet('science_context-00000-of-00001.parquet')
df.head()

Unnamed: 0,audio_filename,word_timestamp,segment_timestamp
0,prepared-science-chunks/0-1.mp3,<|startoftranscript|><|en|><|transcribeprecise...,<|startoftranscript|><|en|><|transcribe|><|0.0...
1,prepared-science-chunks/2-0.mp3,<|startoftranscript|><|en|><|transcribeprecise...,<|startoftranscript|><|en|><|transcribe|><|0.0...
2,prepared-science-chunks/2-1.mp3,<|startoftranscript|><|en|><|transcribeprecise...,<|startoftranscript|><|en|><|transcribe|><|0.0...
3,prepared-science-chunks/3-0.mp3,<|startoftranscript|><|en|><|transcribeprecise...,<|startoftranscript|><|en|><|transcribe|><|0.0...
4,prepared-science-chunks/4-0.mp3,<|startoftranscript|><|en|><|transcribeprecise...,<|startoftranscript|><|en|><|transcribe|><|0.0...


In [18]:
from collections import Counter

def detect_repeated_phrases(text, n=3):
    words = text.lower().split()
    phrases = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
    counter = Counter(phrases)
    return [phrase for phrase, count in counter.items() if count > 2]

science = []

with open('check-same-science-context.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        audio_filename = l['audio_filename']
        if not os.path.exists(audio_filename):
            continue
        
        t = l['new_text']
        t_ = re.sub(r"<\|.*?\|>", "", t).strip()
        if len(t_.split()) < 10:
            continue
            
        if len(detect_repeated_phrases(t_, n=3)):
            continue
        
        if random.random() > 0.2:
            continue
        
        science.append({
            'question': random.choice(segment),
            'answer': t,
            'audio_filename': audio_filename,
        })

        science.append({
            'question': random.choice(transcribe),
            'answer': t_,
            'audio_filename': audio_filename,
        })
        
        try:
            input_text = t.split('<|transcribe|>')[1]
            subtitles = get_subtitles(input_text)
        except Exception as e:
            print(e)
            continue
        
        if random.random() > 0.7:
            try:
                science.append({
                    'question': random.choice(srt),
                    'answer': get_srt(subtitles),
                    'audio_filename': df['audio_filename'].iloc[i],
                })
            except:
                pass
        
        if random.random() > 0.7:
            try:
                science.append({
                    'question': random.choice(ttml),
                    'answer': get_tt(subtitles),
                    'audio_filename': df['audio_filename'].iloc[i],
                })
            except Exception as e:
                print(e)
                pass

849147it [00:48, 17379.36it/s]


In [19]:
len(science)

402124

In [21]:
science[:10]

[{'question': 'transcribe the audio into Whisper format',
  'answer': "<|startoftranscript|><|en|><|transcribe|><|0.00|> significant number of people in this world who also has experienced lost by way of a missing<|6.36|><|6.36|> person. And that is what happened with my family. So in 1987, my sister, Anita Wiley, disappeared<|14.84|><|14.84|> from Detroit really without a trace. And what makes the story a little more complicated is that<|21.26|><|21.26|> I never knew about Anita. She was a half-sister on my father's side. And I learned about her<|28.00|><|28.00|><|endoftext|>",
  'audio_filename': 'chunk/mp3-16k-0-0_003.mp3'},
 {'question': 'transcribe the audio',
  'answer': "significant number of people in this world who also has experienced lost by way of a missing person. And that is what happened with my family. So in 1987, my sister, Anita Wiley, disappeared from Detroit really without a trace. And what makes the story a little more complicated is that I never knew about Anita. 

In [22]:
pd.DataFrame(extra).to_parquet('extra-stt-extra.parquet')

In [23]:
pd.DataFrame(science).to_parquet('extra-stt-science.parquet')

In [24]:
!huggingface-cli upload mesolitica/Transcription-Instructions \
extra-stt-extra.parquet /data/extra-00000-of-00001.parquet --repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|███████████████████████████| 931k/931k [00:05<00:00, 158kB/s]
https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/extra-00000-of-00001.parquet


In [25]:
!huggingface-cli upload mesolitica/Transcription-Instructions \
extra-stt-science.parquet /data/science-00000-of-00001.parquet --repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 77.2M/77.2M [00:12<00:00, 6.37MB/s]
https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/science-00000-of-00001.parquet
