# Pitt cookie audio extraction

In [1]:
from pathlib import Path
import re
import glob
import pandas as pd
import pylangacq as pla

In [2]:
import torchaudio
from torchaudio import transforms

In [96]:
import torch

In [49]:
# Transcripts (.cha): one 'cookie' folder per diagnosis group
pitt_path = Path('../data/Pitt/')
control_path = pitt_path.joinpath('Control', 'cookie')
dementia_path = pitt_path.joinpath('Dementia', 'cookie')

# Source recordings, split by diagnosis group
media_path = Path('../data/media-cookie/Pitt/cookie')
media_control_path = media_path.joinpath('Control')
media_dementia_path = media_path.joinpath('Dementia')

# Where the extracted participant audio is written:
#   utterances/   -> one file per participant utterance
#   descriptions/ -> one file per recording, all participant utterances joined
output_path = Path('../data/media-cookie/Pitt/cookie-par')

utterances_path = output_path.joinpath('utterances')
utterances_control_path = utterances_path.joinpath('Control')
utterances_dementia_path = utterances_path.joinpath('Dementia')

descriptions_path = output_path.joinpath('descriptions')
descriptions_control_path = descriptions_path.joinpath('Control')
descriptions_dementia_path = descriptions_path.joinpath('Dementia')

In [22]:
def extract_times_from_file(file):
    """Extract the time marks of the 'PAR' participant from a .cha file.

    A participant utterance starts on a line beginning with ``*PAR:`` and may
    continue on the following tab-indented lines.  A time mark is the last
    part of a line of the utterance: two integers joined by an underscore and
    enclosed between two NAK (0x15) control characters.

    Parameters
    ----------
    file : str or path-like
        Path of the .cha transcript to read.

    Returns
    -------
    list of (int, int)
        One ``(begin, end)`` pair per time mark found, in file order.
    """
    par_re = re.compile(r'^\*PAR:\s(.*)')
    cont_re = re.compile(r'^\t(.*)')
    # Raw string: the `re` module itself resolves \x15 and \d, so no invalid
    # escape sequence is left for the Python parser to complain about.
    time_re = re.compile(r'.*\x15(\d+)_(\d+)\x15$')

    time_list = []
    in_par = False
    # The context manager closes the file; CHAT transcripts are UTF-8 encoded.
    with open(file, encoding='utf-8') as cha_file:
        for raw_line in cha_file:
            line = raw_line.rstrip('\n')

            # Always test for a new '*PAR:' line first: checking only the
            # continuation pattern while inside an utterance would drop a PAR
            # line that immediately follows another one.
            match = par_re.match(line)
            if match is None and in_par:
                match = cont_re.match(line)

            if match is None:
                # Any other line (another speaker, a dependent tier, a header)
                # ends the current participant utterance.
                in_par = False
                continue

            in_par = True
            time_match = time_re.match(match.group(1))
            if time_match:
                time_begin = int(time_match.group(1))
                time_end = int(time_match.group(2))
                time_list.append((time_begin, time_end))

    return time_list

# Sanity check on a single transcript of the dementia group
times = extract_times_from_file(file=dementia_path.joinpath('703-0.cha'))
times

[(3767, 7949),
 (8972, 11606),
 (12100, 13307),
 (20247, 25102),
 (25102, 26749),
 (27446, 34019)]

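Each pair is the (begin, end) position of an utterance within the recording. The values appear to be in milliseconds, and the extraction below treats them as such.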

## Extract all utterances and complete patient descriptions

We could take the opportunity to downsample and convert to mono to avoid doing it during training. We'll keep the source signal characteristics for now.

In [150]:
def extract_audio(source_cha_path, source_audio_path, utterances_output, descriptions_output):
    """Cut the participant's speech out of every recording that has a transcript.

    For each .cha file matched by ``source_cha_path`` the .mp3 with the same
    stem is loaded from ``source_audio_path``.  Every time mark returned by
    ``extract_times_from_file`` is converted from milliseconds to samples and
    the corresponding fragment is saved as ``<stem>-<index>.mp3`` in
    ``utterances_output``.  All saved fragments of a recording are also joined,
    in order, and saved as ``<stem>.mp3`` in ``descriptions_output``.

    Recordings that cannot be loaded are reported and skipped.  Empty
    fragments are reported and skipped; a recording with no usable fragment
    gets no description file.

    Parameters
    ----------
    source_cha_path : str or Path
        Glob pattern selecting the .cha transcripts (e.g. ``folder/'*.cha'``).
    source_audio_path : str or Path
        Folder holding the source .mp3 recordings.
    utterances_output : str or Path
        Folder receiving one file per utterance (created if missing).
    descriptions_output : str or Path
        Folder receiving one file per recording (created if missing).
    """
    source_audio_path = Path(source_audio_path)
    utterances_output = Path(utterances_output)
    descriptions_output = Path(descriptions_output)

    # Make sure the destination folders exist before anything is saved.
    utterances_output.mkdir(parents=True, exist_ok=True)
    descriptions_output.mkdir(parents=True, exist_ok=True)

    for file in sorted(glob.glob(str(source_cha_path))):
        stem = Path(file).stem
        audio = source_audio_path / f'{stem}.mp3'
        print("Processing", audio)

        times = extract_times_from_file(file)
        try:
            (signal, sr) = torchaudio.load(str(audio))
        except Exception as err:
            # Exception (not a bare except) so that interrupting the kernel
            # still stops the loop; the reason of the failure is reported.
            print("Could not open file", audio, f"({err})")
            continue

        fragments = []
        for i, (t_begin, t_end) in enumerate(times):
            # Time marks are in milliseconds: convert them to sample indices.
            s_begin = round(t_begin * sr / 1000)
            s_end = round(t_end * sr / 1000)
            # The signal is sliced along dimension 1, keeping every row of dimension 0.
            utterance = signal[:, s_begin:s_end]
            # Some annotations yield 0 samples (e.g., '066-0' in the dementia set)
            # The values seem to be off in that file
            # TODO: check others
            if utterance.shape[1] == 0:
                print("Warning: 0-size fragment", i, audio)
                continue
            utterance_out = utterances_output / f'{stem}-{i:02}.mp3'
            torchaudio.save(str(utterance_out), utterance, sr)
            fragments.append(utterance)

        if not fragments:
            # Nothing to join: saving would fail, so skip the description.
            print("Warning: no usable fragments, description skipped", audio)
            continue

        # Concat all participant utterances in one go
        description = torch.cat(fragments, 1)
        description_out = descriptions_output / f'{stem}.mp3'
        torchaudio.save(str(description_out), description, sr)

In [None]:
# Run the extraction over the whole corpus: control group first, then dementia
extract_audio(
    source_cha_path=control_path.joinpath('*.cha'),
    source_audio_path=media_control_path,
    utterances_output=utterances_control_path,
    descriptions_output=descriptions_control_path,
)
extract_audio(
    source_cha_path=dementia_path.joinpath('*.cha'),
    source_audio_path=media_dementia_path,
    utterances_output=utterances_dementia_path,
    descriptions_output=descriptions_dementia_path,
)