In [222]:
import os
import platform
import pyarrow.parquet
import subprocess

from tqdm import tqdm

import numpy as np
import pandas as pd

In [253]:
class Audio:
    """Small wrapper around the ``ffmpeg`` command-line tool for audio extraction.

    The sample rate and channel count given at construction are applied to
    every file written by :meth:`ffmpeg`.
    """

    def __init__(self, sr=16000, channels=1):
        """Store the output settings.

        Parameters
        ----------
        sr : int
            Value passed to ffmpeg's ``-ar`` option (output sample rate).
        channels : int
            Value passed to ffmpeg's ``-ac`` option (output channel count).
        """
        self.sr = sr
        self.channels = channels

    def ffmpeg(self, input, output, start_time=None, duration=None, overwrite=False):
        """Convert (a segment of) ``input`` to a 16-bit PCM audio file at ``output``.

        Parameters
        ----------
        input : str
            Path of the source media file. (The name shadows the builtin but is
            kept so keyword callers keep working.)
        output : str
            Path of the file to write.
        start_time : optional
            If given, passed to ffmpeg as ``-ss`` (segment start).
        duration : optional
            If given, passed to ffmpeg as ``-t`` (segment length).
        overwrite : bool
            When False (default) an existing ``output`` is left untouched.

        Returns
        -------
        int or None
            The ffmpeg exit status (0 on success), or ``None`` when nothing was
            run because ``input`` does not exist or ``output`` already exists
            and ``overwrite`` is False.
        """
        # Nothing to convert without a source file.
        if not os.path.isfile(input):
            return None
        # Respect an existing result unless the caller asked to replace it.
        if os.path.isfile(output) and not overwrite:
            return None

        cmd = ['ffmpeg',
               '-y',                      # never prompt; the guards above decide about overwriting
               '-i', input,
               '-vn',                     # drop any video stream
               '-ar', str(self.sr),
               '-ac', str(self.channels),
               '-acodec', 'pcm_s16le']

        # Placed after the input, so these act as output options.
        if start_time is not None:
            cmd.extend(['-ss', str(start_time)])
        if duration is not None:
            cmd.extend(['-t', str(duration)])

        cmd.append(output)

        # Hand the exit status back so callers can detect a failed conversion.
        return subprocess.call(cmd)


def preprocess_asr(asr_data):
    """Collapse word-level ASR rows into one row per speech.

    ``asr_data`` must provide the columns ``speech_id``, ``segment_id``,
    ``start``, ``end``, ``word`` and ``word_prob``. Note that the columns
    ``debate_id`` and ``speechnumber`` are written onto ``asr_data`` itself.

    Returns
    -------
    collapsed : DataFrame
        One row per ``speech_id`` with the integer ``speechnumber``, the
        smallest ``start``, the largest ``end`` and the space-joined words in
        ``text``; sorted by ``speechnumber``. This frame is NOT length-filtered.
    speechid, text, textlength : list
        Parallel lists restricted to speeches whose text has at least 30
        characters: the speech id, the text and the character count.
    """
    # Everything before the last '-' is the debate id; the last field is the speech number.
    debate_ids = []
    speech_numbers = []
    for sid in asr_data['speech_id']:
        *prefix, number = sid.split('-')
        debate_ids.append('-'.join(prefix))
        speech_numbers.append(number)
    asr_data['debate_id'] = debate_ids
    asr_data['speechnumber'] = speech_numbers

    wanted = ['debate_id', 'speechnumber', 'speech_id', 'segment_id',
              'start', 'end', 'word', 'word_prob']
    words = asr_data[wanted].reset_index(drop=True)

    # One row per speech: time span of the speech and its words joined in row order.
    per_speech = words.groupby('speech_id').agg({
        'speechnumber': 'min',
        'start': 'min',
        'end': 'max',
        'word': ' '.join})
    collapsed = per_speech.reset_index().rename(columns={'word': 'text'})

    collapsed['speechnumber'] = collapsed['speechnumber'].astype(int)
    collapsed = collapsed.sort_values('speechnumber').reset_index(drop=True)

    # Build the three filtered lists in a single pass (minimum of 30 characters).
    speechid, text, textlength = [], [], []
    for sid, speech_text in zip(collapsed['speech_id'].tolist(), collapsed['text'].tolist()):
        n_chars = len(speech_text)
        if n_chars >= 30:
            speechid.append(sid)
            text.append(speech_text)
            textlength.append(n_chars)

    return collapsed, speechid, text, textlength

In [154]:
# # # # Working Directory # # # #

# User-specific root folder: one value on Linux, another on every other platform.
rootwd = '/home/rask/' if platform.system() == 'Linux' else 'C:/Users/au535365/'

# Course folder below that root.
wd = os.path.join(rootwd, 'Dropbox/teaching/css_fall2023')

# Make the course folder the working directory so relative paths resolve against it.
os.chdir(wd)

# Show the active directory as a sanity check.
os.getcwd()

'/home/rask/Dropbox/teaching/css_fall2023'

In [127]:
# Generate file ids
files = ['20001',
         '20011',
         '20012',
         '20021',
         '20031',
         '20041',
         '20042',
         '20051',
         '20061',
         '20071',
         '20072',
         '20081',
         '20091']

# Specify base url
base_url = 'https://raw.githubusercontent.com/mraskj/css_fall2023/master/data/ft-speeches/'

# Read in data: download one CSV per file id and keep only the files that
# contain more than 10,000 rows.
term_frames = []
for file in tqdm(files):
    df_term = pd.read_csv(base_url + file + '.csv')
    if len(df_term) > 10000:
        term_frames.append(df_term)

# Concatenate once at the end instead of growing the frame inside the loop
# (repeated concat copies all accumulated rows on every iteration).
# `ignore_index=True` gives a fresh 0..n-1 index; fall back to an empty frame
# when no file passed the size filter, because pd.concat rejects an empty list.
df = pd.concat(term_frames, ignore_index=True) if term_frames else pd.DataFrame()

100%|███████████████████████████████████████████| 13/13 [00:17<00:00,  1.32s/it]


In [128]:
# Number of whitespace-separated tokens per speech. (Calling len() on the raw
# string would count characters, which is not what the column name promises.)
df['n_words'] = df['text'].str.split().str.len()

# Keep the speeches whose word count lies within the interquartile range,
# both bounds included.
p25, p75 = df['n_words'].quantile([0.25, 0.75])
df = df.loc[df['n_words'].between(p25, p75)]

In [133]:
# Keep only the rows of period 20091 and renumber them from zero.
df = df[df['period'].eq(20091)].reset_index(drop=True)

In [134]:
# Summary statistics of the `spitch` column for every `sppid`.
spitch_by_speaker = df.groupby('sppid')['spitch']
speaker_df = spitch_by_speaker.describe().reset_index()

# Restrict the summary to the two speakers used in the rest of the notebook.
is_selected = speaker_df['sppid'].isin(['Özlem Sara Cekic_SF', 'Claus Hjort Frederiksen_V'])
speakers = speaker_df[is_selected]

In [224]:
def _nearest_quantile_position(speeches, q):
    """Return the row position whose `spitch` is closest to the q-quantile of `spitch`."""
    spitch = speeches['spitch']
    return np.argmin((spitch - spitch.quantile(q)).abs())


# First selected speaker: all speeches, then the ones nearest the 10% and 90% quantiles.
speaker0_df = df[df['sppid'] == speakers.iloc[0].sppid].reset_index(drop=True)
q10_ix = _nearest_quantile_position(speaker0_df, 0.10)
q90_ix = _nearest_quantile_position(speaker0_df, 0.90)

speaker0_speechq10 = speaker0_df.iloc[q10_ix]
speaker0_speechq90 = speaker0_df.iloc[q90_ix]

# Second selected speaker: same procedure.
speaker1_df = df[df['sppid'] == speakers.iloc[1].sppid].reset_index(drop=True)
q10_ix = _nearest_quantile_position(speaker1_df, 0.10)
q90_ix = _nearest_quantile_position(speaker1_df, 0.90)

speaker1_speechq10 = speaker1_df.iloc[q10_ix]
speaker1_speechq90 = speaker1_df.iloc[q90_ix]

In [235]:
# A debate id is the speech id with its last '-'-separated field removed.
(debate_id_speaker0_speechq10,
 debate_id_speaker0_speechq90,
 debate_id_speaker1_speechq10,
 debate_id_speaker1_speechq90) = (
    '-'.join(selected.speech_id.split('-')[:-1])
    for selected in (speaker0_speechq10, speaker0_speechq90,
                     speaker1_speechq10, speaker1_speechq90)
)

# The speech ids themselves, one per selected speech.
speech_id_speaker0_speechq10, speech_id_speaker0_speechq90 = (
    speaker0_speechq10.speech_id, speaker0_speechq90.speech_id)

speech_id_speaker1_speechq10, speech_id_speaker1_speechq90 = (
    speaker1_speechq10.speech_id, speaker1_speechq90.speech_id)

In [246]:
# Four parallel lists, one entry per selected speech, in the order
# (speaker0, q10), (speaker0, q90), (speaker1, q10), (speaker1, q90).
speaker_list = ['speaker0'] * 2 + ['speaker1'] * 2
quantile_list = ['q10', 'q90'] * 2

debate_id_list = [
    debate_id_speaker0_speechq10,
    debate_id_speaker0_speechq90,
    debate_id_speaker1_speechq10,
    debate_id_speaker1_speechq90,
]

speech_id_list = [
    speech_id_speaker0_speechq10,
    speech_id_speaker0_speechq90,
    speech_id_speaker1_speechq10,
    speech_id_speaker1_speechq90,
]

In [264]:
# Assemble the parallel lists into one table with a row per selected speech.
speech_df = pd.DataFrame(dict(zip(
    ['speaker', 'debate_id', 'speech_id', 'qs'],
    [speaker_list, debate_id_list, speech_id_list, quantile_list],
)))

In [265]:
# Initiate audio class
audio_processor = Audio()

# Folder with one sub-folder per debate (each holding `speeches.parquet` and
# `<debate_id>.wav`), and the folder the extracted clips are written to.
asr_root = '/media/rask/DK/DK'
out_dir = 'data/audio'

# ffmpeg does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for row in speech_df.itertuples(index=False):

    debate_id = row.debate_id
    speech_id = row.speech_id
    speaker = row.speaker
    q = row.qs

    # Load the ASR table of this debate and rebuild the speech id as
    # '<debate_id>-<speechnumber>' so it can be matched against `speech_id`.
    asr = pyarrow.parquet.read_table(f'{asr_root}/{debate_id}/speeches.parquet')
    asr_df = asr.to_pandas()
    asr_df['speech_id'] = [str(x) + '-' + str(y) for x, y in zip(asr_df.debate_id, asr_df.speechnumber)]

    # Start/end of the first row matching the speech; fail with a readable
    # message instead of an opaque out-of-bounds error when there is none.
    hits = asr_df.loc[asr_df['speech_id'] == str(speech_id), ['start', 'end']]
    if hits.empty:
        raise IndexError(f'speech {speech_id!r} not found in the ASR table of debate {debate_id!r}')
    start, end = hits.values[0]
    duration = np.round(end - start, 4)

    # Cut [start, start + duration] out of the debate recording.
    infile = f'{asr_root}/{debate_id}/{debate_id}.wav'
    segment_fpath = f'{out_dir}/{speaker}_{q}.wav'
    audio_processor.ffmpeg(infile, segment_fpath, start, duration)