In [None]:
# Run configuration: Hugging Face dataset id, Whisper model id and the
# '+'-separated list of dataset splits to process.
dts = 'Jzuluaga/atcosim_corpus'
mdl = 'openai/whisper-large-v3'
spl = 'train+test'

# Number of requested splits (one more than the number of '+' separators).
print(spl.count('+') + 1)

# Whisper checkpoint name: everything after the first '-' of the model id.
wsp = mdl.partition('-')[2]

# Echo the configuration so it is recorded next to the results.
for _label, _value in (
    ('Dataset: ', dts),
    ('Model  : ', mdl),
    ('Split  : ', spl),
    ('Whisper: ', wsp),
):
    print(_label, _value)

2
Dataset:  Jzuluaga/atcosim_corpus
Model  :  openai/whisper-large-v3
Split  :  train+test
Whisper:  large-v3


In [None]:
# Authenticate this notebook session with the Hugging Face Hub; the call
# renders an interactive login widget in the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install datasets
from datasets import load_dataset, Audio
dataset = load_dataset(dts)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 7638
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 1901
    })
})

In [None]:
# Keep only a small, fixed prefix of each split so the inference run stays
# short: the first 400 training rows and the first 100 test rows.
for _split_name, _row_count in (("train", 400), ("test", 100)):
    dataset[_split_name] = dataset[_split_name].select(range(_row_count))

# Show the trimmed DatasetDict; only the row counts change, the columns
# stay the same.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 400
    })
    test: Dataset({
        features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
        num_rows: 100
    })
})

In [None]:
!pip install git+https://github.com/openai/whisper.git
import numpy as np
import pandas as pd
from datetime import datetime
import whisper

df = pd.DataFrame(columns=['split', 'hyp-prmpt', 'hyp-clean', 'ref'])

model = whisper.load_model('-'.join(mdl.split('-')[1:]))

print('Starting inference...')
nato = "alpha,bravo,charlie,delta,echo,foxtrot,golf,hotel,india,juliett,kilo,lima,mike,november,oscar,papa,quebec,romeo,sierra,tango,uniform,victor,whiskey,xray,yankee,zulu"
terminology = "climb, climbing, descend, descending, passing, feet, knots, degrees, direct, maintain, identified, ILS, VFR, IFR, contact, frequency, turn, right, left, heading, altitude, flight, level, cleared, squawk, approach, runway, established, report, affirm, negative, wilco, roger, radio, radar"

for s in spl.split('+'):
    for i in range(len(dataset[s])):
        audio = dataset[s][i]['audio']['array']
        audio = whisper.pad_or_trim(audio)

        if wsp == 'large-v3':
            mel = whisper.log_mel_spectrogram(np.float32(audio), n_mels=128).to(model.device)
        else:
            mel = whisper.log_mel_spectrogram(np.float32(audio)).to(model.device)

        try:
            prompt = 'Air Traffic Control Communications ' + dataset[s][i]['info'].replace('\n', ' ') + ' ' + nato.replace(',',' ') + ' ' + terminology.replace(',',' ')

        except:
            inf = ''
            prompt = 'Air Traffic Control Communications ' + nato.replace(',',' ') + ' ' + terminology.replace(',',' ')

        options = whisper.DecodingOptions(language='en', prompt=prompt, fp16=False)
        res_prmpt = whisper.decode(model, mel, options=options)
        options = whisper.DecodingOptions(language='en', fp16=False)
        res_clean = whisper.decode(model, mel, options=options)

        df.loc[len(df.index)] = [s, res_prmpt.text, res_clean.text, dataset[s][i]['text']]

        print(s, str(int(i/len(dataset[s])*100))+'%', end='\r')
df.to_excel(dts.split('/')[-1]+'-'+spl+'-'+mdl.split('/')[-1]+'-'+datetime.today().strftime('%Y-%m-%d--%H:%M:%S')+'.xlsx')

df

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-dq2ixf81
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-dq2ixf81
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [None]:
# Normalization
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import jiwer

# Fetch the NLTK corpora needed by the stop-word filter and the lemmatizer.
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Define the normalization function
from functools import lru_cache


@lru_cache(maxsize=1)
def _normalization_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def filter_and_normalize(text):
    """Return a normalized copy of ``text`` for WER scoring.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is removed (so digits and punctuation disappear), English
    stop words are dropped and the remaining words are lemmatized.

    Args:
        text: The string to normalize.

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        no word survives.
    """
    # The stop-word list and the lemmatizer do not depend on `text`, so they
    # are created on the first call only instead of on every call.
    stop_words, lemmatizer = _normalization_resources()

    # Lower-case first so that the character filter only needs to know a-z.
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)

    # Whitespace tokenization, stop-word removal, then lemmatization.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Sample DataFrame: toy rows with the same column layout as the inference
# results. They are only a fallback for trying out the normalization when no
# transcripts are available.
data = {
    'ref': ['This is a sample text.', 'Another example!'],
    'hyp-clean': ['Sample text for hypothesis.', 'Example text.'],
    'hyp-prmpt': ['Prompted text.', 'Yet another example.'],
    'split': ['train', 'test']  # Adding the split column
}

# Normalize the real inference results whenever an earlier cell produced them.
# Rebuilding `df` from the toy rows unconditionally would throw the
# transcripts away, and every WER computed from `df` afterwards would describe
# the toy sentences instead of the corpus.
if 'df' not in globals() or len(df) == 0:
    print('No inference results in `df` - falling back to the sample DataFrame.')
    df = pd.DataFrame(data)

# Apply the normalization function to the reference and to both hypotheses.
df['ref-norm'] = df['ref'].map(filter_and_normalize)
df['hyp-clean-norm'] = df['hyp-clean'].map(filter_and_normalize)
df['hyp-prmpt-norm'] = df['hyp-prmpt'].map(filter_and_normalize)

# Display the result
print(df.head())

                      ref                    hyp-clean             hyp-prmpt  \
0  This is a sample text.  Sample text for hypothesis.        Prompted text.   
1        Another example!                Example text.  Yet another example.   

   split         ref-norm          hyp-clean-norm       hyp-prmpt-norm  
0  train      sample text  sample text hypothesis        prompted text  
1   test  another example            example text  yet another example  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
!pip install jiwer
import jiwer

def calcWER(df, spl):
    """Print the word error rate of both hypothesis sets for the chosen splits.

    Args:
        df: DataFrame with the columns 'split', 'ref', 'hyp-clean',
            'hyp-prmpt', 'ref-norm', 'hyp-clean-norm' and 'hyp-prmpt-norm'.
        spl: One split name, or several joined with '+' (e.g. 'train+test').

    Four percentages are printed, rounded to four decimals: un-prompted and
    prompted hypotheses, each on the raw and on the normalized text.
    Nothing is returned.
    """
    wanted_splits = spl.split('+')
    subset = df[df['split'].isin(wanted_splits)]

    # (output template, reference column, hypothesis column) in print order.
    metrics = (
        ('clean        : {} %', 'ref', 'hyp-clean'),
        ('prmpt        : {} %', 'ref', 'hyp-prmpt'),
        ('clean-norm   : {} %', 'ref-norm', 'hyp-clean-norm'),
        ('prmpt-norm   : {} %', 'ref-norm', 'hyp-prmpt-norm'),
    )

    # Score everything first so that a failure never leaves a partial report.
    scores = [
        jiwer.wer(subset[ref_col].tolist(), subset[hyp_col].tolist())
        for _, ref_col, hyp_col in metrics
    ]

    for (template, _, _), score in zip(metrics, scores):
        print(template.format(round(score * 100, 4)))

# Split Train+Test: score both splits together.
spl = 'train+test'
# Whisper checkpoint name: everything after the first '-' of the model id.
wsp = mdl.partition('-')[2]

# Echo the run configuration above the WER figures.
for _label, _value in (
    ('Dataset: ', dts),
    ('Model  : ', mdl),
    ('Split  : ', spl),
    ('Whisper: ', wsp),
):
    print(_label, _value)

calcWER(df, spl)

Dataset:  Jzuluaga/atcosim_corpus
Model  :  openai/whisper-large-v3
Split  :  train+test
Whisper:  large-v3
clean        : 100.0 %
prmpt        : 100.0 %
clean-norm   : 75.0 %
prmpt-norm   : 50.0 %


In [None]:
# Split test: score the test split on its own.
spl = 'test'
# Whisper checkpoint name: everything after the first '-' of the model id.
wsp = mdl.partition('-')[2]

# Echo the run configuration above the WER figures.
for _label, _value in (
    ('Dataset: ', dts),
    ('Model  : ', mdl),
    ('Split  : ', spl),
    ('Whisper: ', wsp),
):
    print(_label, _value)

calcWER(df, spl)

Dataset:  Jzuluaga/atcosim_corpus
Model  :  openai/whisper-large-v3
Split  :  test
Whisper:  large-v3
clean        : 100.0 %
prmpt        : 150.0 %
clean-norm   : 100.0 %
prmpt-norm   : 50.0 %
