In [None]:
# !7z x drive/MyDrive/dl2/train.7z -odrive/MyDrive/dl2/extracted/train
# !find drive/MyDrive/dl2/extracted/train -type f | wc -l

In [1]:
%%capture
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer

# START

In [2]:
from google.colab import drive
drive.mount('/content/drive')
from scipy.io import wavfile
import whisper
import random
from tqdm import tqdm
import torch

random.seed(123)

def force_cudnn_initialization():
    s = 2
    dev = torch.device('cuda')
    torch.nn.functional.conv2d(torch.zeros(s, s, s, s, device=dev), torch.zeros(s, s, s, s, device=dev))
if torch.cuda.is_available():
    force_cudnn_initialization()
else:
    print('No cuda')

Mounted at /content/drive


## Listing data

In [3]:
import os

root_path = 'drive/MyDrive/dl2/extracted/'
train_path = root_path + 'train/train/audio/'
test_path = root_path + 'test/test/audio/'

train_classes_names = [c for c in os.listdir(train_path) if c != '_background_noise_']
print(f"Found {len(train_classes_names)} train classes")
print(train_classes_names[:10])
print(train_classes_names[10:20])
print(train_classes_names[20:])

Found 30 train classes
['bed', 'five', 'bird', 'eight', 'happy', 'house', 'go', 'left', 'four', 'dog']
['cat', 'down', 'no', 'seven', 'nine', 'right', 'marvin', 'on', 'off', 'one']
['sheila', 'six', 'yes', 'zero', 'wow', 'up', 'tree', 'two', 'stop', 'three']


# Dataset class

In [4]:
class WhisperDataset(torch.utils.data.Dataset):
    def __init__(self, train_path, size=None, selected_class=None):
        train_classes_names = os.listdir(train_path)
        self.dataset = []
        train_classes_names = [c for c in train_classes_names if c != '_background_noise_']
        train_classes_paths = [train_path + f for f in train_classes_names]
        image_filenames = []
        image_filenames_by_class = {}
        for class_name in train_classes_names:
            filenames = os.listdir(train_path + class_name)
            image_filenames_by_class[class_name] = []
            for filename in filenames:
                image_filenames.append((train_path, class_name, '/' , filename))
                image_filenames_by_class[class_name].append((train_path, class_name, '/' , filename))
        
        if selected_class != None:
            image_filenames_by_class = {selected_class: image_filenames_by_class[selected_class]}
            image_filenames = image_filenames_by_class[selected_class]

        if size != None and size > 0:
            image_filenames_selected = [] 
            size_per_class = round(size/len(image_filenames_by_class))
            print(f'Recordings per class: {size_per_class}')
            for class_name, class_filenames in image_filenames_by_class.items():
                print(f'class {class_name} - recordings found: {len(class_filenames)}')
                selected_from_this_class = random.sample(class_filenames, size_per_class)
                image_filenames_selected += selected_from_this_class
            image_filenames = image_filenames_selected
        

        for long_filename in tqdm(image_filenames):
            train_path, class_name, slash, filename = long_filename
            whole_filename_path = train_path + class_name + '/' + filename
            bitrate, array_file = wavfile.read(whole_filename_path)
            audio = torch.tensor(array_file).to(torch.float)
            # audio = whisper.pad_or_trim(audio.flatten())
            assert bitrate == 16000
            
            self.dataset.append((train_path, class_name, '/' , filename, audio))

            # mel = whisper.log_mel_spectrogram(audio)
            # self.dataset.append((mel, class_name))
              
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        train_path, class_name, slash, filename, audio = self.dataset[item]

        audio = whisper.pad_or_trim(audio.flatten())
        mel = whisper.log_mel_spectrogram(audio)

        return (mel, class_name)


# Exporting datasets to pickles

In [5]:
# trainsets_root = '/content/drive/MyDrive/dl2/train_dataset/'
# for class_name in train_classes_names:
#     SIZE = -1
#     print(f'Starting class: {class_name}')
#     train_dataset = WhisperDataset(train_path, size=SIZE, selected_class=class_name)
#     pickled_name = f'trainset_class_{class_name}_size_{len(train_dataset)}.pt'
#     torch.save(train_dataset, trainsets_root + pickled_name)
#     print(f'Saved class: {class_name} to {pickled_name}')

# Loading ready datasets

In [6]:
trainsets_root = '/content/drive/MyDrive/dl2/train_dataset/'
datasets_pickles_names = {v.split('_')[2] : v for v in os.listdir(trainsets_root)}
# datasets_dict = {k:torch.load(trainsets_root + v) for k, v in datasets_pickles_names.items()} # load all datasets

def load_ready_dataset(class_name):
    trainsets_root = '/content/drive/MyDrive/dl2/train_dataset/'
    datasets_pickles_names = {v.split('_')[2] : v for v in os.listdir(trainsets_root)} 
    return torch.load(trainsets_root + datasets_pickles_names[class_name])

print(f'Available datasets: {len(datasets_pickles_names)}')
datasets_pickles_names

Available datasets: 30


{'bed': 'trainset_class_bed_size_1713.pt',
 'five': 'trainset_class_five_size_2357.pt',
 'bird': 'trainset_class_bird_size_1731.pt',
 'eight': 'trainset_class_eight_size_2352.pt',
 'happy': 'trainset_class_happy_size_1742.pt',
 'house': 'trainset_class_house_size_1750.pt',
 'go': 'trainset_class_go_size_2372.pt',
 'left': 'trainset_class_left_size_2353.pt',
 'four': 'trainset_class_four_size_2372.pt',
 'dog': 'trainset_class_dog_size_1746.pt',
 'cat': 'trainset_class_cat_size_1733.pt',
 'down': 'trainset_class_down_size_2359.pt',
 'no': 'trainset_class_no_size_2375.pt',
 'seven': 'trainset_class_seven_size_2377.pt',
 'nine': 'trainset_class_nine_size_2364.pt',
 'right': 'trainset_class_right_size_2367.pt',
 'marvin': 'trainset_class_marvin_size_1746.pt',
 'on': 'trainset_class_on_size_2367.pt',
 'off': 'trainset_class_off_size_2357.pt',
 'one': 'trainset_class_one_size_2370.pt',
 'sheila': 'trainset_class_sheila_size_1734.pt',
 'six': 'trainset_class_six_size_2369.pt',
 'yes': 'trainse

# Whisper

In [7]:
import whisper
import random
from tqdm import tqdm

random.seed(123)

In [16]:
# DONE:
# CLASS_NAME = 'dog'

# CLASS_NAME = 'zero'
# CLASS_NAME = 'one'
# CLASS_NAME = 'two'
# CLASS_NAME = 'three'
# CLASS_NAME = 'four'
# CLASS_NAME = 'five'

# CLASS_NAME = 'yes'
# CLASS_NAME = 'no'
# CLASS_NAME = 'left'
# CLASS_NAME = 'right'
# CLASS_NAME = 'up'
# CLASS_NAME = 'down'
# CLASS_NAME = 'go'
# CLASS_NAME = 'stop'
# CLASS_NAME = 'off'
# CLASS_NAME = 'on'


# TODO:

# CLASS_NAME = 'wow'
# CLASS_NAME = 'bed'
# CLASS_NAME = 'marvin'
# CLASS_NAME = 'sheila'
# CLASS_NAME = 'tree'
# CLASS_NAME = 'bird'
# CLASS_NAME = 'happy'
# CLASS_NAME = 'cat'
# CLASS_NAME = 'house'

# CLASS_NAME = 'six'
# CLASS_NAME = 'seven'
# CLASS_NAME = 'eight'
# CLASS_NAME = 'nine'

train_dataset = load_ready_dataset(CLASS_NAME)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=False)

In [17]:
MODEL_NAME = 'medium.en' 

per_class_root = '/content/drive/MyDrive/dl2/per_class_results/'

from tqdm import tqdm
import pandas as pd
import whisper
model = whisper.load_model(MODEL_NAME)
device = 'cuda'

options = whisper.DecodingOptions(language="en", without_timestamps=True)
hypotheses = []
references = []

for mels, texts in tqdm(train_loader):
    mels = mels.to(device)
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

def clean_prediction(text):
    return text.lower().replace('.','').replace('?','').replace(' ','').replace('!','')

size = len(train_dataset)


data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
hypotheses_clean = [clean_prediction(h) for h in hypotheses]
data_clean = pd.DataFrame(dict(hypothesis=hypotheses_clean, reference=references))

accuracy = len(data_clean[data_clean['hypothesis'] == data_clean['reference']]) / size
print(f'Accuracy: {round(accuracy,4)}')

results_filename_prefix = per_class_root + f'model_{MODEL_NAME}_train_{round(accuracy,4)}_class_{CLASS_NAME}_size_{size}'

raw_filename = results_filename_prefix + '_raw.csv'
print(f"Filename raw: {raw_filename}")
data.to_csv(raw_filename)
print(data.head(5))

clean_filename = results_filename_prefix + '.csv'
print(f"Filename: {clean_filename}")
data_clean.to_csv(clean_filename)
print(data_clean.head(5))

100%|██████████| 148/148 [11:20<00:00,  4.60s/it]

Accuracy: 0.4535
Filename raw: /content/drive/MyDrive/dl2/per_class_results/model_medium.en_train_0.4535_class_five_size_2357_raw.csv
  hypothesis reference
0      Five.      five
1      B A B      five
2         Hi      five
3  I'm fine.      five
4        Bye      five
Filename: /content/drive/MyDrive/dl2/per_class_results/model_medium.en_train_0.4535_class_five_size_2357.csv
  hypothesis reference
0       five      five
1        bab      five
2         hi      five
3    i'mfine      five
4        bye      five





## Multiclass

In [None]:
SIZE = 1000
train_dataset = WhisperDataset(train_path, size=SIZE)

In [None]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=False)

In [None]:
# available models = ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 
# 'medium.en', 'medium', 'large-v1', 'large-v2', 'large']

# MODEL_NAME = 'tiny.en' # 0.0005 acc
# MODEL_NAME = 'base.en' # 0.0043 acc
# MODEL_NAME = 'small.en' # 0.1716 acc
MODEL_NAME = 'medium.en' # 0.5080 acc
# MODEL_NAME = 'large' # 0.5010 acc
# MODEL_NAME = 'large-v2' #0.4980 acc

from tqdm import tqdm
import pandas as pd
import whisper
model = whisper.load_model(MODEL_NAME)
device = 'cuda'

options = whisper.DecodingOptions(language="en", without_timestamps=True)
hypotheses = []
references = []

def clean_prediction(text):
    return text.lower().replace('.','').replace('?','').replace(' ','')

for mels, texts in tqdm(train_loader):
    mels = mels.to(device)
    results = model.decode(mels, options)
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))

hypotheses_clean = [clean_prediction(h) for h in hypotheses]
data_clean = pd.DataFrame(dict(hypothesis=hypotheses_clean, reference=references))
accuracy = len(data_clean[data_clean['hypothesis'] == data_clean['reference']]) / len(data_clean)

print(f'Accuracy: {round(accuracy,4)}')

raw_filename = root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}_raw.csv'
print(f"Filename raw: {raw_filename}")
data.to_csv(raw_filename)
print(data.head(5))

clean_filename = root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.csv'
print(f"Filename: {clean_filename}")
data_clean.to_csv(clean_filename)
print(data_clean.head(5))

100%|█████████████████████████████████████| 1.42G/1.42G [00:30<00:00, 49.9MiB/s]
  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: ignored

In [None]:
from copy import deepcopy
import seaborn as sns
import matplotlib.pyplot as plt
df = deepcopy(data_clean)

for i in range(len(df)):
    if df.loc[i,'hypothesis'] not in train_classes_names:
        df.loc[i,'hypothesis'] = 'mistake'
df_confusion = pd.crosstab(df['reference'], df['hypothesis'])
fig, ax = plt.subplots(figsize=(15,10)) 
sns.heatmap(df_confusion, linewidths=1, annot=True, fmt='g')
plt.title(f"Whisper model {MODEL_NAME}, accuracy {round(accuracy,4)}, img per class: {int(len(df)/len(train_classes_names))}")

output_path = root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.png'
fig.savefig(output_path)
plt.show()


## Large 1000 start

In [None]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
print(data.head(5))
accuracy = len(data[data['hypothesis'] == data['reference']]) / len(data)
print(f'Accuracy: {round(accuracy,4)}')
print(f"Filename: {root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.csv'}")
data.to_csv(root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.csv')

  hypothesis reference
0       stop      stop
1       tree      tree
2        off       off
3         go        go
4      house     house
Accuracy: 0.501
Filename: drive/MyDrive/dl2/extracted/large_train_0.501_size_1000.csv


In [None]:
s = 0
for i in range(len(data)):
    if len(data.iloc[i,0]) == len(data.iloc[i,1]):
        s += 1
print(f'Same word length: {round(s/len(data), 4)}')

Same word length: 0.605


## Medium en 1000 start

In [None]:
data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
print(data.head(5))
accuracy = len(data[data['hypothesis'] == data['reference']]) / len(data)
print(f'Accuracy: {round(accuracy,4)}')
print(f"Filename: {root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.csv'}")
data.to_csv(root_path + MODEL_NAME + f'_train_{round(accuracy,4)}_size_{SIZE}.csv')

  hypothesis reference
0      right     right
1        dog       dog
2         no        no
3       bird      bird
4       stop      stop
Accuracy: 0.5076
Filename: drive/MyDrive/dl2/extracted/medium.en_train_0.5076_size_10000.csv


In [None]:
s = 0
for i in range(len(data)):
    if len(data.iloc[i,0]) == len(data.iloc[i,1]):
        s += 1
print(f'Same word length: {round(s/len(data), 4)}')

Same word length: 0.6089


In [None]:
from collections import Counter
c = Counter(list(data.iloc[:,1].values.flatten()))
print("Most common words:")
{k:v for k, v in dict(c).items()}

Most common words:


{'right': 458,
 'dog': 347,
 'no': 423,
 'bird': 331,
 'stop': 438,
 'off': 425,
 'one': 424,
 'go': 402,
 'tree': 322,
 'happy': 316,
 'six': 437,
 'four': 442,
 'eight': 434,
 'three': 417,
 'nine': 442,
 'five': 429,
 'left': 441,
 'cat': 298,
 'house': 319,
 'sheila': 343,
 'down': 435,
 'seven': 434,
 'on': 461,
 'marvin': 278,
 'two': 172,
 'bed': 332}

## Medium en 1000 end

In [None]:
a = list(df[['result']].values.flatten())
a = [b.lower().replace('.','').replace('?','') for b in a]
from collections import Counter
print(len(a))
Counter(a)

203


Counter({' bed': 83,
         ' better': 1,
         ' then': 2,
         ' bud': 1,
         ' better!': 1,
         'sword': 1,
         ' bad': 33,
         ' everything': 1,
         ' no': 1,
         ' but': 3,
         ' oh, that one': 1,
         ' thank you': 1,
         ' dead': 18,
         ' good': 10,
         '!': 1,
         ' chair, there are some special questions to schedule that was very i had a hugely visit because of the informatism okay, sorry! okay, parliamentists are here, and have a long moment of': 1,
         ' there': 2,
         ' vein': 1,
         ' fad': 1,
         ' paid': 1,
         ' presence dead': 1,
         ' that': 2,
         ' bet': 2,
         ' hehe': 1,
         ' good, bad': 1,
         ' you bet': 1,
         ' bed,': 1,
         ' fed': 1,
         ' god': 2,
         ' good bet': 1,
         ' bedy': 1,
         ' ben': 5,
         ' god!': 1,
         ' that was bad': 1,
         " and i was like what's up bird": 1,
         ' bed can

In [None]:
import pandas as pd

df = pd.DataFrame(results)
df.to_csv('drive/MyDrive/dl2/extracted/test1.csv')

In [None]:
a = list(df[['result']].values.flatten())
a = [b.lower().replace('.','').replace('?','') for b in a]
from collections import Counter
print(len(a))
Counter(a)

284


Counter({' bed': 217,
         ' bad': 36,
         ' i bet': 1,
         ' but': 5,
         ' bid': 3,
         ' beds': 1,
         ' bye': 3,
         " i'm bad": 1,
         ' in bed': 3,
         ' dead': 4,
         ' clear on you that': 1,
         ' okay': 1,
         ' bed!': 1,
         ' bet': 2,
         ' better': 1,
         ' fed': 1,
         ' beard': 1,
         ' ed': 1,
         ' bae-d': 1})