In [None]:
# default_exp wandbutils

In [None]:
#all_slow

# Explore 

> Utilities for exploring audio datasets and tracking experiments with Weights & Biases

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
#all_slow
import wandb
import librosa
import torchaudio
import numpy as np
from fastcore.basics import *
from datasets import Dataset, load_dataset

# Weights and Biases for EDA, Model Tracking and More

[General Hugging Face with Weights and Biases](https://docs.wandb.ai/integrations/huggingface?utm_source=github&utm_medium=github&utm_campaign=xlsr)

[Artifacts docs](https://docs.wandb.ai/artifacts?utm_source=github&utm_medium=github&utm_campaign=xlsr)

[Datasets and Predictions docs](https://docs.wandb.ai/datasets-and-predictions?utm_source=github&utm_medium=github&utm_campaign=xlsr)

## Explore Your Dataset with Weights & Biases

In [None]:
class WandbDataExplorer():
    '''
        Pass a Hugging Face Dataset and log it to a Weights and Biases table.
        Expects that your dataset contains a "path" column with file paths to 
        audio files.
        
        n_samples: If "n_samples" is less than the length of ds, "n_samples" distinct
        rows are drawn at random from the whole dataset; otherwise every row is logged
        
        cols_to_log: If not set, the table will contain the following columns:
            [audio, duration, <the rest of the columns in your dataset>...]
        with text/sentence columns moved to the front and path columns moved to the end
        
        cols_to_exclude: Columns to leave out when "cols_to_log" is not set
        
        resample / new_sr: If "resample" is True, audio is resampled to "new_sr" Hz
        
        artifact_type / artifact_name / table_name / wandb_project: Where the table
        is saved in Weights and Biases
        
        verbose: Stored on the instance but not currently read by this class
    '''
    def __init__(self, ds:Dataset=None, n_samples:int=100,  
                 cols_to_log:list=None,resample:bool=True, new_sr:int=16_000, 
                 artifact_type:str='audio_dataset', artifact_name:str = 'my_artifact', 
                 table_name:str='explore_samples', wandb_project = 'xlsr',
                 cols_to_exclude:list=None, verbose:bool=True):
        store_attr()
        if self.ds is None: 
            raise ValueError('WandbDataExplorer needs a dataset: pass it as `ds`')
        self.ds_len = len(self.ds)
        if self.n_samples < self.ds_len: 
            # Sample row indices from the *whole* dataset, without repeats
            self.idxs = np.random.choice(self.ds_len, size=self.n_samples, replace=False).tolist()
        else:
            self.idxs = list(range(self.ds_len))
    
    def _get_audio(self, path):
        '''Loads the audio file at `path` and returns (samples, sample_rate)'''
        speech_array, sr = torchaudio.load(path)
        # Only the first entry of the loaded tensor is kept
        sa = speech_array[0].numpy()

        if self.resample: 
            # Keyword arguments: newer librosa releases no longer accept the rates positionally
            sa = librosa.resample(np.asarray(sa), orig_sr=sr, target_sr=self.new_sr)
            sr = self.new_sr
        return sa,sr
    
    def _make_row(self, ndx:int):
        '''Logs all data for that row and adds an audio and a duration column''' 
        row = []
        path = self.ds["path"][ndx]
        fn = path.split('/')[-1] 
        
        # Grab each item of interest to log
        sa,sr = self._get_audio(path)

        # Create a Wandb Audio object to log the speech array too
        raw_audio = wandb.Audio(data_or_path=sa, sample_rate=sr, caption=fn)

        # Grab the duration of the track (in seconds)
        duration = librosa.get_duration(y=sa, sr=sr) 

        row.append(raw_audio)
        row.append(duration)
        for col in self.cols_to_log: row.append(self.ds[col][ndx])
        return row
    
    def _default_cols_to_log(self):
        '''Dataset columns minus the excluded ones: text columns first, path columns last'''
        excluded = self.cols_to_exclude or []
        cols = [col for col in self.ds.column_names if col not in excluded]
        
        # Build the three groups separately instead of moving items inside a list
        # that is being iterated, so no column can be skipped
        path_cols = [col for col in cols if 'path' in col]
        text_cols = [col for col in cols 
                     if (('text' in col) or ('sentence' in col)) and col not in path_cols]
        other_cols = [col for col in cols if col not in path_cols and col not in text_cols]
        return text_cols + other_cols + path_cols
    
    def _create_wandb_data(self):
        '''Builds `self.wandb_table` from the sampled rows'''
        if self.cols_to_log is None:             
            self.cols_to_log = self._default_cols_to_log()
        
        # Log to table data list, row by row
        table_data = [self._make_row(ndx=ndx) for ndx in self.idxs]
        
        # Create wandb table object add all data to it
        self.table_cols = ['audio', 'duration'] + list(self.cols_to_log)
        self.wandb_table = wandb.Table(data=table_data, columns=self.table_cols)  
    
    def _log_table_to_wandb(self):
        '''Wraps `self.wandb_table` in an artifact and saves it to the W&B project'''
        # `type` can be set to whatever makes sense for you
        self.audio_ds_artifact = wandb.Artifact(name=self.artifact_name, type=self.artifact_type)

        # Add the table to the artifact
        self.audio_ds_artifact.add(self.wandb_table, self.table_name)
        
        # Save the artifact to the Weights and Biases project
        self.audio_ds_artifact.save(project=self.wandb_project)
        
    def log(self):
        '''Creates the table and logs it to Weights and Biases'''
        self._create_wandb_data()
        self._log_table_to_wandb()

In [None]:
# Set up an explorer that logs up to 100 rows of `test_ds`, leaving out the
# `client_id` and `segment` columns.
# NOTE(review): `test_ds` is not defined in any cell shown in this notebook;
# it must already exist in the kernel - confirm where it is loaded.
explore = WandbDataExplorer(
    ds=test_ds,
    n_samples=100,
    wandb_project='xlsr',
    artifact_type='audio_dataset',
    artifact_name='my_new_artifact',
    table_name='explore_samples',
    cols_to_exclude=['client_id', 'segment'],
)

In [None]:
# Build the W&B table from the selected rows and save it as an artifact
explore.log()

[<wandb.data_types.Audio object at 0x7fe919075ad0>, 6.84, '"Bhí Tara Viscardi agus Meadhbh O\'Rourke ag seinnt i ngairdín na mbláth"', '', '', 0, '', 'ga-IE', 2, 'data/downloads/extracted/ab29a329cca022c56c2f6a514b26920e0ee09b0553fbba0978944f736ca9da51/cv-corpus-6.1-2020-12-11/ga-IE/clips/common_voice_ga-IE_21558428.mp3']
['sentence', 'accent', 'age', 'down_votes', 'gender', 'locale', 'up_votes', 'path']
['audio', 'duration', 'sentence', 'accent', 'age', 'down_votes', 'gender', 'locale', 'up_votes', 'path']


[34m[1mwandb[0m: View artifact at https://wandb.ai/wandb/xlsr/artifacts/audio_dataset/my_new_artifact/479258d8f9b1bfc9b9f0


## Tracking

In [None]:
# export
def setup_wandb(entity='wandb', project_name='xlsr', log_model=True):
    '''
        Point Weights and Biases at an entity/project through environment variables
        and log in.
        
        entity: value written to WANDB_ENTITY
        project_name: value written to WANDB_PROJECT
        log_model: if True, WANDB_LOG_MODEL is set to 'true'; if False the variable
        is left as it is
        
        Returns the (entity, project_name) tuple that was passed in.
    '''
    # Imported here so the function does not depend on module-level imports
    # (`os` is not imported anywhere else in this notebook)
    import os
    import wandb
    # Set W&B user name
    os.environ["WANDB_ENTITY"] = entity

    # Set W&B project name. xlsr is a public W&B project
    os.environ["WANDB_PROJECT"] = project_name
    
    # Log your trained model to W&B as an Artifact
    if log_model: os.environ["WANDB_LOG_MODEL"] = 'true'

    wandb.login()
    
    return entity, project_name

In [None]:
#setup_wandb(entity='wandb', project_name='xlsr', log_model=True)

In [None]:
## hide
# Run nbdev's notebook2script; the output below lists the notebooks it converted
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted 01_data.ipynb.
Converted 02_aug.ipynb.
Converted 03_training.ipynb.
Converted 04_evaluation.ipynb.
Converted index.ipynb.
