# Setup 

In [2]:
import re
import sys 
import shutil 
import sklearn 


def _major_minor(version_string):
    """Return the leading ``major.minor`` of a version string as a tuple of ints.

    Version strings must not be compared directly: string comparison is
    lexicographic, so ``"0.100" >= "0.20"`` and ``"10.0" >= "2.0"`` are both
    False. Integer tuples compare numerically.
    """
    major, minor = re.match(r"(\d+)\.(\d+)", version_string).groups()
    return int(major), int(minor)


# Scikit-Learn ≥0.20 is required
assert _major_minor(sklearn.__version__) >= (0, 20)
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert _major_minor(tf.__version__) >= (2, 0)
import tensorflow_text as tf_text 
import tensorflow_datasets as tfds 
# Common imports
import numpy as np
import os
# Others 
import transformers 


# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# Download Corpus

In [3]:
# Zip archive used by the annotation download cells below.
ANNO_URL = (
    "http://groups.inf.ed.ac.uk/maptask/"
    "hcrcmaptask.nxtformatv2-1.zip"
)
# Remote location used by the audio download cells below.
DIALOG_URL = (
    "http://groups.inf.ed.ac.uk/maptask/"
    "signals/dialogues"
)

# Local root directory (relative to the notebook) for everything downloaded.
DOWNLOAD_PATH = "data/maptask"

In [6]:
def download_from_url(dataset_name,save_dir_path, url=None):
    """Download ``url`` with the TFDS download manager and extract it.

    Args:
        dataset_name: Forwarded to ``tfds.download.DownloadManager`` as its
            ``dataset_name``.
        save_dir_path: Root directory for this download. It is created if
            needed, together with its ``download`` and ``extract``
            sub-directories.
        url: Resource to fetch. When ``None`` the function does nothing.

    Returns:
        The value returned by ``DownloadManager.download_and_extract(url)``,
        or ``None`` when no ``url`` was given.
    """
    # Identity check: ``== None`` can be fooled by objects overriding __eq__.
    if url is None:
        return None
    download_path = "{}/{}".format(save_dir_path, "download")
    extract_path = "{}/{}".format(save_dir_path, "extract")
    # Plain loop: the directories are created for their side effect only.
    for path in (save_dir_path, download_path, extract_path):
        os.makedirs(path, exist_ok=True)
    dl_manager = tfds.download.DownloadManager(
        download_dir=download_path,
        extract_dir=extract_path,
        dataset_name=dataset_name)
    return dl_manager.download_and_extract(url)
    

In [7]:
# Audio files go into the "audio" sub-directory of the download root.
SAVE = f"{DOWNLOAD_PATH}/audio"
download_from_url("audio", SAVE, url=DIALOG_URL)

Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:01<00:00,  1.48s/ url]
Extraction completed...: 0 file [00:01, ? file/s]
Dl Size...: 0 MiB [00:01, ? MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ url]


PosixGPath('data/maptask/audio/download/groups.inf.ed.ac.uk_maptask_signals_dialogues2Nh253dGVxdZZojya5yFiOhxBId6iB4hw-UxqQHUbNo')

In [54]:
def download_maptask(save_dir_path, annotation_url, audio_url): 
    """Download the corpus audio below ``save_dir_path``.

    The annotation download is currently switched off: ``annotation_url`` is
    accepted but not used, and the first element of the result is always
    ``None``.

    Args:
        save_dir_path: Root directory; audio is stored under its ``audio``
            sub-directory.
        annotation_url: Unused for now (see above).
        audio_url: Passed to ``download_from_url`` as the resource to fetch.

    Returns:
        ``(annotation_path, audio_path)`` where ``annotation_path`` is ``None``
        and ``audio_path`` is whatever ``download_from_url`` returned.
    """
    audio_save_dir = f"{save_dir_path}/audio"
    downloaded_audio = download_from_url("maptask_audio", audio_save_dir, audio_url)
    # No annotation path is produced while the annotation download is disabled.
    return None, downloaded_audio


In [55]:
# Fetch the corpus into the download root; the result tuple is the cell output.
download_maptask(
    save_dir_path=DOWNLOAD_PATH,
    annotation_url=ANNO_URL,
    audio_url=DIALOG_URL,
)

Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.07 url/s]
Extraction completed...: 0 file [00:00, ? file/s]
Dl Size...: 0 MiB [00:00, ? MiB/s]
Dl Completed...: 100%|██████████| 1/1 [00:00<00:00,  1.06 url/s]


(None,
 PosixGPath('data/maptask/audio/download/groups.inf.ed.ac.uk_maptask_signals_dialogues2Nh253dGVxdZZojya5yFiOhxBId6iB4hw-UxqQHUbNo'))

NameError: name 'DOWNLOAD_PATH' is not defined

In [56]:
def download_audio(audio_path, url=None):
    """Recursively fetch ``url`` with ``wget`` and keep only the ``.wav`` files.

    Args:
        audio_path: Directory the files are saved into (created if missing).
        url: Remote directory to mirror. Defaults to ``DIALOG_URL``.

    Returns:
        None. The downloaded ``.wav`` files are left in ``audio_path``.

    Raises:
        FileNotFoundError: if the ``wget`` executable is not installed.
    """
    # Local import so this cell does not depend on imports made in later cells.
    import subprocess

    if url is None:
        url = DIALOG_URL

    os.makedirs(audio_path, exist_ok=True)

    wget_cmd = [
        "wget",
        "-P",
        audio_path,
        "-r",
        "-np",
        "-R",
        "index.html*",
        "-nd",
        url,
        "-q",
        "--show-progress",
    ]
    # Run wget from an argument list, without a shell: "index.html*" reaches
    # wget unexpanded and paths containing spaces stay a single argument.
    # The exit status is deliberately not checked, matching the previous
    # best-effort behaviour.
    subprocess.run(wget_cmd, check=False)

    # Drop everything wget left behind that is not a .wav file.
    for f in os.listdir(audio_path):
        fpath = os.path.join(audio_path, f)
        if not f.endswith(".wav") and not os.path.isdir(fpath):
            os.remove(fpath)

In [59]:
# Mirror the dialogue audio into the "audio" sub-directory of the download root.
download_audio(f"{DOWNLOAD_PATH}/audio", url=DIALOG_URL)


     0K .......... .....                                       57.6M=0s
     0K                                                       100% 5.72M=0s
     0K .......... .....                                       56.9M=0s
     0K .......... .....                                        113M=0s
     0K .......... .....                                       77.1M=0s
     0K .......... .....                                       71.2M=0s
     0K .......... .......... .......... .......              100%  370K=0.1s
     0K .......... .......... .......... .......... ..........  0% 5.42M 4s
    50K .......... .......... .......... .......... ..........  0%  534K 21s
   100K .......... .......... .......... .......... ..........  0% 9.48M 15s
   150K .......... .......... .......... .......... ..........  0%  475K 22s
   200K .......... .......... .......... .......... ..........  1% 20.7M 18s
   250K .......... .......... .......... .......... ..........  1% 24.9M 15s
   300K .......... .....

In [42]:
# Fetch and extract the annotation archive through the TFDS download manager.
download_from_url("maptask", DOWNLOAD_PATH, url=ANNO_URL)

Dl Completed...: 0 url [00:00, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:00<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:01<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...:   0%|          | 0/1 [00:02<?, ? url/s]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.51s/ url]
Dl Completed...: 100%|██████████| 1/1 [00:02<00:00,  2.51s/ url]
[A
Dl Completed...: 100%|██████████| 1/1 [00:05<00:00,  2.51s/ url]
Extraction completed

PosixGPath('data/maptask/extract/ZIP.groups.inf.ed.ac.uk_maptas_hcrcma.nxtfor-1u7XTMQDQluYumRjiVqcRyUzt2JtgPX5ab2Et5GvU4nM.zip')

In [9]:
from os.path import join
from os import listdir, makedirs, system


def download_annotation(savepath, url=None):
    """Download the annotation zip, unpack it and expose it as ``annotations``.

    Steps: fetch ``url`` with ``wget`` into ``savepath``, extract the archive,
    move the extracted ``maptaskv2-1`` folder to ``savepath/annotations`` and
    delete the zip file.

    Args:
        savepath: Directory to download and extract into (created if missing).
        url: Location of the zip archive. Defaults to ``ANNO_URL``.

    Returns:
        None.

    Raises:
        FileNotFoundError: if ``wget`` is not installed, or the archive does
            not contain a ``maptaskv2-1`` folder.
        subprocess.CalledProcessError: if ``wget`` exits with an error, so the
            later steps never run on a missing or partial download.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # Local imports keep the function self-contained within the notebook.
    import os
    import shutil
    import subprocess
    import zipfile

    if url is None:
        url = ANNO_URL

    zip_path = os.path.join(savepath, "hcrcmaptask.nxtformatv2-1.zip")
    extracted_dir = os.path.join(savepath, "maptaskv2-1")
    annotation_dir = os.path.join(savepath, "annotations")

    # "-O" does not create missing directories, so make sure the target exists.
    os.makedirs(savepath, exist_ok=True)

    print("Downloading annotations")
    print("-----------------------")
    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through verbatim. "-O" pins the output file
    # name so the following steps find the archive whatever ``url`` is.
    subprocess.run(
        ["wget", "-O", zip_path, url, "-q", "--show-progress"],
        check=True,
    )
    print("Download complete")

    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(savepath)
    # Like ``mv``: if ``annotations`` already exists as a directory, the
    # extracted folder is moved inside it.
    shutil.move(extracted_dir, annotation_dir)
    os.remove(zip_path)
    # Reported only once extraction has actually happened.
    print(f"Extracted annotations -> {savepath}/annotations")



In [10]:
# Download and unpack the annotations into the download root (default URL).
download_annotation(savepath=DOWNLOAD_PATH)

Downloading annotations
-----------------------



     0K .......... .......... .......... .......... ..........  0% 85.1K 2m19s
    50K .......... .......... .......... .......... ..........  0%  127K 1m56s
   100K .......... .......... .......... .......... ..........  1%  248K 93s
   150K .......... .......... .......... .......... ..........  1%  110K 96s
   200K .......... .......... .......... .......... ..........  2%  171K 90s
   250K .......... .......... .......... .......... ..........  2%  248K 83s
   300K .......... .......... .......... .......... ..........  2%  218K 78s
   350K .......... .......... .......... .......... ..........  3%  154K 77s
   400K .......... .......... .......... .......... ..........  3%  128K 78s
   450K .......... .......... .......... .......... ..........  4%  246K 75s
   500K .......... .......... .......... .......... ..........  4%  142K 75s
   550K .......... .......... .......... .......... ..........  5%  250K 72s
   600K .......... .......... .......... .......... ..........  5%  465

Download complete
Extracted annotations -> data/maptask/annotations
