# About

The aim of this notebook is to download and manipulate data from the Candor 
corpus for future use. 

In [1]:
import sys 
import os 

# In this mode, the notebook will use small datasets, models, etc. to speed 
# up training; it should be used primarily when running locally. 
# NOTE(review): no flag for such a mode is defined in this cell - the comment
# above looks like a leftover; confirm and either add the flag or remove it.

# True when the "google.colab" module is already loaded; used as the signal
# that the notebook is running on Google Colab.
IS_COLAB = "google.colab" in sys.modules
if IS_COLAB:
    # Colab only: upgrade the extra packages. The versions are not pinned, so
    # the installed versions can differ between runs.
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    # Colab only: mount Google Drive at /drive.
    from google.colab import drive
    drive.mount("/drive")

def _version_tuple(version):
    """Return the (major, minor) part of a version string as a tuple of ints.

    Version strings must not be compared as plain strings: the comparison is
    lexicographic, so "10.0" >= "2.0" is False. Only the leading digits of the
    first two dot-separated components are used, which means suffixes such as
    "rc1" are ignored; a missing or non-numeric component counts as 0.
    """
    numbers = []
    for piece in version.split(".")[:2]:
        digits = ""
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        numbers.append(int(digits) if digits else 0)
    # Pad so that a bare "2" compares like "2.0".
    while len(numbers) < 2:
        numbers.append(0)
    return tuple(numbers)


# Scikit-Learn ≥0.20 is required
import sklearn 
assert _version_tuple(sklearn.__version__) >= (0, 20), "scikit-learn >= 0.20 is required"
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
assert _version_tuple(tf.__version__) >= (2, 0), "TensorFlow >= 2.0 is required"
# Common imports
import numpy as np
import pandas as pd 
# Others 

# Seed NumPy's global RNG and TensorFlow's global RNG with the same fixed
# value so that this notebook's output is stable across runs.
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for the x/y tick labels.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})


In [3]:
# Global names and locations shared by the rest of the notebook.
PROJECT_ROOT_DIR = "."
DATASET_NAME = "candor"
# Text file that lists the Candor download URLs.
CANDOR_URL_FILE = os.path.join(PROJECT_ROOT_DIR, "file_urls.txt")
# Root of all dataset files, with one sub-directory for downloaded archives
# and one for their extracted contents.
DATASET_DIR = os.path.join(PROJECT_ROOT_DIR, "datasets")
DOWNLOAD_DIR, EXTRACT_DIR = (
    os.path.join(DATASET_DIR, subdir) for subdir in ("download", "extract")
)


In [4]:
# Create the dataset root directory; exist_ok=True makes this a no-op when it
# is already there. Only DATASET_DIR is created here - the download and
# extract sub-directories are not.
os.makedirs(DATASET_DIR, exist_ok=True)

# Download

In [5]:
# Read all the urls from the url file.
# The whole file is read (not only its first line) and split at every "https",
# so URLs stored back-to-back on one line and URLs stored one per line are both
# picked up. An empty file yields an empty list instead of an IndexError.
with open(CANDOR_URL_FILE,'r') as f:
    # NOTE: despite its name, `lines` holds the fragments between "https" markers.
    lines = f.read().split("https")
# The split removed the "https" prefix from each fragment; put it back, and
# skip fragments that are empty or whitespace-only (e.g. the text before the
# first URL or a trailing newline).
urls = ["https" + line.strip() for line in lines if line.strip()]
    

In [6]:
# Display the first parsed URL as a sanity check.
# NOTE(review): the displayed URL carries pre-signed query parameters
# (X-Amz-Credential, X-Amz-Signature); consider clearing this output before
# sharing the notebook.
urls[0]

'https://betterup-public-dataset-release.s3.us-west-2.amazonaws.com/v1.0/raw_media_part_001.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIASDTHSKXDJT5YJ2DE%2F20220323%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20220323T201656Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=b3dac17fc5735cd12c95beae2c68aa476dfd7485c00b8c198c9c7419e3a6cff2'

In [7]:

import shutil 
from tqdm.auto import *
import requests
import glob 
from zipfile import ZipFile
import time

def download_dataset_from_urls(urls, dataset_name,download_dir, extract_dir, unzip=True, chunkSize=8192):
    """Download every URL as a zip archive and optionally extract it.

    WARNING: the directories ``<download_dir>/<dataset_name>`` and
    ``<extract_dir>/<dataset_name>`` are deleted (if present) and recreated
    empty before anything is downloaded, regardless of ``unzip``.

    Args:
        urls: Iterable of URLs, each expected to point at a zip archive.
        dataset_name: Name of the per-dataset sub-directory and prefix of the
            generated file names.
        download_dir: Directory under which the archives are stored. URL
            number ``i`` is saved as
            ``<download_dir>/<dataset_name>/<dataset_name>_url_<i>.zip``.
        extract_dir: Directory under which the archives are extracted. URL
            number ``i`` is extracted into
            ``<extract_dir>/<dataset_name>/<dataset_name>_url_<i>``.
        unzip: When True (default), extract each archive right after it has
            been downloaded.
        chunkSize: Number of bytes requested per streamed chunk.

    Returns:
        None.

    Raises:
        requests.HTTPError: If a URL responds with an HTTP error status.
    """
    # Create paths 
    dataset_download_path = os.path.join(download_dir,dataset_name)
    dataset_extract_path = os.path.join(extract_dir,dataset_name)
    # Start from empty directories so files of an earlier run are not mixed in.
    if os.path.isdir(dataset_download_path):
        shutil.rmtree(dataset_download_path)
    if os.path.isdir(dataset_extract_path):
        shutil.rmtree(dataset_extract_path)
    os.makedirs(dataset_download_path)
    os.makedirs(dataset_extract_path)
    # Download each url as a zip file. 
    print("Downloading zip files to folder: {}".format(dataset_download_path))
    if unzip:
        print("Extracting zip files to folder: {}".format(dataset_extract_path))
    for i,url in enumerate(urls):
        # Path of the zip file this specific url is saved to.
        name = "{}_url_{}".format(dataset_name, i)
        url_temp_path = "{}.zip".format(os.path.join(dataset_download_path,name))
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            # Content-Length is optional (e.g. chunked responses); without it
            # the bar has no total and only shows the running byte count.
            content_length = r.headers.get('Content-Length')
            total_bytes = int(content_length) if content_length is not None else None
            # The context manager closes the bar even if the download fails.
            with tqdm(total=total_bytes, desc=name, unit="B", unit_scale=True) as pbar:
                with open(url_temp_path,"wb") as f:
                    for chunk in r.iter_content(chunk_size=chunkSize):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                            pbar.update(len(chunk))
        if unzip:
            with ZipFile(url_temp_path, 'r') as zipObj:
                # Extract all the contents of zip file in different directory
                extract_path = os.path.join(dataset_extract_path,name)
                os.makedirs(extract_path)
                zipObj.extractall(extract_path)
    
        

In [8]:
# Restrict the trial run to the first URL only.
test_urls = [urls[0]]

In [9]:
# Download and extract only the first archive (unzip defaults to True).
# NOTE(review): the saved output shows a 403 for this URL. Its query string
# carries X-Amz-Date=20220323T201656Z and X-Amz-Expires=604800, so the
# pre-signed link has presumably expired - confirm and regenerate the URL file.
download_dataset_from_urls(test_urls, DATASET_NAME,DOWNLOAD_DIR,EXTRACT_DIR)

Downloading zip files to folder: ./datasets/download/candor
Extracting zip files to folder: ./datasets/extract/candor


HTTPError: 403 Client Error: Forbidden for url: https://betterup-public-dataset-release.s3.us-west-2.amazonaws.com/v1.0/raw_media_part_001.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIASDTHSKXDJT5YJ2DE%2F20220323%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20220323T201656Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=b3dac17fc5735cd12c95beae2c68aa476dfd7485c00b8c198c9c7419e3a6cff2