In [1]:
import pandas as pd
import numpy as np
import os
import time
import urllib.request
import tarfile

def download_data(dir_):
    if not os.path.exists(dir_):
        os.makedirs(dir_)
    url = "http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000-2016.tgz"
    fname = os.path.join(dir_, "mortgage_2000-2016.tgz")

    if not os.path.exists(fname):
        max_attempts = 80
        attempts = 0
        sleeptime = 10
        while attempts < max_attempts:
            time.sleep(sleeptime)
            try:
                response = urllib.request.urlopen(url, timeout = 5)
                content = response.read()
                f = open(fname, 'wb' )
                f.write( content )
                f.close()
                print("Successfully downloaded data!")
                break
            except Exception as e:
                attempts += 1
                print(type(e))
                print(e)
    
    tar = tarfile.open(fname, "r:gz")
    tar.extractall(dir_)
    tar.close()
    print(f"Contents in directory '{dir_}': {os.listdir(dir_)}")

    print("Directory Structure:")
    print ('--------------------------------')
    print()
    for (root,dirs,files) in os.walk(dir_, topdown=True):
        print (root)
        print (dirs)
        print (files)
        print ('--------------------------------')


In [None]:
cwd = os.getcwd()
data_fldr = os.path.join(cwd, 'data')

In [7]:
download_data(data_fldr)

Successfully downloaded data!
Contents in directory '/mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data': ['.amlignore', '.amlignore.amltmp', 'acq', 'mortgage_2000-2016.tgz', 'names.csv', 'perf']
Directory Structure:
--------------------------------

/mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data
['acq', 'perf']
['.amlignore', '.amlignore.amltmp', 'mortgage_2000-2016.tgz', 'names.csv']
--------------------------------
/mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq
[]
['.amlignore', '.amlignore.amltmp', 'Acquisition_2000Q1.txt', 'Acquisition_2000Q2.txt', 'Acquisition_2000Q3.txt', 'Acquisition_2000Q4.txt', 'Acquisition_2001Q1.txt', 'Acquisition_2001Q2.txt', 'Acquisition_2001Q3.txt', 'Acquisition_2001Q4.txt', 'Acquisition_2002Q1.txt', 'Acquisition_2002Q2.txt', '

Since the data is downloaded, let's copy it

In [14]:
from azureml.core import Workspace

acq_file_folder = os.path.join(data_fldr, "acq/")
perf_file_folder = os.path.join(data_fldr, "perf/")

acq_files = [os.path.join(acq_file_folder, f) for f in os.listdir(acq_file_folder)]
perf_files = [os.path.join(perf_file_folder, f) for f in os.listdir(perf_file_folder)]

ws = Workspace.from_config()

datastore = ws.get_default_datastore()

datastore.upload_files(acq_files, target_path='credit_risk_data/acq/', overwrite=True, show_progress = True)



Uploading an estimated of 70 files
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq/.amlignore
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq/.amlignore, 1 files out of an estimated total of 70
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq/.amlignore.amltmp
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq/.amlignore.amltmp, 2 files out of an estimated total of 70
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/acq/Acquisition_2000Q1.txt
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit

$AZUREML_DATAREFERENCE_4b5816450c404495aac19790a4b64b40

In [15]:
datastore.upload_files(perf_files, target_path='credit_risk_data/perf/', overwrite=True, show_progress = True)


Uploading an estimated of 114 files
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/perf/.amlignore
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/perf/.amlignore, 1 files out of an estimated total of 114
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/perf/.amlignore.amltmp
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/perf/.amlignore.amltmp, 2 files out of an estimated total of 114
Uploading /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-samples/credit_default_risk/data/perf/Performance_2000Q2.txt
Uploaded /mnt/batch/tasks/shared/LS_root/mounts/clusters/nkothapalli/code/Users/nkothapalli/fsi/fsi-sample

$AZUREML_DATAREFERENCE_e2a637ad186647589e86a2722599ffd4