# Extract API Parameters
The goals of this notebook are to

1. Extract the relevant API parameters from the metadata table (ingested in first copy job of Pipeline)
2. Create list of API call urls to iterate through
3. Run through list of API calls, ingest data into bronze

## Connect to Blob, Read metadata

In [13]:
# This cell will dynamically change to match the parameters passed to it from the Pipeline
# NOTE(review): the literals below look like development defaults that the pipeline overrides — confirm.

run_id = '32fab968-7ad8-432b-a8fe-2fbbf60f441e'  # appended to every blob file name when blob_path is built
pipeline_name = 'BEA_Regional'  # not referenced by any other cell visible in this notebook
source_group = 'BEA'  # not referenced by any other cell visible in this notebook
metadata_file_name = 'table_linecode_list'  # not referenced by any other cell visible in this notebook


In [14]:
# import packages
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import pandas as pd
from pyspark.sql import SparkSession
import json
import os
from io import BytesIO, StringIO
import numpy as np
from multiprocessing import Pool
import tempfile
import requests
import time
import datetime
from notebookutils import mssparkutils
import re

# Get API Parameters

In [15]:
# Timestamp of this run (naive local time, second precision); it becomes part of every blob file name.
now = datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

# FIXME(security): these API keys are committed in plain text. Rotate them and load them from a
# secret store instead of keeping them in the notebook.
# The number of keys also determines the number of worker processes used for the downloads.
api_keys = [
    'F4993EB8-9C0D-4141-9E37-828C00D09143',
    '191DA714-6A67-402A-881B-E9F984EA5267',
    'A1F7718B-67A6-4597-A30D-AB58AA712921',
    'A501A017-5838-4D2C-95E7-B4624C3D2BCF',
    '930BA9FE-78E2-40FB-A62E-C89ED7DA772F',
]

In [16]:
# Ask the BEA API for every LineCode value (with its description) of the Regional dataset.
# FIXME(security): the UserID query parameter is an API key committed in plain text; rotate it and
# load it from a secret store.
table_linecode_url = 'https://apps.bea.gov/api/data/?UserID=4DB3D6CD-EAD5-482E-BB14-FC7243B611E7&method=GetParameterValuesFiltered&datasetname=Regional&TargetParameter=LineCode'

# (connect, read) timeout so a stalled connection cannot hang the pipeline run forever.
response = requests.get(table_linecode_url, timeout=(10, 120))
if response.status_code != 200:
    # Fail fast: the following cells all need metadata_table. Printing and carrying on would only
    # surface later as a NameError, or silently reuse a stale value left over in the kernel.
    raise RuntimeError(f"Error retrieving data from {table_linecode_url}. Status code: {response.status_code}")
metadata_table = response.json()

In [17]:
# convert parameters to df
table_array = metadata_table["BEAAPI"]["Results"]["ParamValue"]
table_df = pd.DataFrame.from_dict(table_array)
table_df = table_df.rename(columns = {"Key":"LineCode"})

# extract table name from desc
# expand=False returns a Series (one capture group) instead of a one-column DataFrame
table_df["TableName"] = table_df["Desc"].str.extract('(?<=\\[)(.*?)(?=\\])', expand=False) # filter for item between square brackets

# Drop the free-text description and de-duplicate the (LineCode, TableName) pairs.
# The previous statement referenced drop_duplicates without calling it and never assigned the
# result, so neither the drop nor the de-duplication took effect.
table_df = table_df.drop(columns="Desc").drop_duplicates().reset_index(drop=True)
table_df.head()

In [18]:
# create API calls

# function to reformat blob_path url because it was messing things up in silver
def reformat_url(url):
    """Convert a url into a blob path.

    Every occurrence of 'https://' is removed and each '?' is replaced by '-';
    all other characters are kept as they are.

    Args:
        url: the url string to convert.

    Returns:
        The converted string.
    """
    substitutions = {'?': '-'}
    without_scheme = url.replace('https://', '')
    return ''.join(substitutions.get(ch, ch) for ch in without_scheme)

# The first letter of the table name selects the geography level requested for that table.
geo_by_prefix = {"S": "STATE", "C": "COUNTY", "M": "MSA", "P": "PORT"}
geo_frames = [
    # na=False: a missing TableName (description without [..]) is treated as "no match" instead of
    # putting NaN into the boolean mask, which pandas refuses to index with.
    table_df.loc[table_df["TableName"].str.startswith(prefix, na=False)].assign(GeoFips=geo)
    for prefix, geo in geo_by_prefix.items()
]

combined_df = pd.concat(geo_frames, ignore_index = True)
combined_df['base_url'] = 'https://apps.bea.gov/api/data/'
# Hand out the API keys in contiguous, near-equal blocks of rows. q follows len(api_keys) so the
# number of bins always equals the number of labels when the key list changes.
combined_df['api_key'] = pd.qcut(range(len(combined_df)), q = len(api_keys), labels = api_keys).to_list()
combined_df["relative_url"] = (
    "?UserID=" + combined_df['api_key']
    + '&method=GetData&datasetname=Regional&Year=ALL&TableName='
    + combined_df["TableName"] + '&LineCode=' + combined_df["LineCode"]
    + '&GeoFips=' + combined_df["GeoFips"]
)

combined_df['full_url'] = combined_df['base_url'] + combined_df['relative_url']
# Blob name = reformatted request url used as a folder, then "<timestamp>_<run_id>.json" as the file.
combined_df['blob_path'] = (combined_df['full_url'] + '/' + now + '_' + run_id + '.json').map(reformat_url)

combined_df.head()

# Run API calls

In [19]:
# split df into chunks for Multi processing

num_processes = len(api_keys)
# One chunk per API key, so two worker processes never send requests with the same key.
# Splitting by position (np.array_split) cuts at different rows than the pd.qcut call that assigned
# the keys for some row counts (e.g. 12 rows and 5 keys), which put two keys into one chunk.
chunks = [key_rows for _, key_rows in combined_df.groupby('api_key', sort=False)]

# initiate blob client
# FIXME(security): the storage account key is committed in plain text. Rotate it and load the
# connection string from a secret store instead of keeping it in the notebook.
connection_string = "DefaultEndpointsProtocol=https;AccountName=usafactsbronze;AccountKey=WEH1nIXRgYYjWEjRPC6szld67DOir5Jx46GenOM8bmA+yWQQLlzTeJv5fI02wVxtsW89pSU8lBFc+AStCz7fWw==;EndpointSuffix=core.windows.net"
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

In [20]:
# define function to download each API response and upload it to the bronze container
def ingest_to_bronze(chunk):
    """Download every url in ``chunk`` and store the raw response body as a blob in 'bronze'.

    Args:
        chunk: DataFrame with a 'full_url' column (the request to send) and a
            'blob_path' column (the name of the blob to write).

    Side effects:
        Uploads one blob per successful download (overwriting a blob with the same name),
        prints a line per upload or failure, and sleeps after each upload.
        Urls whose request fails are appended to the module-level ``fail`` list. When this
        function runs inside a multiprocessing worker, that list is the worker's own copy.

    Raises:
        Errors from the blob upload are not caught and propagate to the caller.
    """
    for file_url, blob_name in zip(chunk['full_url'], chunk['blob_path']):
        try:
            # (connect, read) timeout so one stalled request cannot block the worker forever;
            # a timeout is a RequestException and is recorded as a failure below.
            response = requests.get(file_url, timeout=(10, 300))
            # Do not store an HTTP error page as if it were data: 4xx/5xx raise HTTPError,
            # which is also a RequestException.
            response.raise_for_status()
            payload = response.content

            # Upload the bytes directly; no temporary file is needed, so nothing is left on
            # disk if the upload fails.
            blob_client = blob_service_client.get_blob_client('bronze', blob_name)
            blob_client.upload_blob(payload, overwrite = True)

            # get size in mb
            file_size = len(payload) / (1024 * 1024)
            print("uploaded blob: " + blob_name)

            # wait based on file size
            # side note: It's difficult to not throttle the API key while keeping this process fast. Some files are <1MB while others are 25 MB
            if file_size < 2:
                time.sleep(2)
            else:
                # 0.6 s per MB plus 4 s — presumably derived from a 100 MB-per-minute limit; TODO confirm
                time.sleep(60/(100/file_size) + 4)

        except requests.exceptions.RequestException as e:
            print(f"Error processing {file_url}: {str(e)}")
            fail.append(file_url)
        

In [21]:
# multi processing on API urls, each process uses different API key
fail = []

def _ingest_and_report(chunk):
    """Run ingest_to_bronze on one chunk inside a worker and return the urls that failed.

    Each worker process has its own copy of the module-level ``fail`` list, so what a worker
    appends there never reaches the notebook process. Returning the list sends it back
    through Pool.map.
    """
    fail.clear()  # a worker may be handed several chunks; report each chunk's failures once
    ingest_to_bronze(chunk)
    return list(fail)

with Pool(num_processes) as p:
    failed_per_chunk = p.map(_ingest_and_report, chunks)

# Merge the per-chunk results so the next cell reports the real failures.
fail = [url for failed_urls in failed_per_chunk for url in failed_urls]

In [None]:
# Report the ingestion status through the notebook activity output: failure count, then the failed urls.
fail_string = ", ".join(fail)
exit_message = f"Number of failed urls: {len(fail)}, {fail_string}"
mssparkutils.notebook.exit(exit_message)