
# Reading APIs [Application Programming Interface]
> *using API requests to retrieve data*  


<br>
  
<img width="900px" src ='https://owshqblobstg.blob.core.windows.net/stgfiles/png_files/api_request_bronze0.png'>
  
<br>

In [0]:
# python  = runs on the driver only, with no parallel execution
# pyspark = runs in parallel using the driver and the executors

# import libraries
# requests is the most commonly used library for making HTTP requests to a URL
# pyspark.sql.types is loaded to build the dataframe schema manually
import requests
import json
from pyspark.sql.types import *

# explicit schema handed to spark when the dataframe is created
# one non-nullable long `id` followed by three non-nullable string columns
jsonSchema = StructType(
    [StructField('id', LongType(), nullable=False)]
    + [
        StructField(column, StringType(), nullable=False)
        for column in ('uid', 'valid_us_ssn', 'invalid_us_ssn')
    ]
)

# request sample records from the random-data api
# - `size` asks for 100 rows in a single request
# - the response body is decoded from json into python objects
url = 'https://random-data-api.com/api/id_number/random_id_number'
size = {'size': 100}
# timeout (seconds) keeps the cell from hanging forever if the api never answers
r = requests.get(url, params=size, timeout=30)
# raise requests.HTTPError on a 4xx/5xx response instead of parsing an error body
r.raise_for_status()
data_json = r.json()

In [0]:
# data_json is a plain python object, not a spark dataframe,
# so it is held by the master [driver] node only
# big data workloads need the data spread over driver & executors instead
# prints the object's type, then its contents, one per line
print(type(data_json), data_json, sep="\n")

In [0]:
# turn the parsed api payload into a [pyspark dataframe]
# the explicit schema is passed in so spark does not have to infer it
df = spark.createDataFrame(data=data_json, schema=jsonSchema)

In [0]:
# show dataframe data
# pyspark ~ in-memory across the executors
# NOTE(review): `display` is not defined or imported in this notebook —
# presumably the notebook environment's built-in renderer; confirm
display(df)

id,uid,valid_us_ssn,invalid_us_ssn
6611,dc74ada8-04a0-4d72-b340-275fdc3f1047,712-97-6013,536-02-0000
4184,2b049059-98cf-4fd1-8afd-318d781e24d6,123-90-7108,939-81-4731
2212,b08005f1-22dc-4e72-a5c3-9c79b9e1a4e4,685-87-8345,178-00-2473
2268,11a38499-d91b-4462-a881-163425148168,605-93-7570,666-88-3258
7505,5de90c34-b6b8-4680-8cd6-eb517467b39c,461-15-5755,666-76-3483
2130,042d522c-1816-40ef-8aa9-a011832c070d,027-23-7181,242-00-7306
1495,0f995d6b-2bf4-48f5-a162-78a5c10257aa,033-71-8805,259-28-0000
1937,c835f2c6-3518-4eae-a9a4-051312929637,085-71-9752,740-00-8436
7700,5e387552-42f3-4839-add5-e715a076650b,334-41-9177,323-38-0000
2448,ea076b44-f62c-40ca-ba14-7ffa65d924ba,209-25-5656,998-21-5306



<br>
  
<img width="400px" src ='https://brzluanmoreno.blob.core.windows.net/stgfiles/png_files/dl_pq.png'>
  
<br>

In [0]:
# persist the dataframe as parquet files in the processing zone
# append mode adds this batch to whatever is already stored at the path
df.write.parquet("/mnt/processing/parquet/batch/ssn", mode="append")


<br>
  
<img width="900px" src ='https://brzluanmoreno.blob.core.windows.net/stgfiles/png_files/dl_delta_bronze0.png'>
  
<br>


In [0]:
# persist the dataframe in delta format in the delta architecture zone
# append mode adds this batch to whatever is already stored at the path
df.write.save(
    "dbfs:/mnt/owshq/delta/batch/bronze/ssn",
    format="delta",
    mode="append",
)