In [None]:
#Import required libraries
import requests
import json
from datetime import datetime
from datetime import timedelta
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

#-------------------------------------------------------------------------------------------------------------------

# Configure Payrix API Details
api_url = "https://api.payrix.com/"
# The API key is read from the secret scope at run time and must never appear in source.
# NOTE: a raw key value used to be pasted in a comment here. It has been removed and should
# be treated as compromised -- rotate it if it was ever a live credential.
api_key = dbutils.secrets.get("aspire-analyticsprod", "fieldroutes-analytics-payrix-api")
headers = {'APIKEY': api_key,'Accept': 'application/json'}

#-------------------------------------------------------------------------------------------------------------------

# Initialize Spark session
# Created before it is configured below, so this cell does not rely on a `spark`
# global already existing when it runs.
spark = SparkSession.builder.appName("Payrix Data Load").getOrCreate()

#-------------------------------------------------------------------------------------------------------------------

# Configure Storage Account Details
storage_account_name = dbutils.secrets.get("aspire-analyticsprod", "greenindustrydeltalake-storagename")
container_name = 'fieldroutes'
data_channel = "payrix"
medallion_stage_destination = "bronze"
storage_accountkey = dbutils.secrets.get("aspire-analyticsprod", "adls-greenindustrydeltalake-accesskey")

# Set ADLS Config: register the storage account key so abfss:// paths on this account can be accessed
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", f"{storage_accountkey}")

# Set data destination adls path: <container>@<account>/<data_channel>/<medallion stage>
destination_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/{data_channel}/{medallion_stage_destination}"

In [None]:
def fetch_data_with_retry(url, headers, params, resource_name, retries=2, timeout=30):
    """GET ``url`` and return its decoded JSON body, re-trying failed attempts.

    Args:
        url: Endpoint to request.
        headers: HTTP headers sent with every attempt.
        params: Query-string parameters sent with every attempt.
        resource_name: Label used only in the printed progress/error messages.
        retries: Total number of attempts to make (default 2).
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The parsed JSON payload of the first successful response, or ``None``
        when every attempt failed, the server answered 400 or 401 (these are
        not retried), or the body could not be decoded as JSON.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.Timeout:
            # Timeouts are treated as transient: report and try again.
            print(f"Attempt {attempt+1} timed out for {resource_name}")
            continue
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception itself rather than from a local
            # that may not be bound if the error was raised before assignment.
            status_code = getattr(e.response, "status_code", None)
            print(f"Attempt {attempt+1} failed with status code {status_code} for {resource_name}: {str(e)}")
            if status_code == 401:  # Unauthorized
                print("Invalid authentication, please check your API key.")
                break
            if status_code == 400:  # Bad Request
                print("Bad request, please check your request parameters.")
                break
            continue
        except requests.exceptions.RequestException as e:
            # Any other requests-level failure (connection error, invalid URL, ...).
            print(f"Attempt {attempt+1} failed for {resource_name} due to a connection error: {str(e)}")
            continue

        # Decode outside the request try-block so a malformed body is reported
        # as such instead of escaping uncaught or being labelled a connection error.
        try:
            return response.json()
        except ValueError as e:
            print(f"Attempt {attempt+1} returned a non-JSON body for {resource_name}: {str(e)}")

    return None


In [None]:
# Pull data one day at a time, beginning at the start date set inside fetch_and_save_data (currently 2024-05-13).

def fetch_and_save_data(api_endpoint, resource_name, destination_path, params=None,
                        start_date=None, end_date=None):
    """Fetch one resource day by day and append the raw records to a Delta table.

    For every calendar day from ``start_date`` through ``end_date`` the endpoint
    is queried with ``search=created[equals]=<YYYY-MM-DD>``. Each returned record
    is serialised to a JSON string, and the collected strings are appended as a
    single ``json_string`` column to ``{destination_path}/{resource_name}``.

    Relies on the module-level ``headers``, ``spark`` and ``fetch_data_with_retry``.

    Args:
        api_endpoint: Full URL of the resource endpoint.
        resource_name: Resource label; used in messages and as the output sub-folder.
        destination_path: Base path under which the Delta table is written.
        params: Optional extra query parameters. The mapping is copied, so the
            caller's dict is never modified.
        start_date: First day to fetch (``datetime``). Defaults to 2024-05-13.
        end_date: Last day to fetch (``datetime``). Defaults to ``datetime.now()``.

    Returns:
        None. Errors while building or writing the DataFrame are printed, not raised.
    """
    # Work on a private copy: a shared mutable default (or the caller's own dict)
    # must not pick up the per-day 'search' filter written below.
    request_params = dict(params) if params else {}
    day = datetime(2024, 5, 13) if start_date is None else start_date
    last_day = datetime.now() if end_date is None else end_date

    print(f"Data fetch for {resource_name} started.")

    data_list = []  # JSON strings for every record collected across all days

    # NOTE(review): only one response per day is read. If the API pages its results,
    # later pages are not fetched here -- TODO confirm against the API's paging rules.
    while day <= last_day:
        date_str = day.strftime('%Y-%m-%d')
        # Advance up front so every path below (including `continue`) moves forward.
        day += timedelta(days=1)

        request_params['search'] = f"created[equals]={date_str}"
        print(f"Fetching data from {date_str}")
        data = fetch_data_with_retry(api_endpoint, headers, request_params, resource_name)

        if data is None:
            # The request failed on every attempt; stop here and keep what was collected.
            print(f"Stopping fetch for {resource_name}: no response for {date_str}.")
            break

        # Tolerate payloads without the expected 'response' / 'data' keys.
        body = data.get('response') if isinstance(data, dict) else None
        records = body.get('data') if isinstance(body, dict) else None
        if not records:
            # Nothing was created on this day; later days may still hold data.
            continue

        # Store each record as its own JSON string.
        data_list.extend(json.dumps(item) for item in records)

    try:
        # One string column named 'json_string', one row per record.
        df = spark.createDataFrame(data_list, StringType()).toDF("json_string")

        # Append the rows to the Delta table for this resource.
        df.write.format("delta").mode("append").save(f"{destination_path}/{resource_name}")

        # The row count equals the number of collected strings; no extra Spark job needed.
        print(f"Data for {resource_name} fetched and saved to {destination_path}/{resource_name}, rows count: {len(data_list)}.\n")

    except Exception as e:
        # Best effort: report the failure so the remaining resources can still be processed.
        print(f"Error processing data for {resource_name}: {str(e)}")



In [None]:
# # List of endpoints to iterate over
# endpoints = ['accounts', 'entities', 'members', 'funds', 'merchants', 'orgs', 'payouts', 'plans', 'subscriptions', 'teamLogins', 'txns', 'customers', 'invoices','billingEvents', 'billingModifiers', 'billings', 'bins', 'changeRequests', 'chargebacks', 'contacts', 'disbursements', 'divisions', 'holds', 'invoiceItems', 'invoiceLineItems', 'refunds','tokens','subscriptionTokens','mappings','logins']
#error - refunds

# Payrix resources to ingest, listed in the order they are processed.
resource_names = [
    'accounts',
    'entities',
    'members',
    'funds',
    'merchants',
    'orgs',
    'payouts',
    'plans',
    'subscriptions',
    'teamLogins',
    'txns',
    'customers',
    'invoices',
    'billingEvents',
    'billingModifiers',
    'billings',
    'bins',
    'changeRequests',
    'chargebacks',
    'contacts',
    'disbursements',
    'divisions',
    'holds',
    'invoiceItems',
    'invoiceLineItems',
    'refunds',
    'tokens',
    'subscriptionTokens',
    'mappings',
    'logins',
]

# Run the fetch-and-save step once per resource, each against its own endpoint URL.
for resource_name in resource_names:
    api_endpoint = api_url + resource_name
    fetch_and_save_data(api_endpoint, resource_name, destination_path)

Data fetch for payouts started.
Fetching data from 2024-05-13
Fetching data from 2024-05-14
Fetching data from 2024-05-15
Fetching data from 2024-05-16
Fetching data from 2024-05-17
Fetching data from 2024-05-18
Fetching data from 2024-05-19
Fetching data from 2024-05-20
Fetching data from 2024-05-21
Fetching data from 2024-05-22
Data for payouts fetched and saved to abfss://fieldroutes@[REDACTED].dfs.core.windows.net/payrix/bronze/payouts, rows count: 300.

Data fetch for plans started.
Fetching data from 2024-05-13
Fetching data from 2024-05-14
Fetching data from 2024-05-15
Fetching data from 2024-05-16
Fetching data from 2024-05-17
Fetching data from 2024-05-18
Fetching data from 2024-05-19
Fetching data from 2024-05-20
Fetching data from 2024-05-21
Fetching data from 2024-05-22
Data for plans fetched and saved to abfss://fieldroutes@[REDACTED].dfs.core.windows.net/payrix/bronze/plans, rows count: 160.

Data fetch for subscriptions started.
Fetching data from 2024-05-13
Fetching dat