In [0]:
import requests
import pandas as pd

In [0]:

# Declare the notebook parameters as Databricks widgets.
# The two dropdowns each offer a single choice, which is also their default.
for _widget_name, _only_choice in (
    ("git_owner", "logeshrajan"),
    ("git_repo_name", "Adventureworks-data-engineering"),
):
    dbutils.widgets.dropdown(_widget_name, _only_choice, [_only_choice])

# Free-text parameters: the repository folder to read from and the volume
# path to write to, each with its default value.
for _widget_name, _default_value in (
    ("git_source_folder_path", "data"),
    ("volume_path", "/Volumes/adventureworks/raw/source_data"),
):
    dbutils.widgets.text(_widget_name, _default_value)

In [0]:
def list_github_folder_files(owner, repo, folder_path, ref="main", timeout=30):
    """List the files directly inside a folder of a GitHub repository.

    Queries the GitHub REST "repository contents" endpoint and keeps only the
    entries whose type is "file" (sub-directories and other entry types are
    skipped).

    Args:
        owner: GitHub account or organisation that owns the repository.
        repo: Repository name.
        folder_path: Path of the folder inside the repository.
        ref: Branch, tag or commit to read from. Defaults to "main".
        timeout: Seconds to wait for the GitHub API before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts, one per file, with the keys:
          - 'file_name': the file name up to its first '.', with the
            'AdventureWorks_' prefix removed and lower-cased.
          - 'source_url': the entry's download URL.
        An empty list is returned (after printing the reason) when the
        request fails, so callers can always iterate over the result.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{folder_path}"
    try:
        response = requests.get(url, params={"ref": ref}, timeout=timeout)
    except requests.RequestException as exc:
        # Report network-level failures the same way as HTTP errors.
        print(f"Failed to fetch files: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch files: {response.status_code} - {response.text}")
        return []

    entries = response.json()
    # The contents endpoint returns a single object instead of a list when
    # the path points at a file; normalise so the filter below still works.
    if isinstance(entries, dict):
        entries = [entries]

    files = [
        {
            'file_name': entry["name"].split('.')[0].replace('AdventureWorks_', '').lower(),
            'source_url': entry["download_url"],
        }
        for entry in entries
        if entry["type"] == "file"
    ]
    print('Total no of files: ', len(files))
    return files

def pull_data_from_github_to_volume(file_detail, volume_path):
    """Copy one listed file into its own sub-folder under ``volume_path``.

    Args:
        file_detail: Dict with a 'source_url' key (where to copy from) and a
            'file_name' key (used for both the target folder and file name).
        volume_path: Base destination path; the file is written to
            ``<volume_path>/<file_name>/<file_name>.csv``.

    Returns:
        None. Prints a confirmation line followed by a blank line.
    """
    source, name = file_detail['source_url'], file_detail['file_name']
    destination = f"{volume_path}/{name}/{name}.csv"
    dbutils.fs.cp(source, destination)
    print(f"    File {name} copied successfully to {volume_path}/{name}/")
    print()

In [0]:
# Read the current widget values, in the same order the widgets were declared.
owner, repo, folder_path, volume_path = (
    dbutils.widgets.get(widget_name)
    for widget_name in (
        "git_owner",
        "git_repo_name",
        "git_source_folder_path",
        "volume_path",
    )
)

In [0]:
# Fetch the file listing, then copy each file into the volume.
# The listing helper may return None or an empty list when the GitHub request
# fails; fall back to an empty list so the loop is skipped instead of raising
# a TypeError.
file_list_urls = list_github_folder_files(owner, repo, folder_path) or []

failed_files = []
for file_detail in file_list_urls:
    print(f"Copying file: {file_detail}")
    try:
        pull_data_from_github_to_volume(file_detail, volume_path)
    except Exception as e:
        # Best effort: one failing file must not stop the remaining copies,
        # but remember it so the summary below reports the failure.
        failed_files.append(file_detail)
        print(f"    Error copying file: {file_detail}. Error: {e}")
        print()

print(f"Copied {len(file_list_urls) - len(failed_files)} of {len(file_list_urls)} files.")
if failed_files:
    print(f"Files that failed to copy: {failed_files}")