# Read data directly from GitHub into Databricks tables

In [0]:
import requests
import pandas as pd
# Declare the notebook widgets that select which GitHub repository and folder
# to read. Each dropdown currently offers a single choice, which is also its
# default value.
dbutils.widgets.dropdown(
    "git_owner",
    "logeshrajan",
    ["logeshrajan"],
)
dbutils.widgets.dropdown(
    "git_repo_name",
    "Adventureworks-data-engineering",
    ["Adventureworks-data-engineering"],
)
dbutils.widgets.text("source_folder_path", "data")

# Read the current widget values back, in the same order they were declared.
owner, repo, folder_path = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("git_owner", "git_repo_name", "source_folder_path")
)

# Function to list all files in a specified GitHub folder using the GitHub API
def list_github_folder_files(owner, repo, folder_path, ref="main", timeout=30):
    """List the files directly inside a folder of a GitHub repository.

    Queries the GitHub "repository contents" endpoint and keeps only the
    entries whose ``type`` is ``"file"``.

    Args:
        owner: GitHub account that owns the repository.
        repo: Repository name.
        folder_path: Path of the folder inside the repository.
        ref: Branch, tag or commit to list. Defaults to ``"main"``.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts, one per file, with the keys:

        * ``file_name`` - the part of the file name before the first ``.``,
          with ``AdventureWorks_`` removed, lower-cased.
        * ``source_url`` - the entry's ``download_url``.

        An empty list is returned (after printing the reason) when the
        request does not succeed, so callers can always iterate the result.

    Raises:
        requests.RequestException: If the request itself cannot be completed
            (connection error, timeout, ...).
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{folder_path}"
    response = requests.get(url, params={"ref": ref}, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch files: {response.status_code} - {response.text}")
        return []

    entries = response.json()
    if not isinstance(entries, list):
        # The contents endpoint returns a single object instead of a list
        # when the path points at a file rather than a folder.
        print(f"Failed to fetch files: '{folder_path}' is not a folder")
        return []

    files = [
        {
            'file_name': entry["name"].split('.')[0].replace('AdventureWorks_', '').lower(),
            'source_url': entry["download_url"],
        }
        for entry in entries
        if entry["type"] == "file"
    ]
    print('Total no of files: ', len(files))
    return files

# Function to ingest a file from GitHub into a Databricks Unity Catalog table
def ingest_from_github(file_details):
    """Load one CSV file into the table ``adventureworks.raw.<file_name>``.

    The CSV is read with pandas (first row as header), converted to a Spark
    DataFrame and written with ``overwrite`` mode.

    Args:
        file_details: Dict with the keys ``source_url`` (location of the CSV)
            and ``file_name`` (table name to use inside ``adventureworks.raw``).

    Returns:
        ``True`` when the table was written, ``False`` when any step failed.
        Failures are printed rather than raised so that one bad file does not
        stop the remaining files from being ingested.
    """
    try:
        # Resolve both inputs up front so a malformed entry fails before any
        # download is attempted.
        source_url = file_details["source_url"]
        table_name = f"adventureworks.raw.{file_details['file_name']}"
        # read files from github
        pd_df = pd.read_csv(source_url, header=0)
        df = spark.createDataFrame(pd_df)
        # write df into adventureworks catalog
        df.write.mode("overwrite").saveAsTable(table_name)
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, and the failure is reported to the caller via the return value.
        print(f"Error: {e}")
        return False
    return True

# Fall back to an empty list so that a listing which yields nothing usable
# (None or empty) skips the loop instead of raising a TypeError.
file_list_urls = list_github_folder_files(owner, repo, folder_path) or []
for file_detail in file_list_urls:
    print(f"Ingesting file: {file_detail}")
    # Only an explicit False is treated as a failure; any other result
    # (including None) is reported as completed.
    if ingest_from_github(file_detail) is False:
        print(f"Ingestion failed for {file_detail['file_name']}")
    else:
        print(f"Ingestion completed for {file_detail['file_name']}")

In [0]:
# # from pyspark import SparkFiles
# # url_github = 'https://raw.githubusercontent.com/logeshrajan/Adventureworks-data-engineering/refs/heads/main/data/AdventureWorks_Calendar.csv'
# # spark.sparkContext.addFile(url_github)
# # df = spark.read.csv(SparkFiles.get("AdventureWorks_Calendar.csv"), inferSchema=True, header=True)
