In [0]:
%pip install gdown

Python interpreter will be restarted.
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting PySocks!=1.5.7,>=1.5.6
  Using cached PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: PySocks, tqdm, gdown
Successfully installed PySocks-1.7.1 gdown-5.2.0 tqdm-4.67.1
Python interpreter will be restarted.


In [0]:
import os
import shutil
import tempfile
from datetime import datetime

import gdown
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

class ExternalFileLogger:
    """Download external files or mount storage and log each event to a Delta table.

    Each successful operation appends one row with the columns
    ``source_type``, ``file_path`` and ``timestamp`` (all strings) to the
    Delta table stored at ``log_path``.

    NOTE(review): ``download_from_gdrive`` and ``log_mount`` call ``dbutils``,
    which is not imported in this file; it is presumably injected by the
    Databricks notebook runtime -- confirm before running elsewhere.
    """

    # Local (FUSE-style) prefix that corresponds to the ``dbfs:`` URI scheme.
    _DBFS_FUSE_ROOT = "/dbfs"
    # Per-request timeout so a stalled server cannot hang the notebook forever.
    _HTTP_TIMEOUT_SECONDS = 60

    def __init__(self, log_path="/mnt/datalake/Pipeline1/Mount_And_External_Sources/file_write_log/"):
        """Bind to the active Spark session and remember where the log lives.

        Args:
            log_path: Location of the Delta table that receives log rows.
        """
        self.spark = SparkSession.builder.getOrCreate()
        self.log_path = log_path

    def log_file_event(self, source_type: str, file_path: str):
        """Append one event row to the Delta log and print a confirmation.

        Args:
            source_type: Label (or URL) identifying where the file came from.
            file_path: Where the file was written / the mount point created.
        """
        schema = StructType([
            StructField("source_type", StringType(), True),
            StructField("file_path", StringType(), True),
            StructField("timestamp", StringType(), True),
        ])

        # Timestamp is the driver's naive local time as an ISO-8601 string;
        # kept as-is so new rows stay comparable with rows already in the log.
        event_row = [(source_type, file_path, datetime.now().isoformat())]
        log_df = self.spark.createDataFrame(event_row, schema=schema)

        log_df.write.mode("append").format("delta") \
            .option("mergeSchema", "true") \
            .save(self.log_path)
        print(f"Logged: [{source_type}] -> {file_path}")

    def _ensure_directory(self, output_path):
        """Create the parent directory of ``output_path`` if it has one.

        A bare filename has an empty parent (``os.path.dirname`` returns
        ``""``); ``os.makedirs("")`` would raise, so that case is skipped and
        the file is simply written to the current working directory.
        """
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    @staticmethod
    def _to_dbfs_uri(local_path):
        """Translate a leading ``/dbfs`` path prefix into the ``dbfs:`` scheme.

        Only the leading prefix is rewritten; later ``/dbfs`` substrings
        (e.g. ``/dbfs/mnt/dbfs_data``) are left intact. Paths that are not
        under ``/dbfs`` are returned unchanged.
        """
        root = ExternalFileLogger._DBFS_FUSE_ROOT
        if local_path == root or local_path.startswith(root + "/"):
            return "dbfs:" + local_path[len(root):]
        return local_path

    def download_from_gdrive(self, file_id, output_path):
        """Download a Google Drive file, copy it to DBFS, and log the event.

        Args:
            file_id: Google Drive file id, interpolated into the download URL.
            output_path: Local path handed to ``gdown.download``.

        Raises:
            RuntimeError: If ``gdown.download`` returns ``None`` (no file).
        """
        self._ensure_directory(output_path)
        source_url = f"https://drive.google.com/uc?id={file_id}"
        local_path = gdown.download(source_url, output_path, quiet=False)
        if local_path is None:
            # Fail with a clear message instead of AttributeError on None.
            raise RuntimeError(f"gdown could not download {source_url}")

        # Path used for the DBFS copy, the verification read, and the log row.
        dbfs_path = self._to_dbfs_uri(local_path)

        # Stage through a unique temp directory (removed on exit) rather than
        # a fixed /tmp filename that concurrent or repeated calls would clobber.
        with tempfile.TemporaryDirectory() as staging_dir:
            staged_copy = os.path.join(staging_dir, os.path.basename(local_path))
            shutil.copy(local_path, staged_copy)
            dbutils.fs.cp(f"file:{staged_copy}", dbfs_path)

        # Sanity check: Spark must be able to open the copied file as CSV.
        downloaded_df = self.spark.read.csv(dbfs_path)
        print(downloaded_df)
        print(f"Downloaded to: {output_path}")
        self.log_file_event(source_url, dbfs_path)

    def download_from_sharepoint(self, download_url: str, output_path: str):
        """Download ``download_url`` to ``output_path`` and log the event.

        Args:
            download_url: Direct download URL of the file.
            output_path: Local path the response body is written to.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status,
                so an error page is never saved and logged as a real file.
        """
        self._ensure_directory(output_path)
        response = requests.get(download_url, timeout=self._HTTP_TIMEOUT_SECONDS)
        response.raise_for_status()
        with open(output_path, "wb") as out_file:
            out_file.write(response.content)
        self.log_file_event("SharePoint", output_path)

    def log_mount(self, mount_source: str, mount_point: str, extra_configs: dict = None):
        """Mount ``mount_source`` at ``mount_point`` and log the event.

        Args:
            mount_source: Source URI passed to ``dbutils.fs.mount``.
            mount_point: Mount point passed to ``dbutils.fs.mount``.
            extra_configs: Optional configs; forwarded only when non-empty.
        """
        mount_kwargs = {"source": mount_source, "mount_point": mount_point}
        if extra_configs:
            mount_kwargs["extra_configs"] = extra_configs
        dbutils.fs.mount(**mount_kwargs)
        self.log_file_event(mount_source, mount_point)

    def show_logs(self):
        """Display every logged event, newest timestamp first, untruncated."""
        events_df = self.spark.read.format("delta").load(self.log_path)
        events_df.orderBy("timestamp", ascending=False).show(truncate=False)