---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 2, 2024<br>
---

# Useful functions for the daily loads

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from datetime import datetime
import json

In [0]:
def get_files_paths_from_folders(folder_path, endsWith=None):
    """
    Recursively retrieves the paths of all files within the specified folder and its subfolders.

    Parameters:
    - folder_path (str): The path to the folder for which file paths are to be retrieved.
    - endsWith (str, optional): The suffix to filter files by. Defaults to None,
      in which case no filtering is applied and every file path is returned.

    Returns:
    - List[str]: A list containing the paths of all files within the specified folder and its
      subfolders, restricted to those ending with `endsWith` when a suffix is given.
    """
    # Initialize an empty list to store file paths
    my_paths = []

    # Iterate through the entries (files and subfolders) listed directly under the folder
    for entry in dbutils.fs.ls(folder_path):
        if entry.isFile():
            # If it's a file, keep its path
            my_paths.append(entry.path)
        else:
            # If it's a subfolder, collect every file below it. The suffix is deliberately
            # not passed down: the filter is applied once on the combined list below.
            my_paths.extend(get_files_paths_from_folders(entry.path))

    if endsWith is not None:
        # Filter the list of paths to include only those ending with the specified suffix
        my_paths = [path for path in my_paths if path.endswith(endsWith)]

    # Return the final list of file paths
    return my_paths

In [0]:
def delete_contents_recursively(folder_path):
    """
    Deletes a folder together with every file and subfolder stored beneath it.

    The folder is listed with `dbutils.fs.ls`; files are removed one by one, subfolders are
    handled by calling this function again, and the folder itself is removed last.

    Parameters:
    - folder_path (str): The path to the folder to be deleted.
    """
    for entry in dbutils.fs.ls(folder_path):
        if not entry.isDir():
            # Plain file: remove it directly and move on to the next entry
            dbutils.fs.rm(entry.path)
            continue

        # Subfolder: empty it and remove it through the recursive call
        delete_contents_recursively(entry.path)

    # Everything inside has been removed, so the folder itself can go now
    dbutils.fs.rm(folder_path)

In [0]:
def ingest_and_transform_to_parquet(files_paths, prod_folder_path, stock_name):
    """
    Copies each source file to a year/month-partitioned destination under the production folder.

    The year and month are read from the two folder names directly above each file, which
    are expected to look like `.../year=<YYYY>/month=<M>/<file>`. Each file is copied as-is
    with `dbutils.fs.cp`; this function does not convert the file contents itself.

    Parameters:
    - files_paths (list): List of file paths to be ingested.
    - prod_folder_path (str): Base path for the output. It is concatenated directly with the
      lowercased stock name (no separator is added), so it must end with "/".
    - stock_name (str): The stock name. It is lowercased for the destination folder and used
      unchanged for the destination file name (`<stock_name>.parquet`).
    """
    for file in files_paths:
        # The two path segments just above the file name carry the partition values
        date_file = file.split("/")[-3:-1]
        year = int(date_file[0].split("=")[1])
        month = int(date_file[1].split("=")[1])

        # Build the destination file path; the month is zero-padded to two digits
        prod_file_path = f'{prod_folder_path}{stock_name.lower()}/year={year}/month={month:02}/{stock_name}.parquet'
        # Copy the file to its destination
        dbutils.fs.cp(file, prod_file_path)