---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 2, 2024<br>
---

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from datetime import datetime
import json


### Load files from Azure Blob Storage: set the data location and type


In [0]:
# storage_account_name = "mymlprojects"
# NOTE: the SAS token that was hard-coded here has been redacted; never commit credentials to source.
# storage_key = dbutils.secrets.get(scope="<secret-scope>", key="<sas-token-key>")

# container_name = "prod"
# mount_point = "/mnt/prod"

# dbutils.fs.mount(
#   source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
#   mount_point = mount_point,
#   extra_configs = {f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net":storage_key})

In [0]:
#-----------------------------------------
# Set the raw and the prod folder paths
#-----------------------------------------
raw_folder_path = "/mnt/raw/"
prod_folder_path = "/mnt/prod/"

# Full path of every entry found directly under the raw folder
raw_files_paths = [entry.path for entry in dbutils.fs.ls(raw_folder_path)]

In [0]:
#-----------------------------------------
# The schema
#-----------------------------------------
# Columns that are cast from string to float after loading
col_float = ["open", "high", "low", "close", "volume"]

# Every field is read as a nullable string first: "date" followed by the
# columns listed in col_float, in that order.
schema = StructType(
    [StructField(name, StringType(), True) for name in ["date"] + col_float]
)

In [0]:
#-----------------------------------------
# Load the JSON into a Spark data frame
#-----------------------------------------
# Only the second raw file (index 1) is read; its rows are collected locally.
data = spark.read.json(raw_files_paths[1]).collect()

# The second field of the first row is turned into a dict and loaded into pandas.
# Transposing makes each dict key a row, and reset_index() moves those keys into
# the first column (presumably the dates, matching the "date" schema field --
# TODO confirm against the raw file layout).
df = spark.createDataFrame(
    pd.DataFrame(data[0][1].asDict()).T.reset_index(),
    schema=schema,
)

In [0]:
#-----------------------------------------
# Cast the string columns to their final types
#-----------------------------------------
# "date" becomes a DateType column; every column listed in col_float becomes FloatType.
# NOTE(review): FloatType is single precision -- confirm that is enough for "volume".
for col in ["date"] + col_float:
    df = df.withColumn(
        col,
        F.col(col).cast(DateType() if col == "date" else FloatType()),
    )

In [0]:
# Print the DataFrame schema to check the column types after the casts
df.printSchema()

In [0]:
# Derive "year" and "month" columns from the "date" column
df = (
    df
    .withColumn("year", F.year(F.col("date")))
    .withColumn("month", F.month(F.col("date")))
)

In [0]:
# Preview the DataFrame rows, including the derived "year" and "month" columns
df.show()

In [0]:
# df.write.partitionBy(["year", "month"]).mode("overwrite").parquet(prod_folder_path+"/tempo")

In [0]:
# for file in raw_files_paths:
#     # Load the data
#     data = spark.read.json(file).collect()
#     df = pd.DataFrame(data[0][1].asDict()).T.reset_index()
#     df = spark.createDataFrame(df, schema=schema)

#     #Convert the columns type
#     # Date column
#     df = df.withColumn("date", F.col("date").cast(DateType()))
#     # Float columns
#     for col in col_float:
#         df = df.withColumn(col, F.col(col).cast(FloatType()))
    



In [0]:
# Build the temp folder path. Any trailing "/" is stripped from the prod folder
# path first, so the result never contains a doubled "//" separator
# (prod_folder_path is defined with a trailing slash).
temp_folder = prod_folder_path.rstrip("/") + "/tempo/"

# Remove the "_SUCCESS" file from the temp folder
dbutils.fs.rm(temp_folder + "_SUCCESS")

In [0]:
def get_files_paths_from_folders(folder_path):
    """
    Recursively retrieves the paths of all files within the specified folder and its subfolders.

    Parameters:
    - folder_path (str): The path to the folder for which file paths are to be retrieved.

    Returns:
    - List[str]: A list containing the paths of all files within the specified folder and
      its subfolders, in the order `dbutils.fs.ls` lists them (a subfolder's files appear
      at the position of that subfolder). Empty if the folder contains no files.
    """
    file_paths = []

    # Walk every entry (file or subfolder) listed directly under the folder
    for entry in dbutils.fs.ls(folder_path):
        if entry.isFile():
            # A file: record its path
            file_paths.append(entry.path)
        else:
            # A subfolder: collect its files recursively. extend() grows the list in
            # place instead of re-copying everything gathered so far at each subfolder.
            file_paths.extend(get_files_paths_from_folders(entry.path))

    return file_paths

In [0]:
# Inspect the last file path found under the temp folder
get_files_paths_from_folders(temp_folder)[-1]

In [0]:
# List the contents of the first entry of the temp folder ([0][0] is that entry's path;
# presumably a partition subfolder -- TODO confirm)
dbutils.fs.ls(dbutils.fs.ls(temp_folder)[0][0])