---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 2, 2024<br>
---

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from datetime import datetime
import json


### Load files from Azure Blob Storage: set the data location and type


In [0]:
# storage_account_name = "mymlprojects"
# storage_key = dbutils.secrets.get(scope="<secret-scope>", key="<sas-token-name>")  # SAS token redacted: never commit credentials; revoke/rotate the token that was exposed here

# container_name = "prod"
# mount_point = "/mnt/prod"

# dbutils.fs.mount(
#   source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
#   mount_point = mount_point,
#   extra_configs = {f"fs.azure.sas.{container_name}.{storage_account_name}.blob.core.windows.net":storage_key})

In [0]:
def get_files_paths_from_folders(folder_path: str, endsWith: str = ".parquet") -> list:
    """
    Recursively retrieve the paths of all files under a folder that end with a given suffix.

    Parameters:
    - folder_path (str): The path to the folder to walk (listed with ``dbutils.fs.ls``).
    - endsWith (str, optional): The suffix to filter files by. Defaults to ".parquet".
      The same suffix is applied at every depth of the folder tree.

    Returns:
    - List[str]: The paths of all files within the specified folder and its subfolders
      that end with the specified suffix, in listing order (a subfolder's files appear
      at the position of that subfolder in its parent's listing).
    """
    my_paths = []

    # Walk every entry (file or subfolder) directly inside the folder
    for entry in dbutils.fs.ls(folder_path):
        if entry.isFile():
            # Keep only the files that match the requested suffix
            if entry.path.endswith(endsWith):
                my_paths.append(entry.path)
        else:
            # Forward the suffix explicitly: relying on the default here would
            # silently filter nested folders with ".parquet" whatever the caller asked for
            my_paths.extend(get_files_paths_from_folders(entry.path, endsWith))

    return my_paths

In [0]:
#-----------------------------------------
# Set the raw and the prod folder paths
#-----------------------------------------
raw_folder_path, prod_folder_path = "/mnt/raw/", "/mnt/prod/"

# Full path of every entry listed directly under the raw folder (no recursion)
raw_files_paths = [entry.path for entry in dbutils.fs.ls(raw_folder_path)]

In [0]:
#-----------------------------------------
# The schema
#-----------------------------------------
# Columns that are cast to float once the data is loaded
col_float = ["open", "high", "low", "close", "volume"]

# Every column is first read as a nullable string: "date" followed by the
# float columns, in that order.
schema = StructType(
    [StructField(name, StringType(), True) for name in ["date", *col_float]]
)

In [0]:
#-----------------------------------------
# Load the JSON into a Spark data frame
#-----------------------------------------
# Read the second listed raw file and bring its rows back to the driver
data = spark.read.json(raw_files_paths[1]).collect()

# Second field of the first row, as a dict
# (presumably keyed by date, one entry per trading day -- verify against the raw file)
series = data[0][1].asDict()

# Transpose so that each dict key becomes a row, then move the keys out of the
# index into the first column (mapped to "date" by the schema)
pdf = pd.DataFrame(series).T.reset_index()

df = spark.createDataFrame(pdf, schema=schema)

In [0]:
#-----------------------------------------
# Convert the columns type
#-----------------------------------------
# Target Spark type per column: "date" first, then every float column
# NOTE(review): FloatType is single precision; large "volume" values may lose
# precision -- consider a wider type if exact values matter.
target_types = {"date": DateType(), **{name: FloatType() for name in col_float}}

for column_name, spark_type in target_types.items():
    # Replacing a column with withColumn keeps its position in the frame
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

In [0]:
# Print df's schema (column names, types and nullability).
# NOTE(review): the recorded output lists "year" and "month", which are only
# added in the following cell -- this cell was presumably last run after it.
df.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [0]:
# Derive the partition columns from the date: calendar year and month number
date_col = F.col("date")
df = (
    df
    .withColumn("year", F.year(date_col))
    .withColumn("month", F.month(date_col))
)

In [0]:
# Preview the first 10 January rows. "month" is an integer column (built with
# F.month), so compare it with the integer 1 instead of the string "01", which
# only matched through an implicit string-to-number cast.
df.filter(F.col("month") == 1).show(10)

+----------+-----+-----+-----+-----+---------+----+-----+
|      date| open| high|  low|close|   volume|year|month|
+----------+-----+-----+-----+-----+---------+----+-----+
|2000-01-03| 81.5|89.56| 80.0|89.38|1.61176E7|2000|    1|
|2000-01-04|85.38| 91.5|81.75|81.94|1.74874E7|2000|    1|
|2000-01-05|70.75|75.13|69.63|71.75|3.84574E7|2000|    1|
|2000-01-06|71.31|72.69| 64.0|65.56| 1.8752E7|2000|    1|
|2000-01-07| 67.0| 70.5|66.19|69.56|1.05054E7|2000|    1|
|2000-01-10|72.56|72.63|65.56|69.19|1.47579E7|2000|    1|
|2000-01-11|66.88| 70.0| 65.0|66.75|1.05327E7|2000|    1|
|2000-01-12|67.88| 68.0| 63.0|63.56|1.08045E7|2000|    1|
|2000-01-13|64.94|67.19|63.13|65.94|1.04481E7|2000|    1|
|2000-01-14|66.75| 68.5| 64.0|64.25|6853600.0|2000|    1|
+----------+-----+-----+-----+-----+---------+----+-----+
only showing top 10 rows



In [0]:
# Temporary staging folder for the partitioned parquet output
temp_folder = prod_folder_path+"tempo/"

# Repartition on the partition columns before writing so that all rows of a given
# (year, month) live in one Spark partition and therefore land in a single part
# file per year=/month= directory. Without this, a directory can contain several
# part files, and copying each of them to one fixed file name afterwards keeps
# only the last one (silent data loss).
(
    df.repartition("year", "month")
    .write.partitionBy(["year", "month"])
    .mode("overwrite")
    .parquet(temp_folder)
)

# Drop the _SUCCESS marker file written at the root of the output folder
dbutils.fs.rm(temp_folder+"_SUCCESS")

In [0]:
# Every parquet file written under the temporary folder
files_paths = get_files_paths_from_folders(temp_folder)

for file in files_paths:
    # The two path components before the file name are the partition folders:
    # .../year=<YYYY>/month=<M>/<part file>
    year_dir, month_dir = file.split("/")[-3:-1]
    year = int(year_dir.split("=")[1])
    month = int(month_dir.split("=")[1])

    # Destination uses a zero-padded month and a fixed file name
    month_label = "{:02}".format(month)
    prod_file_path = f'{prod_folder_path}appl/year={year}/month={month_label}/APPL.parquet'
    dbutils.fs.cp(file, prod_file_path)

# Remove the temporary folder and everything in it (recursive delete)
dbutils.fs.rm(temp_folder, True)

In [0]:
# Display the folder the monthly files were copied into
prod_folder_path + "appl"

'/mnt/prod/appl'

In [0]:
%sql

In [0]:
# for file in raw_files_paths:
#     # Load the data
#     data = spark.read.json(file).collect()
#     df = pd.DataFrame(data[0][1].asDict()).T.reset_index()
#     df = spark.createDataFrame(df, schema=schema)

#     #Convert the columns type
#     # Date column
#     df = df.withColumn("date", F.col("date").cast(DateType()))
#     # Float columns
#     for col in col_float:
#         df = df.withColumn(col, F.col(col).cast(FloatType()))