---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 14, 2024<br>
---

In [0]:
%run Repos/bouhsen.m@gmail.com/ML_Pipeline_Hub/library/garch_model

In [0]:
%run Repos/bouhsen.m@gmail.com/ML_Pipeline_Hub/library/daily_utilities

## Combine the stocks tables

In [0]:
%sql
-- Preview the daily price and volume columns of the aapl table.
SELECT
    date,
    open,
    high,
    low,
    close,
    adj_close,
    volume
FROM aapl
    

date,open,high,low,close,adj_close,volume
2004-01-02,0.384821,0.388393,0.378214,0.38,0.32170776,144642400.0
2004-01-05,0.3825,0.399821,0.3825,0.395893,0.33516285,395018400.0
2004-01-06,0.397321,0.400357,0.387679,0.394464,0.333953,509348000.0
2004-01-07,0.394643,0.407679,0.391607,0.403393,0.34151232,586874370.0
2004-01-08,0.407857,0.42375,0.404464,0.417143,0.3531531,460303200.0
2004-01-09,0.414821,0.430893,0.406964,0.410714,0.34771025,427459200.0
2004-01-12,0.415179,0.428571,0.4125,0.42375,0.35874656,487547200.0
2004-01-13,0.441071,0.443571,0.426071,0.430714,0.36464217,679016830.0
2004-01-14,0.435714,0.438214,0.424643,0.432143,0.36585206,620043200.0
2004-01-15,0.409107,0.417857,0.401786,0.408036,0.34544304,1018208770.0


In [0]:
%sql
-- Rebuild the combined stocks table from the per-ticker tables.
DROP TABLE IF EXISTS stocks;

CREATE TABLE stocks AS

-- One branch per ticker; the literal `stock` column tags each row with the
-- table it came from.
-- UNION ALL is used because the constant stock label makes the branches
-- mutually disjoint, so the DISTINCT pass implied by plain UNION is wasted
-- work. Note: unlike UNION, this keeps exact duplicate rows that exist
-- inside a single source table.
SELECT
    date,
    open,
    high,
    low,
    close,
    adj_close,
    volume,
    'aapl' AS stock
FROM aapl

UNION ALL

SELECT
    date,
    open,
    high,
    low,
    close,
    adj_close,
    volume,
    'amzn' AS stock
FROM amzn

UNION ALL

SELECT
    date,
    open,
    high,
    low,
    close,
    adj_close,
    volume,
    'googl' AS stock
FROM googl

UNION ALL

SELECT
    date,
    open,
    high,
    low,
    close,
    adj_close,
    volume,
    'msft' AS stock
FROM msft

num_affected_rows,num_inserted_rows


## Save the stocks table to a .csv file

In [0]:
# Base folder for the tables (prod path); note the trailing slash, which
# lets sub-paths be appended by plain string concatenation.
tables_folder_path = "/mnt/tables/"

In [0]:
#-----------------------------------------
# Load the stocks table and export it through a temporary CSV folder
#-----------------------------------------
df = spark.sql("SELECT * FROM stocks")

# Temp folder that receives the intermediate CSV output
temp_folder = tables_folder_path + "temp/"

# Write the data frame as CSV with a header row; coalesce(1) reduces it to a
# single partition before writing, and "overwrite" replaces earlier output
df.coalesce(1).write.mode("overwrite").option("header", True).csv(temp_folder)

# Collect the paths of the files ending with .csv in the temp folder
files_paths = get_files_paths_from_folders(temp_folder, ".csv")

# Hand the CSV files to the ingestion helper, with the tables folder as the
# destination and "stocks" as the name.
# NOTE(review): the helper is not defined in this notebook (presumably it
# comes from the %run libraries); its name suggests it converts the files to
# parquet -- confirm the final output format.
ingest_and_transform_to_parquet(files_paths, tables_folder_path, "stocks")

# Clean up the temp folder
delete_contents_recursively(temp_folder)

In [0]:
# Print the column names, types and nullability of the stocks data frame
df.printSchema()

root
 |-- date: date (nullable = true)
 |-- open: float (nullable = true)
 |-- high: float (nullable = true)
 |-- low: float (nullable = true)
 |-- close: float (nullable = true)
 |-- adj_close: float (nullable = true)
 |-- volume: float (nullable = true)
 |-- stock: string (nullable = true)



## Unpivot the returns table

In [0]:
%sql

-- Reshape the wide returns table (one column per ticker) into long form:
-- one row per (date, ticker), with the ticker name in ColumnName and its
-- value in Value.
SELECT
    date,
    Value,
    ColumnName
FROM (
    -- date must be projected here: the outer query selects it, and UNPIVOT
    -- passes through every column that is not listed in its IN (...) clause.
    -- NOTE(review): assumes returns has a date column -- confirm.
    SELECT
        date,
        aapl,
        amzn,
        googl,
        msft
    FROM returns
) AS t
UNPIVOT (
    Value FOR ColumnName IN (aapl, amzn, googl, msft)
) AS unpivoted_data;