---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 7, 2024<br>
---

In [0]:
%run Repos/bouhsen.m@gmail.com/ML_Pipeline_Hub/library/garch_model

In [0]:
%run Repos/bouhsen.m@gmail.com/ML_Pipeline_Hub/library/daily_utilities

In [0]:
#-----------------------------------------
# load libraries
#-----------------------------------------
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as F 
from pyspark.sql.functions import udf
import pandas as pd
import numpy as np
from datetime import datetime



### Create table from Azure storage for each stock


In [0]:
#-----------------------------------------
# Set the prod folder path
#-----------------------------------------
raw_folder_path = "/mnt/raw/"
prod_folder_path = "/mnt/prod/"
clean_folder_path = "/mnt/clean/"

stocks = ["aapl", "amzn", "googl", "msft"]

In [0]:
#-----------------------------------------
# The schema
#-----------------------------------------
schema = StructType([
    StructField("date", DateType(), True),
    StructField("open", FloatType(), True),
    StructField("high", FloatType(), True),
    StructField("low", FloatType(), True),
    StructField("close", FloatType(), True),
    StructField("adj_close", FloatType(), True),
    StructField("volume", FloatType(), True)
])


In [0]:
#-----------------------------------------
# Create the daily table for each stock
#-----------------------------------------
for stock in stocks:
    # Create the path for the stock
    path = f'{clean_folder_path}{stock}.csv'
    # Load all the parquet files at once
    df = spark.read.schema(schema).option("header", "True").csv(path)
    
    # Check if the table exists
    if spark.catalog.tableExists(stock):
        # Drop the existing table
        spark.sql(f"DROP TABLE {stock}")
        print(f'Dropped table: {stock}')
    
    # Create the table
    df.write.format("parquet").saveAsTable(stock)
    print(f'Table for {stock} is created')

Dropped table: aapl
Table for aapl is created
Dropped table: amzn
Table for amzn is created
Dropped table: googl
Table for googl is created
Dropped table: msft
Table for msft is created


In [0]:
#-----------------------------------------
# Write the prices in the prod
#-----------------------------------------
# Temp folder to save temp parquet files
temp_folder = prod_folder_path + f"temp/"

# Partion files in folders by year and month
df.write.mode("overwrite").option("header", "True").csv(temp_folder)

# get all files path ending with .parquet
files_paths = get_files_paths_from_folders(temp_folder, ".csv")
            
# Copy parquet files to final destination
ingest_and_transform_to_parquet(files_paths, prod_folder_path, "prices")

# delete the temp folder
delete_contents_recursively(temp_folder)

In [0]:
%sql

--- Display the table for apple stock
SELECT *
FROM aapl
LIMIT 10

date,open,high,low,close,adj_close,volume
2004-01-02,0.384821,0.388393,0.378214,0.38,0.32170781,144642400.0
2004-01-05,0.3825,0.399821,0.3825,0.395893,0.33516282,395018400.0
2004-01-06,0.397321,0.400357,0.387679,0.394464,0.33395305,509348000.0
2004-01-07,0.394643,0.407679,0.391607,0.403393,0.34151223,586874370.0
2004-01-08,0.407857,0.42375,0.404464,0.417143,0.35315305,460303200.0
2004-01-09,0.414821,0.430893,0.406964,0.410714,0.34771028,427459200.0
2004-01-12,0.415179,0.428571,0.4125,0.42375,0.35874647,487547200.0
2004-01-13,0.441071,0.443571,0.426071,0.430714,0.36464232,679016830.0
2004-01-14,0.435714,0.438214,0.424643,0.432143,0.36585203,620043200.0
2004-01-15,0.409107,0.417857,0.401786,0.408036,0.34544304,1018208770.0


In [0]:
%sql
-- Disply the year average stock price for aapl
SELECT year(date) as year, avg(close) as mean_stock_price_by_year
FROM aapl
GROUP BY year(date)
ORDER BY year(date)

year,mean_stock_price_by_year
2004,0.6344097272034676
2005,1.66700113056198
2006,2.528951308641776
2007,4.5812108640176845
2008,5.070679018148792
2009,5.24336163108311
2010,9.280087898647974
2011,13.000154476317148
2012,20.573204307556157
2013,16.87981717170231


## Creating table for stocks prices

In [0]:
%sql
DROP TABLE IF EXISTS stocks_prices;

CREATE TABLE stocks_prices AS
SELECT aapl.date, aapl.adj_close as aapl, amzn.adj_close as amzn, msft.adj_close as msft, googl.adj_close as googl
FROM aapl
JOIN amzn ON aapl.date = amzn.date
JOIN msft ON aapl.date = msft.date
JOIN googl ON aapl.date = googl.date
ORDER BY date;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Disply the stock prices
SELECT *
FROM stocks_prices
ORDER BY date DESC
LIMIT 10

date,aapl,amzn,msft,googl
2024-02-12,187.15,172.34,415.26,147.53
2024-02-09,188.85,174.45,420.55,149.0
2024-02-08,188.08002,169.84,414.11,145.91
2024-02-07,189.16862,170.53,414.05,145.54
2024-02-06,189.05876,169.15,405.49,144.1
2024-02-05,187.44081,170.31,405.65,143.68
2024-02-02,185.61316,171.81,411.22,142.38
2024-02-01,186.62187,159.28,403.78,141.16
2024-01-31,184.165,155.2,397.58,140.1
2024-01-30,187.80035,159.0,408.59,151.46
