---
Author: Mustapha Bouhsen <br>
[LinkedIn](https://www.linkedin.com/in/mustapha-bouhsen/)<br>
[Git](https://github.com/mus514)<br>
Date: February 7, 2024<br>
---


### Create a table from Azure storage for each stock


In [0]:
#-----------------------------------------
# Notebook configuration:
#   prod_folder_path - root of the prod folder (keeps its trailing slash)
#   stocks           - ticker symbols handled by this notebook
#-----------------------------------------
prod_folder_path = "/mnt/prod/"

stocks = [
    "aapl",
    "amzn",
    "googl",
    "msft",
]

In [0]:
#-----------------------------------------
# (Re)build one parquet table per stock
#-----------------------------------------
for stock in stocks:
    # Glob matching every file under the stock's year=*/month=* folders
    path = "{}{}/year=*/month=*/*".format(prod_folder_path, stock)
    # A single read call picks up all the matching parquet files
    df = spark.read.parquet(path)

    # An older version of the table is removed before it is written again
    table_already_exists = spark.catalog.tableExists(stock)
    if table_already_exists:
        spark.sql("DROP TABLE {}".format(stock))
        print("Dropped table: {}".format(stock))

    # Save the DataFrame as a parquet table named after the stock
    table_writer = df.write.format("parquet")
    table_writer.saveAsTable(stock)
    print("Table for {} is created".format(stock))

Dropped table: aapl
Table for aapl is created
Dropped table: amzn
Table for amzn is created
Dropped table: googl
Table for googl is created
Dropped table: msft
Table for msft is created


In [0]:
%sql

-- Preview the aapl table: every column, capped at 10 rows.
-- There is no ORDER BY, so which 10 rows come back is not guaranteed.
SELECT *
FROM aapl
LIMIT 10

date,open,high,low,close,volume
2019-10-31,247.24,249.17,237.26,248.76,34790520.0
2019-10-30,244.76,245.3,241.21,243.26,31130522.0
2019-10-29,248.97,249.75,242.57,243.29,35709868.0
2019-10-28,247.42,249.25,246.72,249.05,23655368.0
2019-10-25,243.16,246.73,242.88,246.58,18369296.0
2019-10-24,244.51,244.8,241.81,243.58,17916256.0
2019-10-23,242.1,243.24,241.22,243.18,19932544.0
2019-10-22,241.16,242.2,239.62,239.96,22684000.0
2019-10-21,237.52,240.99,237.32,240.51,21811568.0
2019-10-18,234.59,237.58,234.29,236.41,24248024.0


In [0]:
%sql
-- Display the yearly average closing price for aapl, earliest year first
SELECT
  year(date) AS year,
  avg(close) AS mean_stock_price_by_year
FROM aapl
GROUP BY year(date)
ORDER BY year(date)

year,mean_stock_price_by_year
1999,96.85930278689362
2000,71.74892876261757
2001,20.21911299228668
2002,19.13952378621177
2003,18.54334883462815
2004,35.52689698385814
2005,51.68272765139316
2006,70.81172754660071
2007,128.27446834974555
2008,141.9790202506446


## Creating a table of stock prices

In [0]:
%sql
-- Start from a clean slate: remove any previous version of the table
DROP TABLE IF EXISTS stocks_prices;

-- One row per date present in all four stock tables (inner joins on date),
-- with each stock's closing price in a column named after its ticker
CREATE TABLE stocks_prices AS
SELECT
  a.date,
  a.close AS aapl,
  z.close AS amzn,
  m.close AS msft,
  g.close AS googl
FROM aapl AS a
INNER JOIN amzn AS z ON a.date = z.date
INNER JOIN msft AS m ON a.date = m.date
INNER JOIN googl AS g ON a.date = g.date
ORDER BY date;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Display the combined stock prices, most recent date first
SELECT *
FROM stocks_prices
ORDER BY date DESC

date,aapl,amzn,msft,googl
2024-02-08,188.32,169.84,414.11,145.91
2024-02-07,189.41,170.53,414.05,145.54
2024-02-06,189.3,169.15,405.49,144.1
2024-02-05,187.68,170.31,405.65,143.68
2024-02-02,185.85,171.81,411.22,142.38
2024-02-01,186.86,159.28,403.78,141.16
2024-01-31,184.4,155.2,397.58,140.1
2024-01-30,188.04,159.0,408.59,151.46
2024-01-29,191.73,161.26,409.72,153.51
2024-01-26,192.42,159.12,403.93,152.185


## Creating a table containing the log returns of each stock price

%md
The log return is given by:

$$
r_t = \log\left(\frac{P_t}{P_{t-1}}\right)
$$

where $P_t$ is the stock price at time $t$ and $P_{t-1}$ is the price at the previous time step.