# Fetch historical oil price data

## Requirements & configuration

In [311]:
# # Install required packages
# ! pip install yfinance --upgrade --no-cache-dir
# ! pip install psycopg2-binary
# ! pip install sqlalchemy

In [1]:
# # Import required packages
# import os
# import json
import pandas as pd
import yfinance as yf # https://pypi.org/project/yfinance/
import psycopg2
from sqlalchemy import create_engine
from datetime import date

In [2]:
# DB configuration
# NOTE(review): the credentials are hard-coded in this notebook; consider
# loading them from environment variables or a secrets store instead.
config = {
    'host': 'datalake.cknmu1bvrxjg.us-east-1.rds.amazonaws.com',
    'port': '5432',
    'user': 'muser',
    'password': 'datalake',
    'dbname': 'datalake',
}

# Configure cnx_string for sqlalchemy.
# The port is included explicitly so the SQLAlchemy engine targets the same
# endpoint as the psycopg2 connection, which is also given config['port'].
cnx_str = (
    f'postgresql://{config["user"]}:{config["password"]}'
    f'@{config["host"]}:{config["port"]}/{config["dbname"]}'
)

In [3]:
# Set ticker for required instrument "Brent Crude Oil".
# "BZ=F" is the symbol passed to yfinance; the resulting Ticker object is
# reused by the following cells to read metadata, news and price history.
brent = yf.Ticker("BZ=F")

## Inspect oil price data

In [312]:
# Get meta data of instrument.
# As the last expression of the cell, the value of the Ticker's `info`
# attribute is what the notebook displays.
brent.info

In [313]:
# Get latest news on instrument as list of dictionaries
# (read from the Ticker's `news` attribute and displayed as the cell output).
brent.news

In [9]:
# Get historical oil price data.
# period="max" requests the longest history available for the ticker; the
# result is kept in `hist` and transformed by the cells that follow.
hist = brent.history(period="max")
# Preview the five most recent rows.
hist.tail(5)
# Optional ad-hoc inspection helpers (uncomment as needed):
# hist.info()
# hist.shape
# hist.size
# type(hist)
# hist.isnull().any()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-04-18,111.889999,114.82,110.720001,113.160004,13187,0,0
2022-04-19,112.650002,114.050003,106.769997,107.25,18972,0,0
2022-04-20,107.720001,108.980003,104.669998,106.800003,15985,0,0
2022-04-21,107.010002,109.790001,106.779999,108.330002,13203,0,0
2022-04-22,108.639999,108.730003,105.529999,106.650002,13203,0,0


## Initial transform of the data

In [7]:
# Discard the 'Dividends' and 'Stock Splits' columns
hist = hist.drop(columns=['Dividends', 'Stock Splits'])

# Remove the last three rows (the three most recent entries), then preview
# the new end of the data
hist = hist.drop(index=hist.tail(3).index)
hist.tail(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-31,108.709999,109.360001,107.370003,107.910004,31
2022-04-01,107.669998,107.669998,107.669998,107.669998,17144
2022-04-04,104.25,108.550003,102.910004,107.529999,18406
2022-04-05,108.239998,109.849998,104.559998,106.639999,15613
2022-04-06,105.790001,108.660004,100.540001,101.07,20212


In [8]:
# Turn the 'Date' index into an ordinary column, then reduce its values to
# plain calendar dates (the time-of-day part is dropped)
hist = hist.reset_index(level=0)
hist = hist.assign(Date=pd.to_datetime(hist['Date']).dt.date)
hist.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2007-07-30,75.849998,76.529999,75.440002,75.739998,2575
1,2007-07-31,75.699997,77.169998,75.669998,77.050003,3513
2,2007-08-01,77.0,77.059998,74.860001,75.349998,3930
3,2007-08-02,75.220001,76.209999,74.269997,75.760002,6180
4,2007-08-03,75.389999,76.0,74.529999,74.75,4387


In [9]:
# Rename columns: each listed label is replaced by its lower-case form
hist = hist.rename(
    columns={
        label: label.lower()
        for label in ('Date', 'Open', 'High', 'Low', 'Close', 'Volume')
    }
)
hist

Unnamed: 0,date,open,high,low,close,volume
0,2007-07-30,75.849998,76.529999,75.440002,75.739998,2575
1,2007-07-31,75.699997,77.169998,75.669998,77.050003,3513
2,2007-08-01,77.000000,77.059998,74.860001,75.349998,3930
3,2007-08-02,75.220001,76.209999,74.269997,75.760002,6180
4,2007-08-03,75.389999,76.000000,74.529999,74.750000,4387
...,...,...,...,...,...,...
3636,2022-03-31,108.709999,109.360001,107.370003,107.910004,31
3637,2022-04-01,107.669998,107.669998,107.669998,107.669998,17144
3638,2022-04-04,104.250000,108.550003,102.910004,107.529999,18406
3639,2022-04-05,108.239998,109.849998,104.559998,106.639999,15613


## Setup DB Connection

In [10]:
# Establish connection to the database named by config['dbname'] ('datalake')
try:
    conn = psycopg2.connect(
        dbname=config['dbname'],
        user=config['user'],
        host=config['host'],
        password=config['password'],
        port=config['port']
    )
except psycopg2.Error as e:
    print("Error: Could not make the connection to the postgres database")
    print(e)
    # Re-raise: without a connection the statements below would either fail
    # with a NameError or silently reuse a stale `conn` from an earlier run.
    raise

# Create cursor
try:
    cursor = conn.cursor()
except psycopg2.Error as e:
    print("Error: Could not get the cursor to the database")
    print(e)
    # The connection is unusable for this notebook without a cursor, so
    # release it before propagating the error.
    conn.close()
    raise

# Set auto commit feature (each statement run through `cursor` is committed
# immediately, no explicit conn.commit() needed)
conn.set_session(autocommit=True)

# Create the SQLAlchemy engine used by the pandas to_sql / read_sql calls
engine = create_engine(cnx_str)

## Create table oilprice

In [11]:
# Create table oilprice.
# IF NOT EXISTS makes the statement a no-op when the table is already
# present. The columns mirror the renamed DataFrame columns:
# date, open, high, low, close, volume.
sql = """
    CREATE TABLE IF NOT EXISTS oilprice (
        date DATE,
        open float,
        high float,
        low float,
        close float,
        volume INT
        )
"""
# Executed through the psycopg2 cursor.
cursor.execute(sql)

## Insert values into table

In [12]:
# Write the DataFrame to table 'oilprice' through the SQLAlchemy engine.
# With if_exists='replace' pandas drops an existing 'oilprice' table and
# recreates it before inserting; index=False keeps the DataFrame's integer
# index out of the table.
hist.to_sql(name='oilprice', con=engine, if_exists='replace', index=False)

## Check values

In [13]:
# Show inserted values: read back the five most recent rows (ordered by
# date, newest first) as a sanity check of the insert.
sql = '''
    SELECT *
    FROM oilprice
    ORDER BY date DESC
    LIMIT 5;
    '''
# Run the query through the SQLAlchemy engine and display the resulting
# DataFrame.
test = pd.read_sql(sql, engine)
test

Unnamed: 0,date,open,high,low,close,volume
0,2022-04-06,105.790001,108.660004,100.540001,101.07,20212
1,2022-04-05,108.239998,109.849998,104.559998,106.639999,15613
2,2022-04-04,104.25,108.550003,102.910004,107.529999,18406
3,2022-04-01,107.669998,107.669998,107.669998,107.669998,17144
4,2022-03-31,108.709999,109.360001,107.370003,107.910004,31


In [14]:
# Count the rows now stored in 'oilprice'
sql = '''
    SELECT COUNT(*) cnt
    FROM oilprice;
    '''
count = pd.read_sql(sql, engine)
# The result holds a single row with a single column, so the scalar count
# is the cell at position (0, 0)
test2 = count.iat[0, 0]
test2

3641

## Close the connection

In [15]:
# Close the psycopg2 cursor and connection
cursor.close()
conn.close()

# Also release the SQLAlchemy engine's pooled connections; the engine opens
# its own connections for to_sql / read_sql, independent of `conn`.
engine.dispose()