### Data Engineering

Ingestion type:  Full Load

Schedule Run: Daily

Source: Yahoo Finance

Target location: raw/yfinance/petro/petro.csv

In [1]:
import pandas as pd
import yfinance as yf
import os
from datetime import datetime
from io import StringIO

In [2]:
# Defining start and end date for data extraction

# Fixed lower bound of the extraction window; the upper bound is the moment
# this cell is executed.
start_date = datetime(2019, 12, 11)
end_date = datetime.now()

In [None]:
# Extract Petro Data from Y Finance

def extract_pbr_data(ticker: str, start_date: datetime, end_date: datetime) -> pd.DataFrame:
    """Download close and adjusted-close prices for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Yahoo Finance symbol to download.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        The ``'Close'`` and ``'Adj Close'`` columns of the downloaded frame,
        with flat (single-level) column labels.
    """
    # auto_adjust is pinned explicitly: recent yfinance releases default it to
    # True, in which case the 'Adj Close' column selected below is not returned.
    df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)
    # Recent releases may return (field, ticker) MultiIndex columns even for a
    # single symbol; keep only the field level so later renames and
    # df['close'] lookups operate on plain column labels.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    return df[['Close', 'Adj Close']]

# Fetch the PBR series and switch its columns to snake_case in one step.
petro = extract_pbr_data('PBR', start_date, end_date).rename(
    columns={'Close': 'close', 'Adj Close': 'adj_close'}
)

petro.head()

In [None]:
# Extract Brent Crude Oil Data from Y Finance

def extract_brent_data(ticker: str, start_date: datetime, end_date: datetime) -> pd.DataFrame:
    """Download the closing price series for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Yahoo Finance symbol to download.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        A one-column DataFrame holding ``'Close'``, with flat column labels.
    """
    # Pin auto_adjust so 'Close' does not silently switch meaning when the
    # library default changes between yfinance releases.
    df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)
    # Recent releases may return (field, ticker) MultiIndex columns even for a
    # single symbol; keep only the field level so callers get a plain 'Close'.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    return df[['Close']]

# Fetch the Brent series and give its close column a source-specific name so
# it does not collide with the other close columns when the frames are joined.
brent = extract_brent_data('BZ=F', start_date, end_date).rename(
    columns={'Close': 'close_brent'}
)

brent.head()

In [None]:
# Extract USD/BRL Quotation from Y Finance

def extract_usd_data(ticker: str, start_date: datetime, end_date: datetime) -> pd.DataFrame:
    """Download the closing quotation series for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Yahoo Finance symbol to download.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        A one-column DataFrame holding ``'Close'``, with flat column labels.
    """
    # Pin auto_adjust so 'Close' does not silently switch meaning when the
    # library default changes between yfinance releases.
    df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)
    # Recent releases may return (field, ticker) MultiIndex columns even for a
    # single symbol; keep only the field level so callers get a plain 'Close'.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    return df[['Close']]

# Call the USD/BRL extractor defined in this cell rather than the Brent helper,
# so a later change to either function only affects its own series.
usd = extract_usd_data('USDBRL=X', start_date, end_date)
usd = usd.rename(columns={'Close': 'close_usd'})

usd.head()

In [None]:
# Join the three datasets on their index

# Two successive index-on-index merges with the default join type, so only
# index labels present in all three frames are kept.
data = pd.merge(
    pd.merge(petro, brent, left_index=True, right_index=True),
    usd,
    left_index=True,
    right_index=True,
)

data.head()

In [None]:
# Adding calendar/seasonal features

def add_seasonal(df:pd.DataFrame) -> pd.DataFrame:
    """Attach calendar columns read from the frame's index.

    Adds ``day``, ``d_week``, ``month``, ``quarter`` and ``year``, each taken
    from the matching attribute of ``df.index``. The columns are written into
    ``df`` itself, and that same object is returned.
    """
    # (new column name, index attribute it is read from), in insertion order.
    calendar_parts = (
        ("day", "day"),
        ("d_week", "dayofweek"),
        ("month", "month"),
        ("quarter", "quarter"),
        ("year", "year"),
    )
    for column, attribute in calendar_parts:
        df[column] = getattr(df.index, attribute)
    return df

# add_seasonal writes the calendar columns into `data` and returns that same
# object, so the reassignment keeps the name bound to the enriched frame.
data = add_seasonal(data)
data.head()

In [None]:
# Creating lags from day 1 to day 7

def add_lags(df:pd.DataFrame, start:int, end:int) -> pd.DataFrame:
    """Add one ``lag_<k>`` column per ``k`` in ``range(start, end)``.

    Each column holds the ``close`` column shifted down by ``k`` rows. ``end``
    is exclusive. The columns are written into ``df`` itself, and that same
    object is returned.
    """
    closing = df['close']
    lagged = {f"lag_{offset}": closing.shift(offset) for offset in range(start, end)}
    for column, series in lagged.items():
        df[column] = series
    return df

# The end bound is exclusive (add_lags iterates range(start, end)), so 1..8
# produces the columns lag_1 through lag_7.
data = add_lags(data, 1, 8)
data.head()

In [None]:
# Creating rolling-window mean and standard deviation features

def add_rollings(df:pd.DataFrame) -> pd.DataFrame:
    """Add rolling mean and standard deviation of the previous rows' close.

    For window sizes 2, 7 and 14 this adds ``avg_<n>`` and ``std_<n>``. Both
    are computed over ``close`` shifted down by one row, so a row's own close
    is never part of its window. The columns are written into ``df`` itself,
    and that same object is returned.
    """
    previous_close = df["close"].shift(1)
    for size in (2, 7, 14):
        window = previous_close.rolling(window=size)
        df[f"avg_{size}"] = window.mean()
        df[f"std_{size}"] = window.std()
    return df

# Adds the avg_<n> / std_<n> columns for the window sizes listed inside
# add_rollings (2, 7 and 14 rows).
data = add_rollings(data)
data.head()

In [None]:
# Applying diff for petro close values

def add_diff(df:pd.DataFrame) -> pd.DataFrame:
    """Add a ``diff`` column with the previous row's change in ``close``.

    The value on row ``t`` is ``close[t-1] - close[t-2]``: ``close`` is shifted
    down one row before differencing, so a row's own close is not used. The
    first two rows are therefore NaN. The column is written into ``df``
    itself, and that same object is returned.
    """
    # No dropna() here: assigning a Series to a column aligns it to df's index,
    # so rows dropped beforehand would be filled back in as NaN anyway.
    df["diff"] = df["close"].shift(1).diff()
    return df

# Adds the 'diff' column, built from the close column shifted by one row.
data = add_diff(data)
data.head()

In [None]:
# Drop null values

def clean_data(df:pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without the rows that contain any missing value.

    The input frame is left unmodified.
    """
    return df.dropna(axis=0, how="any")

# Removes every row with at least one missing value, which includes the
# leading rows where the shifted and rolling columns have no history yet.
data = clean_data(data)
data.head()