# Example: building a MACD signal

In [1]:
from pprint import pprint

import pandas as pd

from aika import putki
from aika.putki import CalendarChecker
from aika.putki.context import Defaults, GraphContext
from aika.putki.graph import Graph, TaskModule
from aika.putki.runners import LocalRunner
from aika.putki.interface import Dependency
from aika.time.calendars import TimeOfDayCalendar
from aika.time.time_of_day import TimeOfDay
from aika.time.time_range import TimeRange#
from aika.time.timestamp import Timestamp
from aika.utilities.fin.macd import macd

from aika.datagraph.persistence.hash_backed import HashBackedPersistanceEngine
from aika.datagraph.persistence.mongo_backed import MongoBackedPersistanceEngine
from pandas_datareader import data
import typing as t
from pandas.tseries.offsets import BDay
import pymongo
import yfinance as yf
# Presumably patches pandas_datareader so that `data.get_data_yahoo` (used
# below) is served by yfinance — confirm against the installed yfinance version.
yf.pdr_override()

## Set up

### Create an engine
We support two kinds of engine at the moment: one purely in memory, backed by a hash map, and one that stores the data permanently in MongoDB. You can use either here.

In [2]:
# In-memory, hash-map backed engine.
engine = HashBackedPersistanceEngine()
# Alternative: store the data permanently in MongoDB by using this instead.
# engine = MongoBackedPersistanceEngine(
#     pymongo.MongoClient(),
#     database_name="research_foo3"
# )


### Create a context
A context is the user interface for creating tasks. It mainly functions as a placeholder for information that is common to all or nearly all tasks: in this case the code version, the storage engine, and the `time_range`.

In [3]:
# Shared defaults for the tasks created through this context: the code
# version, the storage engine and the time range.
context = GraphContext(
    defaults=Defaults(
        version="research", 
        persistence_engine=engine, 
        time_range= TimeRange("2018", "2020")
    )
)

## Create your first function. 
This just uses the pandas-datareader project to pull some stock data from Yahoo.

In [4]:
def pull_google_finance_data(
    tickers : t.List,
    time_range,
):
    """Download price history for each ticker and return its "Adj Close" column.

    Despite the function name, the data is requested through
    ``data.get_data_yahoo``.

    Parameters
    ----------
    tickers
        Iterable of ticker symbols; each one is requested separately.
    time_range
        Object exposing ``start`` and ``end``, used as the download bounds.

    Returns
    -------
    The "Adj Close" column of the combined download, with an unnamed index
    whose entries have been passed through ``Timestamp``.

    Notes
    -----
    The per-ticker frames are stacked row-wise (the ``pd.concat`` default), so
    with several tickers the result holds one run of rows per ticker, in the
    order given, and dates repeat in the index.
    NOTE(review): confirm that stacking, not one column per ticker, is intended.
    """
    per_ticker_frames = [
        data.get_data_yahoo(symbol, start=time_range.start, end=time_range.end)
        for symbol in tickers
    ]
    prices = pd.concat(per_ticker_frames)
    prices.index.name = None
    # Re-wrap every index entry as a Timestamp; per the original author's note
    # this ensures the index has a timezone.
    prices.index = prices.index.map(Timestamp)
    adjusted_close = prices["Adj Close"]
    return adjusted_close

In [5]:
# Call the function directly, outside of any task, to see what it returns.
pull_google_finance_data(["AAPL", "GOOGL"], TimeRange("2018", "2020"))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


2018-01-02 00:00:00+00:00    40.888065
2018-01-03 00:00:00+00:00    40.880943
2018-01-04 00:00:00+00:00    41.070835
2018-01-05 00:00:00+00:00    41.538437
2018-01-08 00:00:00+00:00    41.384159
                               ...    
2019-12-24 00:00:00+00:00    67.221497
2019-12-26 00:00:00+00:00    68.123497
2019-12-27 00:00:00+00:00    67.732002
2019-12-30 00:00:00+00:00    66.985497
2019-12-31 00:00:00+00:00    66.969498
Name: Adj Close, Length: 1006, dtype: float64

Now, instead, we can create a task to do that. The task needs a name, the function to run, and the parameters. Finally, because this is a "source" node of the graph, we must specify a completion checker. A completion checker specifies the expected index for the data; in this case, we are saying that we expect it to have a value every business day at midnight.

In [6]:
# Source task wrapping pull_google_finance_data. Only `tickers` is passed here;
# `time_range` is presumably filled in from the context defaults — confirm.
# The completion checker describes the expected index: 00:00 UTC on the calendar.
close_prices = context.time_series_task(
    "close_prices",
    pull_google_finance_data,
    tickers=("AAPL", "GOOGL"),
    completion_checker=CalendarChecker(
        TimeOfDayCalendar(time_of_day=TimeOfDay.from_str("00:00 [UTC]"))
    ),
)

Before we run the task it will evaluate as "not complete", and after we run it, it will evaluate as "complete". Finally, we pull the data back from the engine and display it with the `read` command.

In [7]:
# Completion status before the run, then run, then status again.
display(close_prices.complete())
close_prices.run()
display(close_prices.complete())
# Read the stored result back from the engine.
close_prices.read()

False

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


True

2018-01-02 00:00:00+00:00    40.888065
2018-01-03 00:00:00+00:00    40.880947
2018-01-04 00:00:00+00:00    41.070831
2018-01-05 00:00:00+00:00    41.538441
2018-01-08 00:00:00+00:00    41.384155
                               ...    
2019-12-24 00:00:00+00:00    67.221497
2019-12-26 00:00:00+00:00    68.123497
2019-12-27 00:00:00+00:00    67.732002
2019-12-30 00:00:00+00:00    66.985497
2019-12-31 00:00:00+00:00    66.969498
Name: Adj Close, Length: 1006, dtype: float64

## Let's do some MACD using the library functions

MACD calculations are index preserving, as are most time series operations, so here we do not need to specify a completion checker; it is inferred from the parent task, in this case `close_prices`. Note as well that we can store two different datasets in the same "node" of the data graph; all that is required is that their parameters are different.

In [8]:
# Two tasks under the same node name "macd"; they differ only in their span
# parameters. No completion checker is given for either task.
macd_one = context.time_series_task(
    "macd",
    macd,
    prices=close_prices,
    fast_span=10,
    slow_span=20,
    vol_span=30
)

macd_two = context.time_series_task(
    "macd",
    macd,
    prices=close_prices,
    fast_span=20,
    slow_span=40,
    vol_span=60
)
# Show the completion checker the task ended up with.
macd_one.completion_checker

CalendarChecker(calendar=TimeOfDayCalendar(time_of_day=TimeOfDay(time=datetime.time(0, 0), tz=<UTC>), freq=<BusinessDay>, maximum_interval=<Week: weekday=None>))

In [9]:
# Completion status before the run, then run the task and read its result.
display(macd_one.complete())
macd_one.run()
macd_one.read()

False

2018-01-02 00:00:00+00:00         NaN
2018-01-03 00:00:00+00:00         NaN
2018-01-04 00:00:00+00:00         NaN
2018-01-05 00:00:00+00:00         NaN
2018-01-08 00:00:00+00:00         NaN
                               ...   
2019-12-24 00:00:00+00:00    1.235258
2019-12-26 00:00:00+00:00    1.173843
2019-12-27 00:00:00+00:00    1.121572
2019-12-30 00:00:00+00:00    0.875007
2019-12-31 00:00:00+00:00    0.730185
Name: Adj Close, Length: 1006, dtype: float64

In [10]:
# Completion status before and after the run, then read the stored result.
display(macd_two.complete())
macd_two.run()
display(macd_two.complete())
macd_two.read()

False

True

2018-01-02 00:00:00+00:00         NaN
2018-01-03 00:00:00+00:00         NaN
2018-01-04 00:00:00+00:00         NaN
2018-01-05 00:00:00+00:00         NaN
2018-01-08 00:00:00+00:00         NaN
                               ...   
2019-12-24 00:00:00+00:00    1.464027
2019-12-26 00:00:00+00:00    1.439976
2019-12-27 00:00:00+00:00    1.430197
2019-12-30 00:00:00+00:00    1.344861
2019-12-31 00:00:00+00:00    1.297514
Name: Adj Close, Length: 1006, dtype: float64

## Branching Engines

Sometimes we will have one engine that already contains the data that we need, and want to run some experiments that run in a different engine.

In [11]:
# A second, separate in-memory engine.
engine_two = HashBackedPersistanceEngine()
# Same function and parameters as macd_two, but with the persistence engine
# overridden for this task.
macd_three = context.time_series_task(
    "macd",
    macd,
    prices=close_prices,
    fast_span=20,
    slow_span=40,
    vol_span=60,
    persistence_engine=engine_two
)

In [12]:
# Run the task, then show its completion status.
macd_three.run()
display(macd_three.complete())

True

The new dataset is now stored in `engine_two` rather than in the original `engine`, as the checks below show.

In [13]:
# Ask each engine whether it holds the output of macd_three.
display(engine.exists(macd_three.output))
display(engine_two.exists(macd_three.output))

False

True

In [14]:
def describe(data):
    """Return the summary produced by ``data.describe()``.

    ``data`` may be any object exposing a ``describe()`` method.
    """
    summary = data.describe()
    return summary

In [15]:
# Static task applying `describe` to macd_two; run it and read the result back.
describe_two = context.static_task(
    "macd.describe",
    describe,
    data=macd_two,
)
describe_two.run()
describe_two.read()

count    826.000000
mean       0.328492
std        0.971663
min       -2.034219
25%       -0.444699
50%        0.422194
75%        1.023426
max        2.726097
Name: Adj Close, dtype: float64

In [16]:
# Same static task, this time fed by macd_three. No persistence engine is
# passed to this task explicitly.
describe_three = context.static_task(
    "macd.describe",
    describe,
    data=macd_three,
)
describe_three.run()

Note that because we changed the engine of its dependency, this task now also uses `engine_two` to store its output, even though we never passed that engine to it explicitly.

In [17]:
# Compare the engine of the task's output with engine_two.
describe_three.output.engine == engine_two

True