# Example of building an ML pipeline for return prediction

In [None]:
from pprint import pprint

import pandas as pd
import numpy as np
from aika import putki
from aika.putki import CalendarChecker, IrregularChecker
from aika.putki.context import Defaults, GraphContext
from aika.putki.graph import Graph, TaskModule
from aika.putki.runners import LocalRunner
from aika.putki.interface import Dependency
from aika.time.calendars import TimeOfDayCalendar, OffsetCalendar
from aika.time.time_of_day import TimeOfDay
from aika.time.time_range import TimeRange#
from aika.time.timestamp import Timestamp
from aika.utilities.fin.macd import macd, ewm_volatility
from aika.utilities.fin.returns import arithmetic_bar_returns
from aika.ml.generators.walkforward import CausalDataSetGenerator
from aika.ml.interface import Pipeline, SklearnEstimator, GenericStatelessTransformer, Dataset

from aika.datagraph.persistence.hash_backed import HashBackedPersistanceEngine
from aika.datagraph.persistence.mongo_backed import MongoBackedPersistanceEngine
from pandas_datareader import data
import typing as t
from pandas.tseries.offsets import BDay
import pymongo

In [None]:
# Persistence engine handed to the context below as the default for its tasks
# (presumably an in-memory store, going by the class name — TODO confirm).
engine = HashBackedPersistanceEngine()
# Graph context whose defaults (version label, persistence engine and time
# range "2010".."2020") are shared by the tasks created from it further down.
context = GraphContext(
    defaults=Defaults(
        version="research", 
        persistence_engine=engine, 
        time_range= TimeRange("2010", "2020")
    )
)

In [None]:
def pull_google_finance_data(
    tickers : t.List,
    time_range,
):
    """Download adjusted close prices for ``tickers`` over ``time_range``.

    Despite the function name, the data is requested from the ``"yahoo"``
    source of ``pandas_datareader``.

    Parameters
    ----------
    tickers
        Iterable of ticker symbols; it is converted to a list before the
        request is made.
    time_range
        Object exposing ``start`` and ``end`` attributes, which bound the
        request.

    Returns
    -------
    The ``"Adj Close"`` selection of the downloaded frame, with an unnamed
    index whose entries have been mapped through ``Timestamp``.
    """
    symbols = list(tickers)
    frame = data.DataReader(
        symbols,
        "yahoo",
        start=time_range.start,
        end=time_range.end,
    )
    # Clear the index name before rebuilding the index.
    frame.index.name = None
    # Per the original author's note, mapping through Timestamp makes sure
    # every index entry carries a timezone.
    frame.index = frame.index.map(Timestamp)
    return frame["Adj Close"]

# Task "close_prices": wraps pull_google_finance_data for AAPL and GOOGL.
# No time_range argument is given here; presumably the framework supplies it
# from the context defaults — TODO confirm.
close_prices = context.time_series_task(
    "close_prices",
    pull_google_finance_data,
    tickers=("AAPL", "GOOGL"),
    # Completion is checked against a calendar built from 00:00 UTC.
    completion_checker=CalendarChecker(
        TimeOfDayCalendar(time_of_day=TimeOfDay.from_str("00:00 [UTC]"))
    ),
)
# Execute the task now so that downstream tasks can use its output.
close_prices.run()

# Task "returns": arithmetic_bar_returns over the close_prices task output
# with step=1 and time_level="end".
returns = context.time_series_task(
    "returns",
    arithmetic_bar_returns,
    prices=close_prices,
    step=1,
    time_level="end"
)
returns.run()
# The value read back here is not assigned to anything; since this is not the
# last expression of the cell it is presumably not displayed either.
returns.read()

def risk_adjusted_returns(returns):
    """Divide ``returns`` by their EWM volatility lagged by one row.

    The volatility comes from ``ewm_volatility`` with ``span=30`` and is
    shifted by one row before the division, so each row is scaled by the
    previous row's volatility value.
    """
    lagged_volatility = ewm_volatility(returns, span=30).shift(1)
    return returns.divide(lagged_volatility)

# Task "returns.risk_adjusted": applies the risk_adjusted_returns function to
# the output of the returns task.
# NOTE: this assignment rebinds the name `risk_adjusted_returns` from the
# function defined just above to the task object; later cells use the name to
# refer to the task.
risk_adjusted_returns = context.time_series_task(
    "returns.risk_adjusted",
    risk_adjusted_returns,
    returns=returns,
    time_level="end"
)
risk_adjusted_returns.run()

# Task "weekly_returns": same function and prices as the "returns" task, but
# with step=5 (presumably five daily bars, i.e. one business week — TODO
# confirm against arithmetic_bar_returns).
weekly_returns = context.time_series_task(
    "weekly_returns",
    arithmetic_bar_returns,
    prices=close_prices,
    step=5,
    time_level="end"
)
weekly_returns.run()

In [None]:
def macd_multi_horizon(
    prices : pd.DataFrame,
    horizons : t.List[t.Tuple[int, int]],
    vol_span : int
):
    """Compute ``macd`` for several (fast, slow) pairs and join the results.

    Parameters
    ----------
    prices
        Price frame passed unchanged to every ``macd`` call; its column
        labels become the ``"Symbols"`` level of the output columns.
    horizons
        Iterable of ``(fast, slow)`` pairs, one ``macd`` call per pair.
    vol_span
        Forwarded to ``macd`` as its fourth positional argument.

    Returns
    -------
    A frame concatenated column-wise from the per-pair results, with a
    three-level column index named ``("Symbols", "fast", "slow")``.
    """

    def _labelled_signal(fast, slow):
        # One macd frame, relabelled so its columns also record which
        # (fast, slow) pair produced it.
        signal = macd(prices, fast, slow, vol_span)
        labels = [(name, fast, slow) for name in prices.columns]
        signal.columns = pd.MultiIndex.from_tuples(
            labels,
            names=("Symbols", "fast", "slow"),
        )
        return signal

    per_horizon = [_labelled_signal(fast, slow) for fast, slow in horizons]
    return pd.concat(per_horizon, axis=1)

# Task "all_macd": macd_multi_horizon over the close prices for five
# (fast, slow) pairs, each slow value being twice its fast value, with a
# vol_span of 90.
all_macd = context.time_series_task(
    "all_macd",
    macd_multi_horizon,
    prices=close_prices,
    horizons=(
        (10,20),
        (20,40),
        (40,80),
        (80,160),
        (160,320)
    ),
    vol_span=90
)
all_macd.run()
# Last expression of the cell, so the notebook shows the stored output.
all_macd.read()

In [None]:
# engine.delete(fitted_models.output, recursive=True)

In [None]:
from sklearn.linear_model import LinearRegression

def fill_zeros(df : pd.DataFrame):
    """Return a copy of ``df`` in which every missing value is ``0.0``."""
    filled = df.fillna(value=0.0)
    return filled

def stack(df : pd.DataFrame):
    """Move the ``"Symbols"`` column level of ``df`` into the row index."""
    symbols_in_rows = df.stack(level="Symbols")
    return symbols_in_rows

def unstack(df : pd.DataFrame):
    """Move the ``"Symbols"`` row-index level of ``df`` into the columns."""
    symbols_in_columns = df.unstack(level="Symbols")
    return symbols_in_columns

def fit_model(all_macd : pd.DataFrame, returns : pd.DataFrame):
    """Fit one linear-regression pipeline per dataset from the generator.

    A ``CausalDataSetGenerator`` is built from ``all_macd`` (features) and
    ``returns`` (responses) with step size 100, window size 500, at least
    300 periods and a strict step size. Each dataset it yields gets its own
    freshly constructed pipeline: fill missing values with zero, stack the
    symbols into rows, fit a ``LinearRegression``, then unstack the symbols.

    Returns
    -------
    A ``pd.Series`` of fitted pipelines sorted by index. Each pipeline is
    keyed by the second element of the last index entry of its dataset's
    ``X``.
    """
    walk_forward = CausalDataSetGenerator(
        features=all_macd,
        responses=returns,
        step_size=100,
        window_size=500,
        min_periods=300,
        strict_step_size=True,
        causal_kwargs={"index_level": "start", "contemp": True},
    )

    fitted_by_key = {}
    for window in walk_forward:
        regression = SklearnEstimator(
            LinearRegression(fit_intercept=True, copy_X=True)
        )
        model = Pipeline(
            steps=[
                GenericStatelessTransformer(fill_zeros),
                GenericStatelessTransformer(stack),
                regression,
                GenericStatelessTransformer(unstack),
            ]
        )
        model.fit(window)
        # Key: second element of the last index tuple of the window's X.
        last_index_entry = window.X.index[-1]
        fitted_by_key[last_index_entry[1]] = model

    return pd.Series(fitted_by_key).sort_index()

def apply_model(models : pd.Series, data):
    """Apply each model to the slice of ``data`` that follows its index label.

    The labels in ``models.index`` are located in ``data.index`` with
    ``np.searchsorted`` (default left side). Model ``i`` is applied to the
    rows from its own position up to the next model's position, and the last
    model runs to the end of ``data``. Empty slices are skipped.

    Parameters
    ----------
    models
        Series of fitted objects exposing ``transform``, indexed by labels
        comparable with ``data.index``.
    data
        Frame of inputs, sliced positionally with ``iloc``.

    Returns
    -------
    The ``y`` attributes of the transformed datasets, concatenated
    row-wise in model order.

    Raises
    ------
    ValueError
        From ``pd.concat`` when every slice is empty, so there is nothing
        to concatenate.
    """
    starts = list(np.searchsorted(data.index, models.index))
    # Each slice ends where the next model starts; the last one runs to the
    # final row of ``data``.
    stops = starts[1:] + [data.shape[0]]

    predictions = []
    for position, (start, end) in enumerate(zip(starts, stops)):
        if start == end:
            continue
        print((start, end))
        window = Dataset(X=data.iloc[start:end], y=None)
        predictions.append(models.iat[position].transform(window).y)

    return pd.concat(predictions, axis=0)

# Task "fitted_models": runs fit_model on the MACD features and the
# risk-adjusted returns task (the name was rebound to the task object
# earlier). Completion is checked with an IrregularChecker rather than a
# calendar-based checker.
fitted_models = context.time_series_task(
    "fitted_models",
    fit_model,
    all_macd=all_macd,
    returns=risk_adjusted_returns,
    completion_checker=IrregularChecker()
)
fitted_models.run()

# Task "model_outputs": runs apply_model with the fitted models and the MACD
# features.
model_outputs = context.time_series_task(
    "model_outputs",
    apply_model,
    # The models are declared as an explicit Dependency with a lookback of
    # 200 business days and inherit_frequency=False; the exact semantics of
    # both options are defined by aika's Dependency — TODO confirm.
    models=Dependency(fitted_models, lookback=200 * BDay(), inherit_frequency=False),
    data=all_macd
)
model_outputs.run()
# Last expression of the cell, so the notebook shows the stored predictions.
model_outputs.read()

In [None]:
# Take copies of the stored outputs; the column relabelling below is applied
# to these copies.
o = model_outputs.read().copy()
# The "end" index level is removed from the returns here (presumably so the
# index lines up with that of the predictions — TODO confirm).
r = risk_adjusted_returns.read().copy().droplevel("end")

# Give both frames two-level columns: a top-level label ("Prediction" or
# "Returns") followed by the frame's original column label.
o.columns = pd.MultiIndex.from_tuples([("Prediction", symbol) for symbol in o.columns])
r.columns = pd.MultiIndex.from_tuples([("Returns", symbol) for symbol in r.columns])

# Predictions and returns side by side, aligned on the index.
results = pd.concat([o,r], axis=1)
results

In [None]:
# Stacking column level 1 moves the second column level into the row index,
# leaving "Prediction" and "Returns" as plain columns. Show their correlation
# matrix, then scatter one against the other.
display(results.stack(level=1).corr())
results.stack(level=1).plot.scatter(x="Prediction", y="Returns")

In [None]:
# Long-form comparison table with a "Prediction" and a "Return" column.
# `r` already lost its "end" index level when it was created, so dropping it
# again would raise. `o` and `r` also carry two-level columns by this point,
# so they cannot be passed as single columns to the DataFrame constructor.
# Instead, join them side by side and stack the second column level into the
# rows, as the cells above already do.
pd.concat([o, r], axis=1).stack(level=1).rename(columns={"Returns": "Return"})