In [1]:
# Bloomberg recommends %package, which installs via conda/mamba. I use %pip instead.
# The two are fairly similar. %pip is faster and pulls from PyPI, which is more standard and up to date.
# %package pulls from conda-forge, which also includes non-Python resources. The difference doesn't really matter for *most* purposes.

#%package install plotly
#%pip install scikit-learn
#%pip install xgboost

import bql
import plotly.express as px


In [2]:
import bql
# Open a BQL session and request px_last for IBM over the window -29d..0d.
# NOTE(review): fill=prev presumably carries the previous value into gaps so
# every calendar day gets a row -- confirm against the BQL documentation.
bql_svc = bql.Service()
query = """
    get(
      px_last
    ) for(
      'IBM US Equity'
    ) with(
      dates=range(-29d, 0d),
      fill=prev
    )
"""
response = bql_svc.execute(query)
base_df = bql.combined_df(response)

# Regressors need a numeric feature, not a timestamp, so store each DATE as
# its proleptic-Gregorian ordinal once here and reuse it in every model cell.
base_df["date_ordinal"] = base_df["DATE"].apply(lambda stamp: stamp.toordinal())


In [3]:
# Baseline view: the raw px_last series plotted against DATE.

px.line(data_frame=base_df, x="DATE", y="px_last")

In [4]:
import pandas as pd

# Build a separate frame (base_df itself is left untouched) that carries a
# 3-day simple moving average of px_last next to the raw price.
# The first two rows of sma_3day are NaN because the window is not yet full.
df_withavgs = base_df.assign(
    sma_3day=base_df["px_last"].rolling(window=3).mean(),
)

px.line(df_withavgs, x="DATE", y=["px_last", "sma_3day"])

In [5]:
# This example uses scikit-learn (also called: sklearn) to perform a simple linear
# regression. This pattern of fitting a model, and then predicting, unlocks a lot of other tools.
# You'll see xgboost uses the same flow.
# This model uses the entire date range, with no train/test split.

from sklearn.linear_model import LinearRegression

linear_model_full = LinearRegression()
X = df_withavgs[['date_ordinal']]  # double brackets keep X as a 2-D DataFrame
y = df_withavgs['px_last']

# Fit and predict on the very same rows: this shows the fitted trend line,
# not out-of-sample performance.
linear_model_full.fit(X, y)
df_withavgs["px_last_pred_fulltrain"] = linear_model_full.predict(X)

# sma_3day is already on df_withavgs from the moving-average cell, so it is
# plotted as-is rather than being recomputed here.
px.line(df_withavgs, x="DATE", y=["px_last", "sma_3day", "px_last_pred_fulltrain"])

In [6]:
# Last example wasn't very interesting. We fit the model to the entirety of the data,
# telling us little about whether the model is useful or not.
#
# Instead, let's split the data into a "Train/Test" split. Being time series, we'll
# train on the first 3 weeks (21 days) and forecast (test) the remaining 9 days.

train_df = base_df.iloc[:21]
# .copy() makes test_df an independent frame; adding a column to a bare slice
# of base_df raises pandas' SettingWithCopyWarning.
test_df = base_df[["DATE", "date_ordinal"]].iloc[21:].copy()

linear_model_split = LinearRegression()
X_train = train_df[["date_ordinal"]]
y_train = train_df["px_last"]

linear_model_split.fit(X_train, y_train)

# Predict only on the held-out rows the model has never seen.
X_test = test_df[["date_ordinal"]]
test_df["px_last_pred_split"] = linear_model_split.predict(X_test)

# Write the forecasts back onto the shared plotting frame; rows outside the
# test window get NaN in this column, so the line only appears over the test dates.
df_withavgs.loc[test_df.index, 'px_last_pred_split'] = test_df['px_last_pred_split']

px.line(df_withavgs, x="DATE", y=["px_last", "px_last_pred_fulltrain", "px_last_pred_split"])

In [7]:
# But, what about the future?
# Using the full 30 days, let's predict 14 days into the future
import datetime

future_range = 14  # days

# range() excludes its stop value, so the stop must be "last + future_range + 1"
# to produce exactly `future_range` new rows (stopping at "last + future_range"
# would yield only 13).
last_ordinal = base_df['date_ordinal'].max()
last_index = base_df.index.max()
df_future = pd.DataFrame(
    {"date_ordinal": range(last_ordinal + 1, last_ordinal + future_range + 1)},
    index=range(last_index + 1, last_index + future_range + 1),
)

# df_future holds only the date_ordinal column at this point, which is the
# single feature linear_model_full was fitted on.
X_future_pred = linear_model_full.predict(df_future)

df_future["px_last_pred_future"] = X_future_pred
# Convert ordinals back to datetimes so the future rows share the DATE axis.
df_future["DATE"] = pd.to_datetime(df_future['date_ordinal'].apply(datetime.date.fromordinal))

# Historical rows have no px_last_pred_future and future rows have no px_last;
# concat leaves those cells NaN, so each line is drawn only where it has data.
df_with_future = pd.concat([df_withavgs, df_future])

px.line(df_with_future, x="DATE", y=["px_last", "px_last_pred_fulltrain", "px_last_pred_future"])

In [8]:
# Let's put what we've learned together and do the same thing with xgboost

import xgboost as xgb

# Same training window as the linear split model: train_df and X_test are
# the frames built in the train/test split cell.
X_train = train_df[["date_ordinal"]]
y_train = train_df["px_last"]

# Same fit -> predict flow as scikit-learn, just with a boosted-tree regressor.
model = xgb.XGBRegressor(
    n_estimators=5,
    learning_rate=0.10,
    objective="reg:squarederror",
)
model.fit(X_train, y_train)

# Fresh copy of the raw data for plotting; predictions are written only on
# the held-out rows, so the column is NaN over the training window.
xg_df = base_df.copy()
xg_df.loc[X_test.index, "xgpredicted_px_last"] = model.predict(X_test)

px.line(xg_df, x="DATE", y=["px_last", "xgpredicted_px_last"])
