In [None]:
# List the packages already installed in the notebook environment.
!pip freeze
# Install pytimetk, the time-series toolkit imported as `tk` in the next cell.
!pip install pytimetk
# pyarrow is pinned to an exact version.
# NOTE(review): presumably for compatibility with the Snowflake tooling — confirm before changing the pin.
!pip install pyarrow==15.0.2
# fastparquet is imported in the next cell.
!pip install fastparquet

In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import numpy as np
import pytimetk as tk
# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import Session, Table
# Handle to the notebook's active Snowpark session; later cells use it for
# write_pandas() and file.put().
session = get_active_session()

# NOTE(review): `Session`, `Table`, `snowpark` and `fastparquet` are not referenced
# in the cells visible here — confirm they are unused before removing them.
import snowflake.snowpark as snowpark
import fastparquet

In the original session, the market data was stored in a table called `PRICES_TIMETK` and loaded using a `sql` chunk. For reproducibility, the next cell loads the data from the `pytimetk` library instead.

In [None]:
# In the LQG session this data was loaded from a database;
# for reproducibility we load the sample dataset from the library instead.
#
# The library dataset uses lower-case column names (note the 'date' column
# passed to parse_dates), while every later cell addresses the upper-case
# names 'SYMBOL', 'DATE' and 'ADJUSTED'. Upper-casing the columns here keeps
# the rest of the notebook runnable on a fresh kernel.
stocks_df = (
    tk.load_dataset("stocks_daily", parse_dates=['date'])
    .rename(columns=str.upper)
)

stocks_df.glimpse()

If we had loaded the data from a database, it would exist as a Snowflake (Snowpark) DataFrame. Because we want to work with pandas, we would first convert it using the `to_pandas()` method.

In [None]:
# Database path, kept for reference only (not executed): convert the
# Snowflake dataframe to pandas.
# stocks_df = stocks_from_db.to_pandas()

# date coercion (also part of the non-executed database path)
# stocks_df['DATE'] = pd.to_datetime(stocks_df['DATE'])


# Show a glimpse of the pandas frame that the remaining cells work with.
stocks_df.glimpse()

Let's create some quick plots using `plotly`.

In [None]:
# Adjusted price over time, one smoothed panel per symbol, rendered with the
# plotly engine.
(
    stocks_df
    .groupby('SYMBOL')
    .plot_timeseries(
        date_column='DATE',
        value_column='ADJUSTED',
        engine='plotly',
        facet_ncol=2,
        smooth=True,
        smooth_frac=0.10,
        width=600,
        height=400,
    )
)

### timetk built in rolling

Next let's create new columns to hold the 10, 50 and 200-day rolling average price. `timetk` makes it easy to add new features at scale. We will see later how to add many more features.

In [None]:
# Per symbol, append the 10-, 50- and 200-day rolling mean of ADJUSTED
# as three new columns.
sma_df = (
    stocks_df[['SYMBOL', 'DATE', 'ADJUSTED']]
    .groupby('SYMBOL')
    .augment_rolling(
        date_column='DATE',
        value_column='ADJUSTED',
        window_func=['mean'],
        window=[10, 50, 200],
        center=False,
    )
)

sma_df.glimpse()

Let's save the new pandas dataframe that holds our rolling mean to a Snowflake table in our database. This is a crucial step that isn't complicated. We call the `write_pandas()` method from Snowpark (which Mats will discuss soon). It's important because it has now opened up the world of Python feature engineering to us. My favorite time series library is `timetk`, yours is probably something different. 

In [None]:

# overwrite=True keeps this cell safe to re-run: with the default
# (overwrite=False) every execution appends the whole frame to the existing
# table again, duplicating rows.
session.write_pandas(
    sma_df,
    "stocks_10_50_200_sma_two",
    auto_create_table=True,
    overwrite=True,
)


In [None]:
# Zoom in on the rows dated on or before 2014-01-01.
sma_df.query('DATE <= "2014-01-01"')

In [None]:
# Price versus its 50- and 200-day rolling means from 2023 onward,
# one panel per symbol and one colored line per series.
(
    sma_df
    .query('DATE >= "2023-01-01"')  # zoom in on dates
    .melt(  # long format: the three series stack into variable/value columns
        id_vars=['SYMBOL', 'DATE'],
        value_vars=[
            "ADJUSTED",
            "ADJUSTED_rolling_mean_win_50",
            "ADJUSTED_rolling_mean_win_200",
        ],
    )
    .groupby("SYMBOL")
    .plot_timeseries(
        date_column='DATE',
        value_column='value',
        color_column='variable',
        engine="plotly",
        facet_ncol=2,
        smooth=False,
        width=700,
        height=400,
    )
)

In [None]:
# Period-over-period percentage change of ADJUSTED in wide format:
# one row per DATE, one column per SYMBOL.
returns_wide_df = (
    stocks_df
    .loc[:, ['SYMBOL', 'DATE', 'ADJUSTED']]
    .pivot(index='DATE', columns='SYMBOL', values='ADJUSTED')
    .pct_change()
    .reset_index()
    .iloc[1:]  # drop the first row: it has no previous price to compare against
)

returns_wide_df.head()

In [None]:
# Pairwise correlation of the return columns; DATE is dropped first because
# it is not a return series.
corr_table_df = (
    returns_wide_df
    .drop(columns=['DATE'])
    .corr()
)

corr_table_df

What if we wish to convert to a csv and save to a stage?

In [None]:
# Rebind instead of reset_index(inplace=True): the frame an earlier cell
# displayed is no longer mutated in place, and the resulting values are the same.
returns_wide_df = returns_wide_df.reset_index(drop=True)

MY_STAGE = "PERMANENT_STAGE"
MY_FILE_NAME = "wide_df.csv"

# Write the wide returns table to a local CSV file, without the index column.
returns_wide_df.to_csv(MY_FILE_NAME, index=False)

# Upload the file to a stage.
put_result = session.file.put(
    MY_FILE_NAME,
    MY_STAGE,
    auto_compress=False,
    overwrite=True,
)
# Status reported for the (single) uploaded file.
put_result[0].status

In [None]:
# Long format: one row per (DATE, symbol) with the value stored in `returns`.
# No var_name is passed, so melt falls back to the columns-axis name, which
# the earlier pivot(columns='SYMBOL') left as 'SYMBOL'.
returns_long_df = pd.melt(
    returns_wide_df,
    id_vars='DATE',
    value_name='returns',
)

returns_long_df

Now let's revisit calculating rolling features. This time we will create 7 new rolling 90-day features for each of our symbols.

In [None]:
# Seven rolling statistics of `returns` over a 90-row window, per symbol.
# The two quartiles have no built-in name, so they are supplied as
# (name, function) pairs. Rows containing any NaN are dropped afterwards.
rolling_stats_df = (
    returns_long_df
    .groupby('SYMBOL')
    .augment_rolling(
        date_column='DATE',
        value_column='returns',
        window=[90],
        window_func=[
            'mean',
            'std',
            'min',
            ('q25', lambda values: np.quantile(values, 0.25)),
            'median',
            ('q75', lambda values: np.quantile(values, 0.75)),
            'max',
        ],
    )
    .dropna()
)

rolling_stats_df

### Streamlit in Notebooks

Next, let's look at the power of using Streamlit inside Snowflake notebooks. Streamlit is an open-source Python framework for creating interactive visualizations. We have natively integrated it into Snowflake, so you can build apps with the click of a button or integrate it directly into a notebook.

In [None]:
import altair as alt

In [None]:
# Symbol picker rendered as a Streamlit widget.
chosen_stock = st.selectbox(
    "Choose a Symbol",
    rolling_stats_df['SYMBOL'].unique(),
)

# Keep only the rows of the selected symbol.
df = rolling_stats_df[rolling_stats_df['SYMBOL'] == chosen_stock]

st.header('Rolling 90-Day Mean Returns: {}'.format(chosen_stock))

# Line chart of the 90-window rolling mean of returns against DATE.
actuals = (
    alt.Chart(df)
    .mark_line(color='darkgreen')
    .encode(
        x=alt.X('DATE', title=None),
        y=alt.Y('returns_rolling_mean_win_90'),
    )
)

st.altair_chart(actuals, theme=None, use_container_width=True)

In [None]:
# Stack every non-id column of the rolling-statistics frame into
# (statistic, value) pairs, keeping SYMBOL and DATE as identifiers.
rolling_stats_long_df = pd.melt(
    rolling_stats_df,
    id_vars=["SYMBOL", "DATE"],
    var_name="statistic",
)

rolling_stats_long_df

In [None]:
# Explicit keys keep these widgets distinct from the symbol picker in the
# earlier Streamlit cell, which uses the same label and the same options.
chosen_stock = st.selectbox(
    "Choose a Symbol",
    rolling_stats_long_df['SYMBOL'].unique(),
    key="rolling_stats_long_symbol",
)
# The second picker selects a statistic, so it is labelled accordingly.
chosen_stat = st.selectbox(
    "Choose a Statistic",
    rolling_stats_long_df['statistic'].unique(),
    key="rolling_stats_long_statistic",
)

# One combined mask, rather than chaining a second .loc whose mask was built
# on the unfiltered frame.
df = rolling_stats_long_df.loc[
    (rolling_stats_long_df['SYMBOL'] == chosen_stock)
    & (rolling_stats_long_df['statistic'] == chosen_stat)
]

# The header names whichever statistic is selected instead of always
# claiming "Std Dev".
st.header('{}: {}'.format(chosen_stat, chosen_stock))

# Line chart of the selected statistic's values against DATE.
actuals = alt.Chart(df).mark_line(color='blue').encode(
    x = alt.X('DATE', title = None),
    y = alt.Y('value'))

st.altair_chart(actuals, theme=None, use_container_width=True)

In [None]:
# Attach the whole wide returns table to every (DATE, SYMBOL) row, then stack
# the wide columns so each row carries one comparison series:
# `comp` holds the column name and `returns_comp` its value on that date.
return_combinations_long_df = (
    returns_long_df
    .merge(returns_wide_df, on='DATE', how='left')
    .melt(
        id_vars=['DATE', 'SYMBOL', 'returns'],
        var_name="comp",
        value_name="returns_comp",
    )
)

In [None]:
# Rolling 90-window correlation between `returns` and `returns_comp` for every
# (SYMBOL, comp) pair. Self-pairs are removed first, and rows whose
# correlation is null are removed at the end.
return_corr_df = (
    return_combinations_long_df
    .query('SYMBOL != comp')
    .groupby(["SYMBOL", "comp"])
    .augment_rolling_apply(
        date_column="DATE",
        window=90,
        window_func=[
            ('corr', lambda window_df: window_df['returns'].corr(window_df['returns_comp'])),
        ],
        threads=1,  # Change to -1 to use all available cores
    )
    .query('rolling_corr_win_90.notnull()')
)




In [None]:
# Label each pair as "<SYMBOL>_<comp>". .assign() builds a new frame rather
# than writing a column into the filtered result of the earlier .query(),
# which can trigger pandas' SettingWithCopyWarning.
return_corr_df = return_corr_df.assign(
    name=lambda pairs: pairs['SYMBOL'] + '_' + pairs['comp'].astype(str)
)


In [None]:
# Display the rolling correlations. Self-pairs (SYMBOL == comp) and null
# correlations were already filtered out when the frame was built.
return_corr_df

In [None]:
# Assume Market Returns = Equal Weight Portfolio.
# mean(axis=1) averages over the symbols that actually have a return on a
# given date. The previous sum * (1 / n_columns) form skipped NaNs in the sum
# but still divided by every column, understating the market return whenever
# a symbol was missing.
market_returns_df = (
    returns_wide_df
    .set_index("DATE")
    .assign(returns_market=lambda df: df.mean(axis=1))
    .reset_index()
    .loc[:, ['DATE', 'returns_market']]
)

# Merge with returns long: every (DATE, SYMBOL) row gains that date's market return.
returns_long_market_df = returns_long_df.merge(
    market_returns_df,
    how='left',
    on='DATE',
)

returns_long_market_df

### Rolling Regression

Next we run a rolling regression against the "market portfolio" we created. We will see later how this workflow could use the model registry to scale to millions of regression runs.

In [None]:
def regression(df):
    """Fit a linear regression of ``returns`` on ``returns_market`` for one window.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single window; must provide the ``returns_market`` and
        ``returns`` columns.

    Returns
    -------
    pandas.Series
        The fitted coefficients, indexed by ``'Intercept'`` and ``'Slope'``.
    """
    # The import lives inside the function body.
    # NOTE(review): presumably so the function stays self-contained when the
    # rolling apply runs it in parallel workers — confirm.
    from sklearn.linear_model import LinearRegression

    market = df[['returns_market']]  # independent variable, kept 2-D for fit()
    asset = df['returns']  # dependent variable
    fitted = LinearRegression().fit(market, asset)

    return pd.Series(
        [fitted.intercept_, fitted.coef_[0]],
        index=['Intercept', 'Slope'],
    )

# Apply `regression` to every 90-row window of each symbol; rows with any NaN
# are dropped afterwards.
return_regression_df = (
    returns_long_market_df
    .groupby('SYMBOL')
    .augment_rolling_apply(
        date_column="DATE",
        window=90,
        window_func=[('regression', regression)],
        threads=1,
    )
    .dropna()
)

return_regression_df

In [None]:
# Each cell of 'rolling_regression_win_90' holds a Series; lining them up
# side by side and transposing gives one row per window, one column per coefficient.
intercept_slope_df = pd.concat(
    return_regression_df['rolling_regression_win_90'].to_list(),
    axis=1,
).transpose()

# Re-use the source frame's index so the column-wise concat below aligns row for row.
intercept_slope_df.index = return_regression_df.index

return_beta_df = pd.concat(
    [return_regression_df, intercept_slope_df],
    axis=1,
)

return_beta_df