We will only need pandas and numpy for this exercise

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

Read in the parquet file

In [2]:
# Load the daily per-ticker buy/sell volume and VWAP table from the working directory.
df = pd.read_parquet(path='retail_flow_daily.parquet')

Preview of the data

In [3]:
# Show the first five rows to confirm the columns loaded as expected.
print(df.head(n=5))

         date ticker  volume_buy  volume_sell  vwap_buy  vwap_sell
0  2016-01-04      A       28418        30233     40.60      40.54
1  2016-01-04    AAL      448009       437415     40.79      40.83
2  2016-01-04    AAP       45070        31598    152.29     152.02
3  2016-01-04   AAPL     3030463      2640489    103.91     103.84
4  2016-01-04   ABBV      307619       291025     57.25      57.28


Sort values by ticker, and then by date

In [4]:
# Order rows ticker by ticker, chronologically within each ticker, then renumber
# them 0..n-1 so later cells can reach earlier days by integer offset.
df = (
    df
    .sort_values(by=['ticker', 'date'])
    .reset_index(drop=True)
)

Create a column full of 1's. Its cumulative sum within each ticker gives the number of trading days seen so far, which lets us check whether enough history exists when we look back over the previous x trading days.

In [5]:
# Constant marker column: one per row, so summing it counts rows.
df = df.assign(ones=1)

Group by ticker and date, then take the cumulative sum of every column within each ticker

In [6]:
# Step 1: collapse to one row per (ticker, date), summing the numeric columns.
per_day_totals = df.groupby(['ticker', 'date']).sum()
# Step 2: running totals that restart at each ticker (index level 0).
df_grouped = per_day_totals.groupby(level=0).cumsum()

In [7]:
# Display the per-ticker running totals; pandas abbreviates the middle rows.
print(df_grouped)

                   volume_buy  volume_sell   vwap_buy  vwap_sell  ones
ticker date                                                           
A      2016-01-04       28418        30233      40.60      40.54     1
       2016-01-05       47714        74909      81.21      81.09     2
       2016-01-06       68063       103797     121.78     121.78     3
       2016-01-07      109157       142496     161.03     161.00     4
       2016-01-08      141344       168150     199.86     199.87     5
...                       ...          ...        ...        ...   ...
ZTS    2022-01-06    68433030     64925805  161743.90  161727.86  1515
       2022-01-07    68470994     64967654  161956.81  161940.81  1516
       2022-01-10    68516628     65009491  162167.53  162151.30  1517
       2022-01-11    68562959     65049639  162378.97  162362.67  1518
       2022-01-12    68603187     65085979  162591.86  162575.38  1519

[737378 rows x 5 columns]


In [8]:
# Discard the (ticker, date) index (drop=True) and renumber rows 0..n-1 so that
# earlier days can be reached by integer offset from a row's position.
df_grouped = df_grouped.reset_index(drop=True)

Factor 1: percent change in total buy volume over the last 20 trading days relative to the prior 20-trading-day period.

In [9]:
# Holds one Factor 1 value per row of df_grouped (None where history is too short).
factor1 = []

In [10]:
def _percent_change_between_windows(cumulative, day_number, window):
    """Percent change of the latest `window`-day total versus the `window` days before it.

    Parameters
    ----------
    cumulative : array-like
        Per-ticker running total of the quantity, rows ordered by ticker then date.
    day_number : array-like
        1-based trading-day count of each row within its ticker.
    window : int
        Length of each of the two compared spans, in trading days.

    Returns
    -------
    list
        One entry per row: (recent - prior) / prior, or None when the ticker has
        fewer than 2 * window days of history or the prior total is zero.
    """
    running = np.asarray(cumulative, dtype=float)
    days = np.asarray(day_number)

    def total_before(lag):
        # Running total `lag` rows earlier. Where that row would precede the
        # ticker's first day (day number <= lag) nothing had been traded yet,
        # so the total is zero rather than the previous ticker's value.
        earlier = np.zeros_like(running)
        earlier[lag:] = running[:-lag]
        earlier[days <= lag] = 0.0
        return earlier

    # A span's volume is the running total at its last day minus the running
    # total on the day just before its first day. For a 20-day span ending at
    # row i that is row i-20; subtracting row i-19 would leave only 19 days.
    one_window_ago = total_before(window)
    two_windows_ago = total_before(2 * window)
    recent = running - one_window_ago
    prior = one_window_ago - two_windows_ago

    # Undefined without two full windows, or when the baseline is zero
    # (division would give inf/NaN).
    defined = (days >= 2 * window) & (prior != 0)
    change = np.full(len(running), np.nan)
    change[defined] = (recent[defined] - prior[defined]) / prior[defined]

    return [value if ok else None
            for value, ok in zip(change.tolist(), defined.tolist())]


# Rebuilt from scratch on every run, so re-executing this cell never duplicates entries.
factor1 = _percent_change_between_windows(
    df_grouped['volume_buy'], df_grouped['ones'], window=20
)

Factor 2: Total buy volume over the last 20 trading days

In [11]:
# Holds one Factor 2 value per row of df_grouped (None where history is too short).
factor2 = []

In [12]:
def _trailing_window_total(cumulative, day_number, window):
    """Total of the last `window` trading days, derived from a per-ticker running total.

    Parameters
    ----------
    cumulative : array-like
        Per-ticker running total of the quantity, rows ordered by ticker then date.
    day_number : array-like
        1-based trading-day count of each row within its ticker.
    window : int
        Number of trading days to total, including the current row.

    Returns
    -------
    list
        One entry per row: the window total, or None when the ticker has fewer
        than `window` days of history.
    """
    running = np.asarray(cumulative)
    days = np.asarray(day_number)

    # Running total on the day just before the window starts, i.e. `window`
    # rows back. Subtracting the value only window-1 rows back would drop the
    # oldest day and total 19 days instead of 20.
    before_window = np.zeros_like(running)
    before_window[window:] = running[:-window]
    # On a ticker's first `window` days there is no earlier day for that
    # ticker, so nothing is subtracted (and the previous ticker's total is
    # never read).
    before_window[days <= window] = 0

    totals = running - before_window
    return [total if day >= window else None
            for total, day in zip(totals.tolist(), days.tolist())]


# Rebuilt from scratch on every run, so re-executing this cell never duplicates entries.
factor2 = _trailing_window_total(
    df_grouped['volume_buy'], df_grouped['ones'], window=20
)

Add factor 1, factor 2, and the trading-day count column to the original dataframe, since the third factor doesn't need the cumulative-sum data.

In [13]:
# Attach the per-row trading-day count and the first two factors as new columns.
# The Series is matched to df by index label; the lists are matched by position.
df = df.assign(
    trading_days=df_grouped['ones'],
    factor_1=factor1,
    factor_2=factor2,
)

In [14]:
# Display the frame with the trading_days and factor columns appended.
print(df)

              date ticker  volume_buy  volume_sell  vwap_buy  vwap_sell  ones  \
0       2016-01-04      A       28418        30233     40.60      40.54     1   
1       2016-01-05      A       19296        44676     40.61      40.55     1   
2       2016-01-06      A       20349        28888     40.57      40.69     1   
3       2016-01-07      A       41094        38699     39.25      39.22     1   
4       2016-01-08      A       32187        25654     38.83      38.87     1   
...            ...    ...         ...          ...       ...        ...   ...   
737373  2022-01-06    ZTS       51517        38158    218.14     217.93     1   
737374  2022-01-07    ZTS       37964        41849    212.91     212.95     1   
737375  2022-01-10    ZTS       45634        41837    210.72     210.49     1   
737376  2022-01-11    ZTS       46331        40148    211.44     211.37     1   
737377  2022-01-12    ZTS       40228        36340    212.89     212.71     1   

        trading_days  facto

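Factor 3: Cosine similarity between the daily buy volumes of the last 20 trading days (roughly one month) and those of the prior 20 trading days, signed by direction: positive when total buy volume rose or stayed flat, negative when it fell.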

In [15]:
# Holds one Factor 3 value per row of df (None where history is too short).
factor3 = []

In [16]:
for row in df.index:
    # Two full 20-day windows of history are required for this ticker.
    if not df['trading_days'][row] >= 40:
        factor3.append(None)
        continue

    # .loc slices include both endpoints, so each window holds exactly 20 rows.
    recent_window = df.loc[row - 19:row, 'volume_buy'].to_numpy()
    earlier_window = df.loc[row - 39:row - 20, 'volume_buy'].to_numpy()

    similarity = np.dot(recent_window, earlier_window) / (
        norm(recent_window) * norm(earlier_window)
    )

    # +1 when total buy volume rose or was unchanged versus the earlier window, else -1.
    volume_delta = np.sum(recent_window) - np.sum(earlier_window)
    sign = 1 if volume_delta >= 0 else -1

    factor3.append(similarity * sign)

In [17]:
# Attach the signed cosine-similarity values as a new column, matched by position.
df = df.assign(factor_3=factor3)

In [18]:
# Display the frame with all three factor columns appended.
print(df)

              date ticker  volume_buy  volume_sell  vwap_buy  vwap_sell  ones  \
0       2016-01-04      A       28418        30233     40.60      40.54     1   
1       2016-01-05      A       19296        44676     40.61      40.55     1   
2       2016-01-06      A       20349        28888     40.57      40.69     1   
3       2016-01-07      A       41094        38699     39.25      39.22     1   
4       2016-01-08      A       32187        25654     38.83      38.87     1   
...            ...    ...         ...          ...       ...        ...   ...   
737373  2022-01-06    ZTS       51517        38158    218.14     217.93     1   
737374  2022-01-07    ZTS       37964        41849    212.91     212.95     1   
737375  2022-01-10    ZTS       45634        41837    210.72     210.49     1   
737376  2022-01-11    ZTS       46331        40148    211.44     211.37     1   
737377  2022-01-12    ZTS       40228        36340    212.89     212.71     1   

        trading_days  facto