In [99]:
import numpy as np
import pandas as pd
import yfinance as yf
import pandas_ta as ta
import datetime
from data_and_research import ac
import talib


In [101]:
len(ac.get_library('us_equities').list_symbols())

4642

In [21]:
import warnings

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning, message="The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.")


##  US Stock Data Worklfow
### 1) Split symbols by sector
### 2) Calculate sector ranks
### 3) Store each sector group in a sector table in ArcticDB, e.g. us_equities/sectorname
### 4) Store each stock individually in us_equities/symbol
### 5) Create a sector overview table in ArcticDB that tracks relative strength of sectors (short-term) and (long-term) - ArcticDBs aggregate function can help

In [2]:
univ = ac.get_library('univ', create_if_missing=True)
univ_df = univ.read('us_equities',columns=['Symbol','Name','Sector','Market Cap']).data

symbols = univ_df['Symbol'].unique().tolist()
names = univ_df.Name.to_list()
sectors = univ_df.Sector.to_list()

lib = ac.get_library('us_equities', create_if_missing=True)


In [3]:
# Create a dictionary mapping Symbols to Columns we want to keep
symbol_to_name = dict(zip(univ_df["Symbol"], univ_df["Name"]))
symbol_to_sector = dict(zip(univ_df["Symbol"], univ_df["Sector"]))
symbol_to_mktcap = dict(zip(univ_df["Symbol"], univ_df["Market Cap"]))

In [45]:
# We start with one sector "Health technology" as example 
# and then later generalize it in a loop
# for sector in univ_df['Sector].unique().tolist():

sector = 'Health technology'
sdf = univ_df[univ_df.Sector == sector]
sector_symbols = sdf.Symbol.to_list()

if 'us_equities' not in lib.list_symbols():
    data = yf.download(sector_symbols, group_by="Ticker", period="max", auto_adjust=True)
else:
    end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(days=560)  # Approximately 1.5 years
    data = yf.download(sector_symbols, group_by="Ticker", start=start_date, end=end_date, auto_adjust=True)

df = data.stack(level=0).rename_axis(['Date', 'Symbol']).reset_index(level=1)
df = df.sort_values(by='Symbol',axis='index',kind='stable')

# Insert Name, Sector, MktCap columns
df["Name"] = df["Symbol"].map(symbol_to_name)
df['Sector'] = df['Symbol'].map(symbol_to_sector)
df['Market Cap'] = df['Symbol'].map(symbol_to_mktcap)





[*********************100%***********************]  896 of 896 completed


In [94]:
# Calculating technical indicators

# Performance and Relative Strength Rank + RS Rank Moving Average for a Trend Filter
df['1M'] = df.groupby('Symbol')['Close'].pct_change(21)
df['3M'] = df.groupby('Symbol')['Close'].pct_change(63)
df['6M'] = df.groupby('Symbol')['Close'].pct_change(126)
df['12M'] = df.groupby('Symbol')['Close'].pct_change(252)
df['RS IBD'] = 2*df['3M']+df['6M']+df['12M'] # IBD Relative Strength =  2x 3M + 1x 6M + 1x 12M
df['RS Rank'] = df.groupby(df.index)['RS IBD'].rank(pct=True)
df["RS Rank 20D MA"] = df.groupby("Symbol")["RS Rank"].rolling(window=20).mean().reset_index(level=0, drop=True)

# Calculate EMAs
df["20D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=20))
df["50D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=50))
df["200D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=200))

df['ATR'] = df.groupby('Symbol').apply(lambda group: talib.ATR(group['High'], group['Low'], group['Close'], timeperiod=20)).shift(-1).reset_index(level=0, drop=True)
df['STD'] = df.groupby('Symbol')['Close'].rolling(window=20).std().reset_index(level=0, drop=True)

df['KC_Upper'] = df.groupby('Symbol').apply(lambda x: x['20D_EMA'] + (x['ATR'] * 1.5)).reset_index(level=0, drop=True)  # Upper Keltner Channel
df['KC_Lower'] = df.groupby('Symbol').apply(lambda x: x['20D_EMA'] - (x['ATR'] * 1.5)).reset_index(level=0, drop=True)  # Lower Keltner Channel

df['DC_Upper'] = df.groupby('Symbol')['High'].rolling(window=20).max().reset_index(level=0, drop=True)  # Upper Donchian Channel
df['DC_Lower'] = df.groupby('Symbol')['Low'].rolling(window=20).min().reset_index(level=0, drop=True)  # Lower Donchian Channel

# Calculate the Upper and Lower Bands
df['BB_Upper'] = df.groupby('Symbol').apply(lambda x: x['20D_EMA'] + (x['STD'] * 2)).reset_index(level=0, drop=True)
df['BB_Lower'] = df.groupby('Symbol').apply(lambda x: x['20D_EMA'] - (x['STD'] * 2)).reset_index(level=0, drop=True)

# Daily Returns for later aggregation & comparing among sectors
df['log_ret_1d'] = np.log(df.groupby('Symbol')['Close'].shift(-1)/df.groupby('Symbol')['Close'])
df['1d'] = df.groupby('Symbol')['Close'].pct_change(1)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (896, 2) + inhomogeneous part.

In [None]:
# Store the data in ArcticDB
lib.write(f'us_equities/{sector}', df)

In [55]:
# Store each symbol individually
for symbol in sector_symbols:
    symbol_data = df[df.Symbol == symbol]
    lib.write(f'us_equities/{symbol}', symbol_data)

In [58]:
df[df.Symbol == "AAPL"]

Price,Symbol,Close,High,Low,Open,Volume,Name,Sector,Market Cap,1M,3M,6M,12M,RS IBD,RS Rank,RS Rank 20D MA,20D_EMA,50D_EMA,200D_EMA,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1


In [50]:
test = df[df.Symbol == "AAPL"]

# Calculate ATR using pandas_ta
test['ATR'] = ta.atr(df['High'], df['Low'], df['Close'], length=14, mamode='ema')
test


Price,Symbol,Close,High,Low,Open,Volume,Name,Sector,Market Cap,1M,3M,6M,12M,RS IBD,RS Rank,RS Rank 20D MA,20D_EMA,50D_EMA,200D_EMA,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-02-16 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,
2018-02-20 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,
2018-02-21 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,
2018-02-22 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,
2018-02-23 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-09-06 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,0.343982
2024-09-09 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,0.327983
2024-09-10 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,0.325270
2024-09-11 00:00:00+00:00,,,,,,,,,,,,,,,,,,,,0.317751


In [22]:
# Performance and Relative Strength Rank + RS Rank Moving Average for a Trend Filter
df['1M'] = df.groupby('Symbol')['Close'].pct_change(21)
df['3M'] = df.groupby('Symbol')['Close'].pct_change(63)
df['6M'] = df.groupby('Symbol')['Close'].pct_change(126)
df['12M'] = df.groupby('Symbol')['Close'].pct_change(252)
df['RS IBD'] = 2*df['3M']+df['6M']+df['12M'] # IBD Relative Strength =  2x 3M + 1x 6M + 1x 12M
df['RS Rank'] = df.groupby(df.index)['RS IBD'].rank(pct=True)
df["RS Rank 20D MA"] = df.groupby("Symbol")["RS Rank"].rolling(window=20).mean().reset_index(level=0, drop=True)

# Calculate EMAs
df["20D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=20))
df["50D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=50))
df["200D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=200))

In [24]:
# Calculating technical indicators

# Performance and Relative Strength Rank + RS Rank Moving Average for a Trend Filter
df['1M'] = df.groupby('Symbol')['Close'].pct_change(21)
df['3M'] = df.groupby('Symbol')['Close'].pct_change(63)
df['6M'] = df.groupby('Symbol')['Close'].pct_change(126)
df['12M'] = df.groupby('Symbol')['Close'].pct_change(252)
df['RS IBD'] = 2*df['3M']+df['6M']+df['12M'] # IBD Relative Strength =  2x 3M + 1x 6M + 1x 12M
df['RS Rank'] = df.groupby(df.index)['RS IBD'].rank(pct=True)
df["RS Rank 20D MA"] = df.groupby("Symbol")["RS Rank"].rolling(window=20).mean().reset_index(level=0, drop=True)

# Calculate EMAs
df["20D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=20))
df["50D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=50))
df["200D_EMA"] = df.groupby("Symbol")["Close"].transform(lambda x: ta.ema(x, length=200))

# Calculate ATR
df['ATR'] = df.groupby("Symbol").apply(lambda x: ta.atr(x['High'], x['Low'], x['Close'], length=20)).reset_index(level=0, drop=True)

# # Calculate Bollinger Bands
# bollinger = df.groupby("Symbol").apply(lambda x: ta.bbands(x['Close'], length=20)).reset_index(level=0, drop=True)
# df['BB_Lower'], df['BB_Middle'], df['BB_Upper'] = bollinger['BBL_20_2.0'], bollinger['BBM_20_2.0'], bollinger['BBU_20_2.0']

# Calculate Keltner Channel
# keltner = df.groupby("Symbol").apply(lambda x: ta.kc(x['High'], x['Low'], x['Close'], length=20)).reset_index(level=0, drop=True)
# df['KC_Lower'], df['KC_Middle'], df['KC_Upper'] = keltner.iloc[:, 0], keltner.iloc[:, 1], keltner.iloc[:, 2]

# # Calculate Donchian Channel
# donchian = df.groupby("Symbol").apply(lambda x: ta.donchian(x['High'], x['Low'], lower_length=20, upper_length=20)).reset_index(level=0, drop=True)
# df['DC_Lower'], df['DC_Middle'], df['DC_Upper'] = donchian['DCL_20_20'], donchian['DCM_20_20'], donchian['DCU_20_20']






In [40]:
testdata = yf.download(["AAPL", "MSFT"], group_by="ticker", period="max", auto_adjust=True)




[*********************100%***********************]  2 of 2 completed


In [41]:
df = testdata.stack(level=0).rename_axis(['Date', 'Symbol']).reset_index(level=1)
df

Price,Symbol,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12 00:00:00+00:00,AAPL,0.100764,0.101203,0.100764,0.100764,469033600.0
1980-12-15 00:00:00+00:00,AAPL,0.095508,0.095946,0.095508,0.095946,175884800.0
1980-12-16 00:00:00+00:00,AAPL,0.088498,0.088936,0.088498,0.088936,105728000.0
1980-12-17 00:00:00+00:00,AAPL,0.090688,0.091126,0.090688,0.090688,86441600.0
1980-12-18 00:00:00+00:00,AAPL,0.093317,0.093755,0.093317,0.093317,73449600.0
...,...,...,...,...,...,...
2024-09-10 00:00:00+00:00,MSFT,414.200012,416.329987,407.700012,408.200012,19594300.0
2024-09-11 00:00:00+00:00,AAPL,222.660004,223.089996,217.889999,221.460007,44587100.0
2024-09-11 00:00:00+00:00,MSFT,423.040009,423.989990,409.579987,415.500000,19266900.0
2024-09-12 00:00:00+00:00,AAPL,222.770004,223.539993,219.820007,222.300003,37376315.0


In [43]:
# Calculate Bollinger Bands for each symbol
def calculate_bbands(group):
    bb = ta.bbands(group['Close'], length=20, std=2)
    return pd.DataFrame({
        'BB_Lower': bb['BBL_20_2.0'],
        'BB_Upper': bb['BBU_20_2.0'],
        'BB_Bandwidth': bb['BBB_20_2.0']
    })

# Apply the Bollinger Bands calculation to each symbol
bbands = df.groupby("Symbol").apply(calculate_bbands).reset_index(level=0, drop=True)



To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  bbands = df.groupby("Symbol").apply(calculate_bbands).reset_index(level=0, drop=True)


In [44]:
bbands

Unnamed: 0,BB_Lower,BB_Upper,BB_Bandwidth
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
20727,402.825401,426.146602,5.626535
20728,218.915230,230.449771,5.133707
20729,402.700546,427.174457,5.898216
20730,219.058150,230.411851,5.052039


In [33]:
# Download AAPL data
test = yf.download("AAPL", period="max", auto_adjust=True)

# Calculate Bollinger Bands using pandas_ta
bb = ta.bbands(test['Close'], length=20, std=2)

test['BB_Lower'], test['BB_Upper'], test['BB_Bandwidth'] = bb['BBL_20_2.0'], bb['BBU_20_2.0'], bb['BBB_20_2.0']
test


[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Volume,BB_Lower,BB_Upper,BB_Bandwidth
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1980-12-12,0.100764,0.101203,0.100764,0.100764,469033600,,,
1980-12-15,0.095946,0.095946,0.095508,0.095508,175884800,,,
1980-12-16,0.088936,0.088936,0.088498,0.088498,105728000,,,
1980-12-17,0.090688,0.091126,0.090688,0.090688,86441600,,,
1980-12-18,0.093317,0.093755,0.093317,0.093317,73449600,,,
...,...,...,...,...,...,...,...,...
2024-09-06,223.949997,225.240005,219.770004,220.820007,48423000,217.075214,231.425787,6.399350
2024-09-09,220.820007,221.270004,216.710007,220.910004,67180000,218.107102,230.860899,5.681383
2024-09-10,218.919998,221.479996,216.729996,220.110001,51591000,218.717832,230.508169,5.249178
2024-09-11,221.460007,223.089996,217.889999,222.660004,44587100,218.915230,230.449771,5.133707


In [32]:
test

Unnamed: 0_level_0,Open,High,Low,Close,Volume,BB_Lower,BB_Upper,BB_Bandwidth
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1980-12-12,0.100764,0.101203,0.100764,0.100764,469033600,,,
1980-12-15,0.095946,0.095946,0.095508,0.095508,175884800,,,
1980-12-16,0.088936,0.088936,0.088498,0.088498,105728000,,,
1980-12-17,0.090688,0.091126,0.090688,0.090688,86441600,,,
1980-12-18,0.093317,0.093755,0.093317,0.093317,73449600,,,
...,...,...,...,...,...,...,...,...
2024-09-06,223.949997,225.240005,219.770004,220.820007,48423000,217.075214,231.425787,6.399350
2024-09-09,220.820007,221.270004,216.710007,220.910004,67180000,218.107102,230.860899,5.681383
2024-09-10,218.919998,221.479996,216.729996,220.110001,51591000,218.717832,230.508169,5.249178
2024-09-11,221.460007,223.089996,217.889999,222.660004,44587100,218.915230,230.449771,5.133707


In [None]:
df = data.stack(level=0).rename_axis(['Date', 'Symbol']).reset_index(level=1)

In [None]:
df = df.sort_values(by='Symbol',axis='index',kind='stable')

In [None]:
# univ_df = univ_df.set_index('Symbol',drop=True)

In [None]:
# Create a dictionary mapping Symbols to Names
symbol_to_name = dict(zip(univ_df["Symbol"], univ_df["Name"]))
symbol_to_sector = dict(zip(univ_df["Symbol"], univ_df["Sector"]))

# Use the map function to map the Name values based on the Symbol column


df["Name"] = df["Symbol"].map(symbol_to_name)
df['Sector'] = df['Symbol'].map(symbol_to_sector)

In [None]:
df["20D_SMA"] = df.groupby("Symbol")["Close"].rolling(window=20).mean().reset_index(level=0, drop=True)
df["50D_SMA"] = df.groupby("Symbol")["Close"].rolling(window=50).mean().reset_index(level=0, drop=True)
df["200D_SMA"] = df.groupby("Symbol")["Close"].rolling(window=200).mean().reset_index(level=0, drop=True)
#df['ATR'] = df.groupby('Symbol').apply(lambda group: talib.ATR(group['High'], group['Low'], group['Close'], timeperiod=20)).reset_index(level=0, drop=True)
df['1M'] = df.groupby('Symbol')['Close'].pct_change(21)
df['3M'] = df.groupby('Symbol')['Close'].pct_change(63)
df['6M'] = df.groupby('Symbol')['Close'].pct_change(126)
df['12M'] = df.groupby('Symbol')['Close'].pct_change(252)
df['RS IBD'] = 2*df['3M']+df['6M']+df['12M'] # IBD Relative Strength =  2x 3M + 1x 6M + 1x 12M
df['RS Rank'] = df.groupby(df.index)['RS IBD'].rank(pct=True)
df["RS Rank 20D MA"] = df.groupby("Symbol")["RS Rank"].rolling(window=20).mean().reset_index(level=0, drop=True)

In [None]:
df[(df.index == "2024-09-11") & (df['RS Rank 20D MA'] > df['RS Rank'])].sort_values('RS Rank').dropna()

In [None]:
univ_df

In [None]:
ac