# Create Datasets

Create an HDF5 table from SHARADAR data that mirrors the WIKI_PRICES.csv format.
This notebook combines SHARADAR_SEP.csv (price data) with SHARADAR_ACTIONS.csv
(dividend and split information) to produce a dataset compatible with the format used
in the ML4T examples.

In [14]:
import warnings
warnings.filterwarnings('ignore')

import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook-wide logging: INFO and above, timestamped.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define paths.
# The data root defaults to the original location; set the ALPACA_DATA_DIR
# environment variable to run the notebook against a different directory
# without editing this cell.
DATA_DIR = Path(os.environ.get('ALPACA_DATA_DIR', '/home/noslen/alpaca-trading/data'))
SHARADAR_DIR = DATA_DIR / 'SHARADAR'   # input CSVs (SHARADAR_SEP.csv, SHARADAR_ACTIONS.csv)
OUTPUT_FILE = DATA_DIR / 'assets.h5'   # HDF5 store written at the end of the notebook

### Load SHARADAR_SEP.csv price data

In [15]:
# Read the SHARADAR daily price file. 'date' is parsed to datetime and no
# column is promoted to the index, so rows keep the default integer index.
sep_path = SHARADAR_DIR.joinpath('SHARADAR_SEP.csv')
sep_df = pd.read_csv(sep_path, index_col=None, parse_dates=['date'])

# Preview the first rows and report how many were read.
print(sep_df.head())
print(f"loaded {len(sep_df)} rows from {sep_path}")

   ticker       date  open  high   low  close   volume  closeadj  closeunadj  \
0   ABILF 2021-11-09  0.30  0.33  0.30   0.33   7500.0      0.33        0.33   
1   ABILF 2021-11-08  0.35  0.35  0.35   0.35      0.0      0.35        0.35   
2     AAC 2021-09-24  9.74  9.75  9.73   9.75  38502.0      9.75        9.75   
3   AAC.U 2021-09-24  9.95  9.95  9.90   9.90   2692.0      9.90        9.90   
4  AAC.WS 2021-09-24  0.92  0.92  0.87   0.89  38784.0      0.89        0.89   

  lastupdated  
0  2021-11-09  
1  2021-11-09  
2  2021-09-24  
3  2021-09-24  
4  2021-09-24  
loaded 17300827 rows from /home/noslen/alpaca-trading/data/SHARADAR/SHARADAR_SEP.csv


### Load SHARADAR_ACTIONS.csv for dividend and split information

In [16]:
# Read the SHARADAR corporate-actions file. 'date' is parsed to datetime and
# no column is promoted to the index, so rows keep the default integer index.
actions_path = SHARADAR_DIR.joinpath('SHARADAR_ACTIONS.csv')
actions_df = pd.read_csv(actions_path, index_col=None, parse_dates=['date'])

# Preview the first rows and report how many were read.
print(actions_df.head())
print(f"loaded {len(actions_df)} rows from {actions_path}")


        date         action ticker                               name  \
0 2015-01-02       delisted   XWES         WORLD ENERGY SOLUTIONS INC   
1 2015-01-02  acquisitionby   XWES         WORLD ENERGY SOLUTIONS INC   
2 2015-01-02       dividend    WSR                    WHITESTONE REIT   
3 2015-01-02       dividend   WSCI                 WSI INDUSTRIES INC   
4 2015-01-02          split  WMLPQ  WESTMORELAND RESOURCE PARTNERS LP   

      value contraticker   contraname  
0  69.40000          NaN          NaN  
1  69.40000         ENOC  ENERNOC INC  
2   0.09500          NaN          NaN  
3   0.04000          NaN          NaN  
4   0.08333          NaN          NaN  
loaded 323840 rows from /home/noslen/alpaca-trading/data/SHARADAR/SHARADAR_ACTIONS.csv


### Extract dividend information from actions dataframe

In [17]:
# Keep only the action rows that record a dividend.
is_dividend = actions_df['action'] == 'dividend'
dividends = actions_df[is_dividend].copy()

# Reduce to one 'ex-dividend' amount per (date, ticker): the raw 'value'
# column is renamed, and same-day dividends for a ticker are summed.
dividend_df = (
    dividends[['date', 'ticker', 'value']]
    .rename(columns={'value': 'ex-dividend'})
    .groupby(['date', 'ticker'])
    .sum()
    .reset_index()
)

### Extract split information from actions dataframe

In [1]:
# Keep only the action rows that record a split.
is_split = actions_df['action'] == 'split'
splits = actions_df.loc[is_split].copy()

# Project to date, ticker and the split value, exposing 'value' as 'split_ratio'.
split_df = splits.loc[:, ['date', 'ticker', 'value']].rename(columns={'value': 'split_ratio'})

NameError: name 'actions_df' is not defined

## Transform SHARADAR data into WIKI_PRICES.csv format

WIKI_PRICES.csv columns:
ticker,date,open,high,low,close,volume,ex-dividend,split_ratio,adj_open,adj_high,adj_low,adj_close,adj_volume

SHARADAR_SEP.csv columns:
ticker,date,open,high,low,close,volume,closeadj,closeunadj,lastupdated

In [19]:
# Build the WIKI_PRICES-style frame from the SHARADAR price data.
#
# closeadj is carried along temporarily so that the adjusted columns below are
# computed from the rows of this frame itself. Taking them from sep_df by index
# label after the merges is only correct while the merges add no rows; a single
# repeated (ticker, date) key on the right-hand side would silently shift every
# adjusted value that follows it.
wiki_df = sep_df[['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'closeadj']].copy()

merge_keys = ['ticker', 'date']

# Dividends: attach the per-day amount; days without a dividend get 0.0.
if not dividend_df.empty:
    logger.info("Merging dividend information...")
    # Guarantee one row per key (same-day dividends are summed) so the left
    # merge cannot multiply price rows.
    dividend_by_day = dividend_df.groupby(merge_keys, as_index=False)['ex-dividend'].sum()
    wiki_df = wiki_df.merge(dividend_by_day, on=merge_keys, how='left', validate='many_to_one')
    wiki_df['ex-dividend'] = wiki_df['ex-dividend'].fillna(0.0)
else:
    wiki_df['ex-dividend'] = 0.0

# Splits: attach the per-day ratio; days without a split get 1.0.
if not split_df.empty:
    logger.info("Merging split information...")
    # Exact duplicate records are dropped first; any remaining distinct ratios
    # for the same ticker and day are compounded by multiplication.
    # NOTE(review): compounding assumes distinct same-day rows are separate
    # split events - confirm against the SHARADAR actions data.
    split_by_day = (
        split_df[merge_keys + ['split_ratio']]
        .drop_duplicates()
        .groupby(merge_keys, as_index=False)['split_ratio']
        .prod()
    )
    wiki_df = wiki_df.merge(split_by_day, on=merge_keys, how='left', validate='many_to_one')
    wiki_df['split_ratio'] = wiki_df['split_ratio'].fillna(1.0)
else:
    wiki_df['split_ratio'] = 1.0

# The merges above must neither add nor drop price rows.
assert len(wiki_df) == len(sep_df), "merging actions changed the number of price rows"

# Calculate adjusted values using the closeadj/close ratio from SHARADAR.
# A close of 0 is mapped to NaN first so the ratio becomes NaN instead of inf.
adj_ratio = wiki_df['closeadj'] / wiki_df['close'].replace(0, np.nan)

wiki_df['adj_open'] = wiki_df['open'] * adj_ratio
wiki_df['adj_high'] = wiki_df['high'] * adj_ratio
wiki_df['adj_low'] = wiki_df['low'] * adj_ratio
wiki_df['adj_close'] = wiki_df['closeadj']
wiki_df['adj_volume'] = wiki_df['volume']  # volume is copied through unadjusted

# closeadj was only needed to derive the adjusted columns; it is not part of
# the WIKI_PRICES column set.
wiki_df = wiki_df.drop(columns='closeadj')

# Set index to date and ticker for consistency with WIKI_PRICES format
wiki_df = wiki_df.set_index(['date', 'ticker']).sort_index()

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
wiki_df.info()
print(wiki_df.head())

2025-05-31 15:01:07,371 - INFO - Merging dividend information...
2025-05-31 15:01:17,331 - INFO - Merging split information...


: 

: 

In [None]:
# quandl_path = DATA_DIR / 'WIKI_PRICES.csv'
# quandl_df = (pd.read_csv(quandl_path,
#                  parse_dates=['date'],
#                  index_col=['date', 'ticker'],
#                  infer_datetime_format=True)
#      .sort_index())

In [None]:
# # Compare wiki_df and quandl_df for duplicate indexes
# print("Comparing wiki_df and quandl_df for duplicate indexes...")

# # Basic info about both dataframes
# print(f"wiki_df shape: {wiki_df.shape}")
# print(f"quandl_df shape: {quandl_df.shape}")

# # Get unique tickers and dates in both dataframes
# wiki_tickers = wiki_df.index.get_level_values('ticker').unique()
# wiki_dates = wiki_df.index.get_level_values('date').unique()
# quandl_tickers = quandl_df.index.get_level_values('ticker').unique()
# quandl_dates = quandl_df.index.get_level_values('date').unique()

# print(f"wiki_df unique tickers: {len(wiki_tickers)}")
# print(f"wiki_df date range: {wiki_dates.min()} to {wiki_dates.max()}")
# print(f"quandl_df unique tickers: {len(quandl_tickers)}")
# print(f"quandl_df date range: {quandl_dates.min()} to {quandl_dates.max()}")

# # Find common tickers and dates
# common_tickers = set(wiki_tickers).intersection(set(quandl_tickers))
# common_dates = set(wiki_dates).intersection(set(quandl_dates))

# print(f"Number of common tickers: {len(common_tickers)}")
# print(f"Number of common dates: {len(common_dates)}")

# # Check for duplicate indexes (date, ticker pairs)
# wiki_indexes = set(wiki_df.index.to_flat_index())
# quandl_indexes = set(quandl_df.index.to_flat_index())
# duplicate_indexes = wiki_indexes.intersection(quandl_indexes)

# print(f"Number of duplicate indexes (date, ticker pairs): {len(duplicate_indexes)}")

# # If there are duplicates, show a sample
# if len(duplicate_indexes) > 0:
#     print("\nSample of duplicate indexes:")
#     sample_size = min(5, len(duplicate_indexes))
#     sample_duplicates = list(duplicate_indexes)[:sample_size]
    
#     # Convert tuple indexes back to MultiIndex for easier comparison
#     sample_idx = pd.MultiIndex.from_tuples(sample_duplicates, names=['date', 'ticker'])
    
#     print("\nQuandl data for duplicates:")
#     print(quandl_df.loc[sample_idx])
    
#     print("\nWiki data for duplicates:")
#     print(wiki_df.loc[sample_idx])
    
#     # Compare values for duplicate indexes
#     print("\nComparing values for duplicate indexes...")
#     for idx in sample_duplicates:
#         quandl_row = quandl_df.loc[idx]
#         wiki_row = wiki_df.loc[idx]
        
#         # Compare common columns
#         common_cols = set(quandl_df.columns).intersection(set(wiki_df.columns))
#         for col in common_cols:
#             quandl_val = quandl_row[col]
#             wiki_val = wiki_row[col]
            
#             if isinstance(quandl_val, (float, int)) and isinstance(wiki_val, (float, int)):
#                 if not np.isclose(quandl_val, wiki_val, rtol=1e-5):
#                     print(f"Different values for {idx}, column {col}: Quandl={quandl_val}, Wiki={wiki_val}")
#             elif quandl_val != wiki_val:
#                 print(f"Different values for {idx}, column {col}: Quandl={quandl_val}, Wiki={wiki_val}")

In [None]:
# with pd.HDFStore(OUTPUT_FILE) as store:
#     store.put('quandl/wiki/prices', quandl_df)

In [None]:
# Persist the converted prices to the HDF5 store. pandas needs the optional
# PyTables dependency for this; without it HDFStore raises ImportError.
with pd.HDFStore(OUTPUT_FILE) as store:
    # Write the frame under the 'sharadar/prices' key.
    store.put('sharadar/prices', wiki_df)
    logger.info(f"Data saved to {OUTPUT_FILE} at path 'sharadar/prices'")

    # Log one line per key in the store, with its storer description.
    logger.info("HDF5 store contents:")
    stored_keys = store.keys()
    for item in stored_keys:
        logger.info(f"  {item}: {store.get_storer(item)}")

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.