In [None]:
####CREATE TICKER SPECIFIC FILES

import os
import polars as pl
import glob
import gzip
from concurrent.futures import ThreadPoolExecutor, as_completed

# Function to process a single file
def process_file(file_path, ticker, year_month, output_dir):
    with gzip.open(file_path, 'rb') as f:
        df = pl.read_csv(f)
    df_filtered = df.filter(df['ticker'] == ticker)
    output_file = os.path.join(output_dir, f"{year_month}_{ticker}.csv")
    # Since writing to the same file from multiple threads is not safe, we return the filtered DataFrame
    return df_filtered, output_file

# Specify the directory containing the data files and other parameters
data_dir = "/Users/brandon/Documents/polygon_data/minute_aggs/"
tickers = ["NVDA","MSFT", "TSM", "AAPL","FICO"]  # List of tickers to process
tickers = ["CRWD", "NDVX", "GOOG", "META"]  # List of tickers to process

# Configure the maximum number of threads to use
max_workers = 24

# Loop through each ticker
for ticker in tickers:
    output_dir = f"/Users/brandon/Documents/stonk_bot_data/{ticker}/" 
    os.makedirs(output_dir, exist_ok=True)

    # Create a dictionary to store monthly dataframes
    monthly_dfs = {}

    # Loop through each year from 2019 to 2025
    for year in range(2024, 2025):
        for month in range(1, 13):
            month_dir = os.path.join(data_dir, str(year), "{:02d}".format(month))
            if not os.path.exists(month_dir):
                print(f"No data found for {year}-{month:02d}")
                continue

            # Collect all file paths to process
            file_paths = [os.path.join(month_dir, file) for file in os.listdir(month_dir) if file.endswith(".csv.gz")]

            # Use ThreadPoolExecutor to process files in parallel
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Create a future for each file
                futures = [executor.submit(process_file, file_path, ticker, f"{year}-{month:02d}", output_dir) for file_path in file_paths]

                # As futures complete, process their results
                for future in as_completed(futures):
                    df_filtered, output_file = future.result()
                    # Append or extend the DataFrame in monthly_dfs
                    year_month = os.path.basename(output_file).replace(f"_{ticker}.csv", "")
                    if year_month not in monthly_dfs:
                        monthly_dfs[year_month] = df_filtered
                    else:
                        monthly_dfs[year_month] = monthly_dfs[year_month].extend(df_filtered)
                    print(f"Processed {output_file}")

    # Output the combined monthly DataFrames to files
    for year_month, df in monthly_dfs.items():
        output_file = os.path.join(output_dir, f"{year_month}_{ticker}.csv")
        df.write_csv(output_file)
        print(f"Saved {output_file}")


In [None]:
#*******CREATE BASE DATA FRAME****************

import os
import os
import polars as pl
import datetime as dt
from datetime import timezone
import numpy as np
from polars import Config
import pytz
from zoneinfo import ZoneInfo

pl.Config.set_fmt_str_lengths(100)
pl.Config.set_tbl_rows(1000)
pl.Config(tbl_cols=-1)

tickers = ["NVDA"]
# tickers = ['NVDA', 'MSFT', 'AAPL', 'TSM']
# tickers = ['NVDA', 'MSFT', 'AAPL', 'TSM',
#            'GOOG', 'FICO']
# tickers = ["NVDA", "AAPL", "GOOG", "MSFT",
#            "TSM", "CRWD", "NVDX", "FICO"]  # List of tickers

# directory_path = f"/Users/brandon/Documents/stonk_bot_data/{ticker}/"

def combine_data_for_tickers(tickers):
    combined_df = None  # Initialize combined DataFrame

    for ticker in tickers:
        directory_path = f"/Users/brandon/Documents/stonk_bot_data/{ticker}/"

        try:
            # List all CSV files in the directory
            files = [os.path.join(directory_path, file) for file in os.listdir(directory_path)
                     if file.endswith(".csv")]
        except FileNotFoundError:
            print(f"Directory not found for ticker: {ticker}. Skipping...")
            continue  # Skip to the next ticker

        # Ensure there are files to process
        if not files:
            print(f"No CSV files found for ticker: {ticker}. Skipping...")
            continue

        # Read the first CSV file to infer the schema
        df = pl.read_csv(files[0])
        schema = {col: df[col].dtype for col in df.columns}

        # Create an empty list to store DataFrames
        dfs = []

        # Read each CSV file and append its DataFrame to the list, applying the schema
        for file in files:
            df = pl.read_csv(file, dtypes=schema)
            dfs.append(df)

        # Concatenate all DataFrames into a single DataFrame for the current ticker
        if dfs:  # Ensure dfs is not empty
            ticker_df = pl.concat(dfs)
            ticker_df = ticker_df.with_columns(pl.lit(ticker).alias('ticker'))

            # Append the DataFrame for the current ticker to the combined DataFrame
            if combined_df is None:
                combined_df = ticker_df
            else:
                combined_df = combined_df.vstack(ticker_df)

    return combined_df


df = combine_data_for_tickers(tickers)



# Drop index column if it exists
if 'index' in df.columns:
    df = df.drop('index')

# Drop duplicates
df = df.unique(subset=df.columns)


# eastern_tz = ZoneInfo("America/New_York")

# def convert_to_est(df):
#     # Convert nanoseconds to microseconds (Polars' timestamp precision)
#     df = df.with_columns([
#         (pl.col('window_start') // 1000).cast(pl.Datetime('us')).alias('window_start_utc'),
#         ((pl.col('window_start') + 60 * 1e9) // 1000).cast(pl.Datetime('us')).alias('window_end_utc')
#     ])

#     # Convert UTC to Eastern Time
#     df = df.with_columns([
#         pl.col('window_start_utc').dt.replace_time_zone("UTC").dt.convert_time_zone(eastern_tz).alias('window_start_est'),
#         pl.col('window_end_utc').dt.replace_time_zone("UTC").dt.convert_time_zone(eastern_tz).alias('window_end_est')
#     ])

#     return df
# df = agg_df

# Ensure Polars DataFrame contains the data with nanoseconds
df = df.with_columns([
    (pl.col("window_start").alias("window_start_ts")),
    (pl.col("window_start") / 1e3).cast(pl.Datetime).alias("window_start_dt"),
    ((pl.col("window_start") + 60 * 1e9) / 1e3).cast(pl.Datetime).alias("window_end_dt")
])
# # Convert UTC datetime to Eastern Time
# df = df.with_columns([
#     pl.col("window_start_dt").dt.convert_time_zone("US/Eastern").alias("window_start_est"),
#     pl.col("window_end_dt").dt.convert_time_zone("US/Eastern").alias("window_end_est")
# ])

df = df.with_columns([
    pl.col("window_start_dt").dt.replace_time_zone("UTC").alias("window_start_dt_utc"),
    pl.col("window_end_dt").dt.replace_time_zone("UTC").alias("window_end_dt_utc")
])

# Convert the timezone to "US/Eastern"
df = df.with_columns([
    pl.col("window_start_dt_utc").dt.convert_time_zone("US/Eastern").alias("window_start_est"),
    pl.col("window_end_dt_utc").dt.convert_time_zone("US/Eastern").alias("window_end_est")
])

# Define trading hours
pre_market_start = dt.time(4, 0)  # Pre-market starts at 4:00 AM
trading_start = dt.time(9, 30)
trading_end = dt.time(16, 0)

# Create flags for regular trading hours, after hours, and pre-market hours
regular_trading_hours = pl.when(
    (pl.col('window_start_est').dt.time() >= trading_start) &
    (pl.col('window_start_est').dt.time() <= trading_end)
).then(1).otherwise(0)

after_hours = pl.when(
    (pl.col('window_start_est').dt.time() > trading_end) |
    (pl.col('window_start_est').dt.time() < pre_market_start)
).then(1).otherwise(0)

pre_market_hours = pl.when(
    (pl.col('window_start_est').dt.time() >= pre_market_start) &
    (pl.col('window_start_est').dt.time() < trading_start)
).then(1).otherwise(0)

# Add the new columns to the DataFrame
df = df.with_columns([
    regular_trading_hours.alias('regular_trading_hours'),
    after_hours.alias('after_hours'),
    pre_market_hours.alias('pre_market_hours'),
    df['window_start_est'].dt.date().alias('transaction_date')
])

unfiltered_df = df
df = df.filter(pl.col('regular_trading_hours') == 1)
print(f"Length unfiltered {len(unfiltered_df)}")
print(f"Length filtered {len(df)}")

In [None]:
df.tail(1000)

In [None]:
#########Apply Stock Splits

import requests
import logging
import datetime as dt
import polars as pl

pl.Config.set_tbl_formatting("UTF8_FULL")
pl.Config.set_fmt_str_lengths(100)


def get_stock_splits(ticker):
    url = f"https://api.polygon.io/v3/reference/splits?ticker={ticker}&apiKey={API_KEY}"
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes
    splits = response.json().get("results", [])
    return splits

# List of tickers
tickers = ["NVDA","MSFT", "TSM", "AAPL","FICO", "CRWD", "NDVX", "GOOG", "META"]

# Combine splits data for all tickers
def combine_splits_for_tickers(tickers):
    all_splits = []
    for ticker in tickers:
        splits = get_stock_splits(ticker)
        for split in splits:
            all_splits.append({
                "ticker": split['ticker'],
                "execution_date": split['execution_date'],
                "split_from": split['split_from'],
                "split_to": split['split_to']
            })
    
    # Convert to Polars DataFrame
    splits_df = pl.DataFrame(all_splits)
    return splits_df

# Fetch and combine splits for all tickers
splits_df = combine_splits_for_tickers(tickers)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Check the type of the execution_date column
if splits_df['execution_date'].dtype == pl.Date:
    print("execution_date is already a Date type. No conversion needed.")
elif splits_df['execution_date'].dtype == pl.Datetime:
    # If it's a datetime, convert to date
    splits_df = splits_df.with_columns([
        pl.col('execution_date').dt.date().alias('execution_date')
    ])
elif splits_df['execution_date'].dtype == pl.Utf8:
    # If it's a string, convert to date
    splits_df = splits_df.with_columns([
        pl.col('execution_date').str.strptime(pl.Date, "%Y-%m-%d").alias('execution_date')
    ])
else:
    raise ValueError(f"Unexpected type for execution_date: {splits_df['execution_date'].dtype}")


def apply_adjustments(df: pl.DataFrame, splits_df: pl.DataFrame, type: str, price_cols: list, volume_cols: list) -> pl.DataFrame:
    # Ensure splits_df is sorted by execution_date
    splits_df = splits_df.sort('execution_date')

    # Convert window_start_est to date
    df = df.with_columns([
        pl.col('window_start_est').dt.date().alias("window_start_date")
    ])

    # Determine the prefix based on the type
    prefix = "adjusted_" if type == "stock" else "adjusted_option_"

    # Initialize the applied_splits column as an empty list
    df = df.with_columns([pl.lit("").alias('applied_splits')])

    # Create adjusted columns for price and volume
    for col in price_cols + volume_cols:
        df = df.with_columns([pl.col(col).alias(f'{prefix}{col}')])

    for split in splits_df.to_dicts():
        ticker = split['ticker']
        execution_date = split['execution_date']
        # If execution_date is already a date, we don't need to parse it
        if isinstance(execution_date, dt.date):
            pass
        elif isinstance(execution_date, str):
            execution_date = dt.datetime.strptime(execution_date, '%Y-%m-%d').date()
        else:
            raise ValueError(f"Unexpected type for execution_date: {type(execution_date)}")
        
        split_from = split['split_from']
        split_to = split['split_to']
        ratio = split_to / split_from

        # Convert the split dictionary to a string for storage
        split_str = str(split)

        # Apply split to relevant rows
        adjustments = []
        for col in price_cols:
            adjustments.append(
                pl.when((pl.col('ticker') == ticker) & (pl.col('window_start_date') < execution_date))
                .then(pl.col(f'{prefix}{col}') / ratio)
                .otherwise(pl.col(f'{prefix}{col}')).alias(f'{prefix}{col}')
            )
        
        for col in volume_cols:
            adjustments.append(
                pl.when((pl.col('ticker') == ticker) & (pl.col('window_start_date') < execution_date))
                .then(pl.col(f'{prefix}{col}') * ratio)
                .otherwise(pl.col(f'{prefix}{col}')).alias(f'{prefix}{col}')
            )

        adjustments.append(
            pl.when((pl.col('ticker') == ticker) & (pl.col('window_start_date') < execution_date))
            .then(pl.concat_str([pl.col('applied_splits'), pl.lit(f",{split_str}")]))
            .otherwise(pl.col('applied_splits')).alias('applied_splits')
        )

        df = df.with_columns(adjustments)

    return df

# Apply the stock splits to the DataFrame
df_split = apply_adjustments(df, splits_df, "stock", ["open", "close", "high", "low"], ["volume"])

def format_column(column):
    return column.cast(pl.Int64)

cols = ["adjusted_volume"]

# Apply the function to all columns
df_split = df_split.with_columns([format_column(pl.col(column)).alias(column) for column in cols])

print("Stock Splits Applied")
df_split = df_split.sort("window_start_est")
df_split.tail(10000)


In [None]:
######ADD HISTORICAL/REALIZED VOLATILITY

def add_historical_volatility(df: pl.DataFrame, period: int = 252) -> pl.DataFrame:
    # Ensure the DataFrame is sorted by window_start_est
    df = df.sort('window_start_est')

    # Extract the date part and calculate daily close prices
    daily_df = df.group_by('transaction_date').agg([
        pl.col('adjusted_close').last().alias('daily_close')
    ])

    # Calculate daily returns
    daily_df = daily_df.with_columns([
        pl.col('daily_close').pct_change().alias('daily_return')
    ]).drop_nulls()

    # Check if we have enough data for the specified period
    if len(daily_df) < period:
        raise ValueError(f"Not enough data for the specified period. Got {len(daily_df)} days, need at least {period}.")

    # Calculate the standard deviation of daily returns for a rolling window
    daily_df = daily_df.with_columns([
        pl.col('daily_return').rolling_std(window_size=period, min_periods=period)
        .mul(252 ** 0.5)  # Annualize
        .mul(.01) # divide by 100 for percentage
        .alias('hv')
    ])

    # Join the HV data back to the original DataFrame
    df = df.join(daily_df.select(['transaction_date', 'hv']), on='transaction_date', how='left')

    # Fill any NaN values in the 'hv' column with the first valid HV value
    first_valid_hv = df.select(pl.col('hv').drop_nulls().first()).item()
    df = df.with_columns(pl.col('hv').fill_null(first_valid_hv))

    return df


df_with_hv = add_historical_volatility(df_split)
print("Historical Volatility calculated")
# df_with_hv.tail(10000)

In [None]:
#*******ADD INDICATORS****************
import ta

df = df_with_hv
# Define a function to calculate EMA given a series, span, and adjust flag
def ema_custom(series: pl.Series, name: str, span: int) -> pl.Series:

    ema = series.ewm_mean(com=span, adjust=False, ignore_nulls=True).alias(name)
    return ema

df = df.sort('window_start')

# Calculate traded value for each transaction
df = df.with_columns([
    (pl.col('volume') * pl.col('close')).alias('traded_value')
])
df = df.with_columns([
    (pl.col('adjusted_volume') * pl.col('adjusted_close')).alias('adjusted_traded_value')
])

# Calculate cumulative totals within each trading session
df = df.with_columns([
    pl.col('traded_value').cum_sum().over(['ticker', 'transaction_date']).alias('daily_traded_value'),
    pl.col('volume').cum_sum().over(['ticker', 'transaction_date']).alias('daily_cumulative_volume'),
    pl.col('adjusted_traded_value').cum_sum().over(['ticker', 'transaction_date']).alias('adjusted_daily_traded_value'),
    pl.col('adjusted_volume').cum_sum().over(['ticker', 'transaction_date']).alias('adjusted_daily_cumulative_volume')
])

# Calculate the VWAP for each transaction
df = df.with_columns([
    (pl.col('adjusted_daily_traded_value') / pl.col('adjusted_daily_cumulative_volume')).alias('daily_vwap')
])

df = df.with_columns([
    pl.col('daily_vwap').rolling_mean(window_size=2).over(["ticker"]).alias('vwap_2d'),
    pl.col('daily_vwap').rolling_mean(window_size=3).over(["ticker"]).alias('vwap_3d'),
    pl.col('daily_vwap').ewm_mean(com=200, adjust=False, ignore_nulls=True).over(["ticker", "transaction_date"]).alias('ema_200_vwap')
])

# Extract hour from 'window_start_est'
df = df.with_columns(pl.col('window_start_est').dt.hour().alias('hour'))

hourly_vwap_calculation = df.group_by(['ticker', 'transaction_date', 'hour']).agg([
    (pl.sum('traded_value').alias('hourly_traded_value')),
    (pl.sum('volume').alias('hourly_volume')),
    (pl.sum('traded_value') / pl.sum('volume')).alias('hourly_vwap')
])

# Merge the hourly VWAP back into the original DataFrame
df = df.join(
    hourly_vwap_calculation,
    on=['ticker', 'transaction_date', 'hour'],
    how='left'
)

df = df.sort(by=['ticker', 'transaction_date'], descending=True)


# Define a function to calculate EMA given a series, span, and adjust flag
def ema_custom(series: pl.Series, name: str, span: int) -> pl.Series:
    alpha = 2 / (span + 1)
    ema = series.ewm_mean(com=span, adjust=False, ignore_nulls=True).alias(name)
    return ema

# Calculate EMA for 12 and 26 periods and add them to the DataFrame
df = df.with_columns(
    [
        ema_custom(df["adjusted_close"], f"ema_{span}", span=span)
        for span in [12, 26]
    ]
)

def calculate_sma(df, periods=[50, 200]):
    for period in periods:
        df = df.with_columns(
            pl.col('adjusted_close').rolling_mean(window_size=period).alias(f'sma_{period}')
        )
    return df

df = calculate_sma(df, periods=[50, 200])

print(df.schema)

# Calculate the MACD line
df = df.with_columns((pl.col("ema_12") - pl.col("ema_26")).alias("MACD_line"))

# Calculate the Signal line
df = df.with_columns(ema_custom(pl.col("MACD_line"), "signal_line", span=9))

# Calculate the distance between MACD and Signal lines
df = df.with_columns((pl.col("MACD_line") - pl.col("signal_line")).alias("MACD_Signal_distance"))

# Calculate the slope of the MACD line (current value - previous value)
df = df.with_columns(pl.col("MACD_line").diff().alias("MACD_slope"))

# Calculate the slope of the Signal line (current value - previous value)
df = df.with_columns(pl.col("signal_line").diff().alias("Signal_slope"))

# # Identify MACD line crosses above Signal line (bullish signal)
# df = df.with_columns(
#     ((pl.col("MACD_line") > pl.col("signal_line")) & 
#     (pl.col("MACD_line").shift(-1) <= pl.col("signal_line").shift(-1))).cast(pl.Int8).alias("MACD_cross_above_Signal")
# )

# # Identify MACD line crosses below Signal line (bearish signal)
# df = df.with_columns(
#     ((pl.col("MACD_line") < pl.col("signal_line")) & 
#     (pl.col("MACD_line").shift(-1) >= pl.col("signal_line").shift(-1))).cast(pl.Int8).alias("MACD_cross_below_Signal")
# )

# Identify MACD line crosses above Signal line (bullish signal)
df = df.with_columns(
    (
        (pl.col("MACD_line") > pl.col("signal_line")) &
        (pl.col("MACD_line").shift(1) <= pl.col("signal_line").shift(1))
    ).cast(pl.Int8).alias("MACD_cross_above_Signal")
)

# Identify MACD line crosses below Signal line (bearish signal)
df = df.with_columns(
    (
        (pl.col("MACD_line") < pl.col("signal_line")) &
        (pl.col("MACD_line").shift(1) >= pl.col("signal_line").shift(1))
    ).cast(pl.Int8).alias("MACD_cross_below_Signal")
)

# Calculate the angle of intersection (in degrees) at the points where the MACD crosses the Signal line
# df = df.with_columns([
#     pl.when(
#         (df["MACD_cross_above_Signal"] == 1) | (df["MACD_cross_below_Signal"] == 1)
#     ).then(
#         (pl.col("MACD_slope") - pl.col("Signal_slope")).map_elements(np.rad2deg)
#     ).otherwise(pl.lit(None)).alias("MACD_x_Signal_angle_of_intersection")
# ])

# Group by ticker and transaction_date and calculate the sums.
daily_crosses = df.group_by(['ticker', 'transaction_date']).agg([
    pl.sum('MACD_cross_above_Signal').alias('daily_MACD_crosses_above'),
    pl.sum('MACD_cross_below_Signal').alias('daily_MACD_crosses_below')
])

# Create the flags for crosses above, below, or at zero
df = df.with_columns([
    pl.when(
        (pl.col("MACD_cross_above_Signal") == 1) & (pl.col("MACD_line") < 0)
    ).then(True).otherwise(False).alias("MACD_x_above_Signal_below_zero"),
    
    pl.when(
        (pl.col("MACD_cross_below_Signal") == 1) & (pl.col("MACD_line") > 0)
    ).then(True).otherwise(False).alias("MACD_x_below_Signal_above_zero"),
    
    pl.when(
        (pl.col("MACD_cross_above_Signal") == 1) & (pl.col("MACD_line") == 0)
    ).then(True).otherwise(False).alias("MACD_x_above_Signal_at_zero"),
    
    pl.when(
        (pl.col("MACD_cross_below_Signal") == 1) & (pl.col("MACD_line") == 0)
    ).then(True).otherwise(False).alias("MACD_x_below_Signal_at_zero")
])

df = df.with_columns(
    (pl.col('MACD_line') - pl.col('signal_line')).alias('MACD_histogram')
)

# Add a column for previous histogram values
df = df.with_columns(pl.col("MACD_histogram").shift(1).alias("previous_histogram"))

# Define the momentum based on the histogram values
df = df.with_columns([
    pl.when(
        (pl.col("MACD_histogram") > 0) & 
        (pl.col("MACD_histogram") > pl.col("previous_histogram"))
    ).then(True)
    .otherwise(False)
    .alias("histogram_bullish_momentum_strengthening"),

    pl.when(
        (pl.col("MACD_histogram") < 0) &
        (pl.col("MACD_histogram") < pl.col("previous_histogram"))
    ).then(True)
    .otherwise(False)
    .alias("histogram_bearish_momentum_strengthening"),

    pl.when(
        (pl.col("MACD_histogram") > 0) &
        (pl.col("MACD_histogram") < pl.col("previous_histogram"))
    ).then(True)
    .otherwise(False)
    .alias("histogram_bullish_momentum_weakening"),

    pl.when(
        (pl.col("MACD_histogram") < 0) &
        (pl.col("MACD_histogram") > pl.col("previous_histogram"))
    ).then(True)
    .otherwise(False)
    .alias("histogram_bearish_momentum_weakening")
])

df = df.sort('window_start')

prediction_window = 20

# Shift the close price by negative prediction window to get future prices
df = df.with_columns([
    pl.col('close').shift(-prediction_window).alias('20m_future_close')
])

df = df.with_columns([
    ((pl.col('20m_future_close') - pl.col('close')) / pl.col('close')).alias('20m_future_price_change_pct'),
    (pl.col('20m_future_close') - pl.col('close')).alias('20m_future_price_change')
    # You can add more calculations as needed for your analysis
])


def calculate_rsi(df, period=14):
    # Calculate price change (delta)
    df = df.with_columns(
        pl.col('close').diff().alias('delta')
    )
    
    # Separate the positive and negative price changes
    df = df.with_columns([
        pl.when(pl.col('delta') > 0).then(pl.col('delta')).otherwise(0).alias('gain'),
        pl.when(pl.col('delta') < 0).then(-pl.col('delta')).otherwise(0).alias('loss')
    ])
    
    # Calculate the rolling average gain and loss
    df = df.with_columns([
        pl.col('gain').rolling_mean(window_size=period, min_periods=period).alias('avg_gain'),
        pl.col('loss').rolling_mean(window_size=period, min_periods=period).alias('avg_loss')
    ])
    
    # Compute the Relative Strength (RS)
    df = df.with_columns(
        (pl.col('avg_gain') / pl.col('avg_loss')).alias('rs')
    )
    
    # Calculate the RSI
    df = df.with_columns(
        (100 - (100 / (1 + pl.col('rs')))).alias('rsi')
    )
    
    # Drop intermediate columns if necessary
    df = df.drop(['delta', 'gain', 'loss', 'avg_gain', 'avg_loss', 'rs'])
    
    return df

df = calculate_rsi(df, period=14)


def calculate_atr(df, period=14):
    df = df.with_columns(
        [
            (pl.col('high') - pl.col('low')).alias('hl'),
            (pl.col('high') - pl.col('close').shift(1)).abs().alias('hc'),
            (pl.col('low') - pl.col('close').shift(1)).abs().alias('lc')
        ]
    )
    
    # Calculate true range as the maximum of the three calculated columns
    df = df.with_columns(
        pl.max_horizontal(['hl', 'hc', 'lc']).alias('true_range')
    )
    
    # Calculate ATR as the rolling mean of the true range
    df = df.with_columns(
        pl.col('true_range').rolling_mean(window_size=period).alias('atr')
    )
    
    # Drop intermediate columns if necessary
    df = df.drop(['hl', 'hc', 'lc', 'true_range'])
    
    return df

df = calculate_atr(df, period=14)


def calculate_bollinger_bands(df, period=20, std_dev=2):
    df = df.with_columns(
        pl.col('close').rolling_mean(window_size=period).alias('bb_middle_band')
    )
    df = df.with_columns(
        pl.col('close').rolling_std(window_size=period).alias('std_dev')
    )
    df = df.with_columns(
        (pl.col('bb_middle_band') + std_dev * pl.col('std_dev')).alias('bb_upper_band'),
        (pl.col('bb_middle_band') - std_dev * pl.col('std_dev')).alias('bb_lower_band')
    )
    return df.drop(['std_dev'])

df = calculate_bollinger_bands(df, period=20, std_dev=2)


def calculate_stochastic_oscillator(df, period=14, k=3, d=3):
    df = df.with_columns(
        [
            (pl.col('close') - pl.col('low').rolling_min(window_size=period)).alias('num'),
            (pl.col('high').rolling_max(window_size=period) - pl.col('low').rolling_min(window_size=period)).alias('den')
        ]
    )
    df = df.with_columns(
        ((pl.col('num') / pl.col('den')) * 100).alias('stochastic_k')
    )
    df = df.with_columns(
        pl.col('stochastic_k').rolling_mean(window_size=k).alias('stochastic_d')
    )
    return df.drop(['num', 'den'])

df = calculate_stochastic_oscillator(df, period=14, k=3, d=3)


def calculate_adx(df, period=14):
    df = df.with_columns(
        [
            (pl.col('high') - pl.col('high').shift(1)).abs().alias('up_move'),
            (pl.col('low') - pl.col('low').shift(1)).abs().alias('down_move')
        ]
    )

    df = df.with_columns(
        [
            pl.when((pl.col('up_move') > pl.col('down_move')) & (pl.col('up_move') > 0)).then(pl.col('up_move')).otherwise(0).alias('plus_dm'),
            pl.when((pl.col('down_move') > pl.col('up_move')) & (pl.col('down_move') > 0)).then(pl.col('down_move')).otherwise(0).alias('minus_dm')
        ]
    )

    df = df.with_columns(
        [
            pl.col('plus_dm').rolling_mean(window_size=period).alias('plus_di'),
            pl.col('minus_dm').rolling_mean(window_size=period).alias('minus_di')
        ]
    )

    df = df.with_columns(
        ((pl.col('plus_di') - pl.col('minus_di')).abs() / (pl.col('plus_di') + pl.col('minus_di')) * 100).alias('dx')
    )

    df = df.with_columns(
        pl.col('dx').rolling_mean(window_size=period).alias('adx')
    )

    return df.drop(['up_move', 'down_move', 'plus_dm', 'minus_dm', 'plus_di', 'minus_di', 'dx'])

df = calculate_adx(df, period=14)

#Candlestick Pattern detection
def detect_engulfing(df):
    df = df.with_columns([
        ((df["close"] > df["open"]) & (df["close"].shift(1) < df["open"].shift(1)) & 
         (df["close"] > df["open"].shift(1)) & (df["open"] < df["close"].shift(1))).alias("bullish_engulfing"),
        ((df["close"] < df["open"]) & (df["close"].shift(1) > df["open"].shift(1)) & 
         (df["close"] < df["open"].shift(1)) & (df["open"] > df["close"].shift(1))).alias("bearish_engulfing")
    ])
    return df

def detect_momentum(df):
    df = df.with_columns([
        ((df["high"] - df["low"]) > 2 * (df["high"].shift(1) - df["low"].shift(1))).alias("momentum")
    ])
    return df

def detect_multiple_wicks(df):
    df = df.with_columns([
        ((df["high"] - df["close"]) > (df["close"] - df["low"]) * 2).alias("long_upper_wick"),
        ((df["close"] - df["low"]) > (df["high"] - df["close"]) * 2).alias("long_lower_wick")
    ])
    
    df = df.with_columns([
        (df["long_upper_wick"].cast(pl.Int32).rolling_sum(window_size=3) >= 2).alias("multiple_upper_wicks"),
        (df["long_lower_wick"].cast(pl.Int32).rolling_sum(window_size=3) >= 2).alias("multiple_lower_wicks")
    ])
    
    return df

def detect_doji(df):
    df = df.with_columns([
        ((df["high"] - df["low"]) > 0) & (abs(df["close"] - df["open"]) / (df["high"] - df["low"]) < 0.1).alias("doji")
    ])
    return df

# def detect_hammer(df):
#     df = df.with_columns([
#         ((df["close"] > df["open"]) & ((df["high"] - df["close"]) <= (df["close"] - df["open"])) & 
#          ((df["close"] - df["open"]) > 2 * (df["open"] - df["low"]))).alias("hammer")
#     ])
#     return df


def detect_bullish_hammer(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns([
        (
            # Lower wick is at least twice the body
            (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) - pl.col("adjusted_low") >= 
             2 * (pl.col("adjusted_close") - pl.col("adjusted_open")).abs()) &
            
            # Upper wick is no more than 10% of lower wick
            (pl.col("adjusted_high") - pl.max_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) <= 
             0.1 * (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) - pl.col("adjusted_low"))) &
            
            # Body is in the upper half of the range
            (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) >= 
             (pl.col("adjusted_high") + pl.col("adjusted_low")) / 2) &
            
            # Close is higher than open
            (pl.col("adjusted_close") > pl.col("adjusted_open"))
        ).alias("bullish_hammer")
    ])

def detect_skrong_bullish_hammer(df: pl.DataFrame, min_score: int = 2) -> pl.DataFrame:
    return df.with_columns(
        (pl.col("bullish_hammer_score") >= min_score).alias("skrong_bullish_hammer")
    )

def score_bullish_hammer(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns([
        (
            pl.when(pl.col("bullish_hammer"))
            .then(
                (pl.col("rsi") < 30).cast(pl.Int8) +  # Oversold condition
                (pl.col("adjusted_close") < pl.col("bb_lower_band")).cast(pl.Int8) +  # Price below lower Bollinger Band
                (pl.col("MACD_line") > pl.col("signal_line")).cast(pl.Int8) +  # MACD bullish crossover
                (pl.col("adx") > 25).cast(pl.Int8) +  # Strong trend
                (pl.col("volume") > pl.col("volume").rolling_mean(window_size=20)).cast(pl.Int8) +  # Above average volume
                (pl.col("adjusted_close") < pl.col("vwap_2d")).cast(pl.Int8) +  # Price below 2-day VWAP
                (pl.col("stochastic_k") < 20).cast(pl.Int8)  # Stochastic oversold
            )
            .otherwise(0)
        ).alias("bullish_hammer_score")
        
    ])

def detect_bearish_hammer(df: pl.DataFrame) -> pl.DataFrame:
    return df.with_columns([
        (
            # Lower wick is at least twice the body
            (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) - pl.col("adjusted_low") >= 
             2 * (pl.col("adjusted_close") - pl.col("adjusted_open")).abs()) &
            
            # Upper wick is no more than 10% of lower wick
            (pl.col("adjusted_high") - pl.max_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) <= 
             0.1 * (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) - pl.col("adjusted_low"))) &
            
            # Body is in the upper half of the range
            (pl.min_horizontal(pl.col("adjusted_open"), pl.col("adjusted_close")) >= 
             (pl.col("adjusted_high") + pl.col("adjusted_low")) / 2) &
            
            # Close is lower than open
            (pl.col("adjusted_close") < pl.col("adjusted_open"))
        ).alias("bearish_hammer")
    ])


def detect_shooting_star(df):
    df = df.with_columns([
        ((df["open"] > df["close"]) & ((df["high"] - df["open"]) <= (df["open"] - df["close"])) & 
         ((df["open"] - df["close"]) > 2 * (df["close"] - df["low"]))).alias("shooting_star")
    ])
    return df

def detect_tweezer(df):
    df = df.with_columns([
        ((df["close"].shift(1) < df["open"].shift(1)) & (df["close"] > df["open"]) & 
         (df["low"].shift(1) == df["low"])).alias("bullish_tweezer"),
        ((df["close"].shift(1) > df["open"].shift(1)) & (df["close"] < df["open"]) & 
         (df["high"].shift(1) == df["high"])).alias("bearish_tweezer")
    ])
    return df

def detect_marubozu(df):
    df = df.with_columns([
        ((df["close"] > df["open"]) & (df["low"] == df["open"]) & (df["high"] == df["close"])).alias("bullish_marubozu"),
        ((df["close"] < df["open"]) & (df["low"] == df["close"]) & (df["high"] == df["open"])).alias("bearish_marubozu")
    ])
    return df


def calculate_roc(df: pl.DataFrame, period: int) -> pl.DataFrame:
    df = df.with_columns(
        [
            (pl.col("close") - pl.col("close").shift(period)).alias("price_change"),
            pl.col("close").shift(period).alias("price_n_periods_ago")
        ]
    )
    
    df = df.with_columns(
        ((pl.col("price_change") / pl.col("price_n_periods_ago")) * 100).alias("roc")
    )
    
    # Drop intermediate columns if necessary
    df = df.drop(['price_change', 'price_n_periods_ago'])
    
    return df

df = calculate_roc(df, period=14)

# Detecting candlestick patterns
df = detect_engulfing(df)
df = detect_momentum(df)
df = detect_multiple_wicks(df)
df = detect_doji(df)
df = detect_bullish_hammer(df)
df = detect_bearish_hammer(df)
df = detect_shooting_star(df)
df = detect_tweezer(df)
df = detect_marubozu(df)
df = score_bullish_hammer(df)
df = detect_skrong_bullish_hammer(df)

print("Indicators Added")

import logging 
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a logger
logger = logging.getLogger(__name__)

def calculate_parabolic_sar(df, step=0.02, max_step=0.2):
    logger.info("Starting Parabolic SAR calculation")
    
    # Store the original schema
    original_schema = df.schema
    
    # Convert datetime columns to integers (nanoseconds since epoch)
    datetime_columns = [col for col, dtype in original_schema.items() if isinstance(dtype, pl.Datetime)]
    for col in datetime_columns:
        df = df.with_columns(pl.col(col).dt.timestamp().cast(pl.Int64).alias(f"{col}_temp"))
        df = df.drop(col)
    
    # Initialize columns with default values
    df = df.with_columns([
        pl.lit(None).cast(pl.Float64).alias('parabolic_sar'),
        pl.lit(step).alias('acceleration_factor'),
        pl.lit(None).cast(pl.Float64).alias('extreme_point'),
        pl.lit(None).cast(pl.Float64).alias('prev_sar')
    ])
    
    sar_data = df.to_dict(as_series=False)
    in_uptrend = True
    for i in range(1, len(sar_data['close'])):
        prior_sar = sar_data['prev_sar'][i-1]
        if prior_sar is None:
            sar_data['parabolic_sar'][i] = sar_data['low'][i-1]
            sar_data['extreme_point'][i] = sar_data['high'][i-1]
        else:
            if in_uptrend:
                sar_data['parabolic_sar'][i] = prior_sar + sar_data['acceleration_factor'][i-1] * (sar_data['extreme_point'][i-1] - prior_sar)
                in_uptrend = sar_data['close'][i] > sar_data['parabolic_sar'][i]
            else:
                sar_data['parabolic_sar'][i] = prior_sar - sar_data['acceleration_factor'][i-1] * (prior_sar - sar_data['extreme_point'][i-1])
                in_uptrend = sar_data['close'][i] < sar_data['parabolic_sar'][i]
            if in_uptrend:
                sar_data['extreme_point'][i] = max(sar_data['extreme_point'][i-1], sar_data['high'][i])
            else:
                sar_data['extreme_point'][i] = min(sar_data['extreme_point'][i-1], sar_data['low'][i])
            if (in_uptrend and sar_data['close'][i] > sar_data['extreme_point'][i-1]) or (not in_uptrend and sar_data['close'][i] < sar_data['extreme_point'][i-1]):
                sar_data['acceleration_factor'][i] = min(max_step, sar_data['acceleration_factor'][i-1] + step)
            sar_data['prev_sar'][i] = sar_data['parabolic_sar'][i]
    
    df = pl.DataFrame(sar_data).drop(['acceleration_factor', 'extreme_point', 'prev_sar'])
    
    # Restore datetime columns
    for col in datetime_columns:
        df = df.with_columns(pl.from_epoch(pl.col(f"{col}_temp"), time_unit="ns").alias(col))
        df = df.drop(f"{col}_temp")
    
    # Reapply the original schema
    for col, dtype in original_schema.items():
        if col in df.columns:
            df = df.with_columns(pl.col(col).cast(dtype))
    
    logger.info("Finished calculating Parabolic SAR")
    return df

# Fibonacci Retracement (simple example using last significant low and high)
def calculate_fibonacci_retracement(df):
    logger.info("Starting Fibonacci Retracement calculation")
    low = df['low'].min()
    high = df['high'].max()
    diff = high - low
    retracements = [0.236, 0.382, 0.5, 0.618, 0.786]

    retracement_levels = {f'fib_{int(level * 100)}': high - diff * level for level in retracements}
    
    df = df.with_columns([pl.lit(value).alias(key) for key, value in retracement_levels.items()])

    logger.info("Finished calclating Fibonacci Retracement")
    return df

df = calculate_fibonacci_retracement(df)

# Ichimoku Cloud
def calculate_ichimoku_cloud(df):
    logger.info("Starting Ichimoku Cloud calculation")
    
    # Ensure columns are of type Float64
    df = df.with_columns([
        pl.col('high').cast(pl.Float64),
        pl.col('low').cast(pl.Float64),
        pl.col('close').cast(pl.Float64)
    ])
    
    high_9 = df['high'].rolling_max(window_size=9)
    low_9 = df['low'].rolling_min(window_size=9)
    df = df.with_columns([
        ((high_9 + low_9) / 2).alias('tenkan_sen'),
        ((df['high'].rolling_max(window_size=26) + df['low'].rolling_min(window_size=26)) / 2).alias('kijun_sen'),
    ])
    df = df.with_columns(
        ((df['tenkan_sen'] + df['kijun_sen']) / 2).shift(26).alias('senkou_span_a')
    )
    df = df.with_columns(
        ((df['high'].rolling_max(window_size=52) + df['low'].rolling_min(window_size=52)) / 2).shift(26).alias('senkou_span_b')
    )
    df = df.with_columns(
        df['close'].shift(-26).alias('chikou_span')
    )

    logger.info("Finished Ichimoku Cloud calculation")
    return df

df = calculate_ichimoku_cloud(df)


In [None]:

# Filter the DataFrame by the date range
start_date = "2024-02-01"
end_date = "2024-03-30"

# Convert strings to Polars date objects using pl.lit and pl.col
filtered_df = df.filter(
    (pl.col("window_start_est") >= pl.lit(start_date).cast(pl.Date)) &
    (pl.col("window_start_est") <= pl.lit(end_date).cast(pl.Date))
)

filtered_df.group_by(["transaction_date"]).len()
# filtered_df.sort(['window_start_est'], descending=True)
# len(filtered_df)
# df_test.tail(10000)

In [None]:
filtered_df.write_csv("/Users/brandon/Documents/indicators_240201_240330.csv")

In [None]:
# hm = df.filter(pl.col("skrong_bullish_hammer") == True).group_by(['transaction_date']).len().sort('transaction_date')
# len(hm)
# hm

hm = (
    df.filter(pl.col("skrong_bullish_hammer") == True)
    .groupby(['window_start_est', 'bullish_hammer_score'])
    .agg(pl.count())
    .sort(['window_start_est', 'bullish_hammer_score'])
)
len(hm)
hm

In [None]:
#CALCULATE SUPPORT AND RESISTANCE
import numpy as np
import polars as pl
import datetime as dt

def calculate_pivot_points(df: pl.DataFrame, n1: int, n2: int) -> pl.DataFrame:
    def pivotid(l, n1, n2, lows, highs):
        if l - n1 < 0 or l + n2 >= len(lows):
            return 0
        
        pividlow = 1
        pividhigh = 1
        for i in range(l - n1, l + n2 + 1):
            if lows[l] > lows[i]:
                pividlow = 0
            if highs[l] < highs[i]:
                pividhigh = 0
        
        if pividlow and pividhigh:
            return 3
        elif pividlow:
            return 1
        elif pividhigh:
            return 2
        else:
            return 0
            
    print(f"STARTING pivot calc at {dt.datetime.now()}")
    lows = df['adjusted_low'].to_numpy()
    highs = df['adjusted_high'].to_numpy()
    pivot_points = [pivotid(i, n1, n2, lows, highs) for i in range(len(lows))]
    
    df = df.with_columns(pl.Series(name='pivot', values=pivot_points))
    return df

def calculate_support_resistance(df: pl.DataFrame, n1: int, n2: int) -> pl.DataFrame:
    # Calculate pivot points
    df = calculate_pivot_points(df, n1, n2)
    
    # Initialize columns for support and resistance levels
    support_levels = np.full(len(df), np.nan)
    resistance_levels = np.full(len(df), np.nan)
    
    # Identify support and resistance levels
    low_series = df['adjusted_low'].to_numpy()
    high_series = df['adjusted_high'].to_numpy()
    pivot_series = df['pivot'].to_numpy()
    
    print(f"STARTING Support and Resistance calc at {dt.datetime.now()}")
    for i in range(len(pivot_series)):
        if pivot_series[i] == 1:
            support_levels[i] = low_series[i]
        elif pivot_series[i] == 2:
            resistance_levels[i] = high_series[i]
    
    # Add the support and resistance levels back to the DataFrame
    df = df.with_columns([
        pl.Series(name='support_level', values=support_levels),
        pl.Series(name='resistance_level', values=resistance_levels)
    ])
    
    return df

def calculate_weighted_support_resistance(df: pl.DataFrame, threshold: float = 0.01) -> pl.DataFrame:
    # Convert Polars DataFrame to NumPy arrays for faster calculations
    support_levels = df['support_level'].drop_nulls().unique().to_numpy()
    resistance_levels = df['resistance_level'].drop_nulls().unique().to_numpy()
    low_prices = df['adjusted_low'].to_numpy()
    high_prices = df['adjusted_high'].to_numpy()

    # Initialize dictionaries to count touches
    support_touch_counts = {}
    resistance_touch_counts = {}

    # Count touches for support levels
    for level in support_levels:
        support_touch_counts[level] = np.sum((low_prices >= (level * (1 - threshold))) &
                                             (low_prices <= (level * (1 + threshold))))

    # Count touches for resistance levels
    for level in resistance_levels:
        resistance_touch_counts[level] = np.sum((high_prices >= (level * (1 - threshold))) & 
                                                (high_prices <= (level * (1 + threshold))))

    # Group and normalize levels
    significant_support_levels = {level: count for level, count in support_touch_counts.items() if count >= 3}
    significant_resistance_levels = {level: count for level, count in resistance_touch_counts.items() if count >= 3}

    def group_levels(levels):
        grouped_levels = {}
        for level, count in levels.items():
            found = False
            for key in grouped_levels:
                if abs(level - key) <= threshold * key:
                    grouped_levels[key] += count
                    found = True
                    break
            if not found:
                grouped_levels[level] = count
        return grouped_levels

    grouped_support_levels = group_levels(significant_support_levels)
    grouped_resistance_levels = group_levels(significant_resistance_levels)

    # Add significant levels back to the DataFrame
    df = df.with_columns([
        pl.col('support_level').map_elements(lambda x: x if x in grouped_support_levels else np.nan).alias('weighted_support_level'),
        pl.col('resistance_level').map_elements(lambda x: x if x in grouped_resistance_levels else np.nan).alias('weighted_resistance_level')
    ])

    return df
    
def preprocess_with_support_resistance(df: pl.DataFrame, n1: int = 10, n2: int = 10) -> pl.DataFrame:
    try:
        df = calculate_support_resistance(df, n1, n2)
        df = calculate_weighted_support_resistance(df)
        return df
        
    except Exception as e:
        current_time = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        raise RuntimeError(f"Error at {current_time}: {str(e)}")

try:
    print(f"STARTING SRChannel calc at {dt.datetime.now()}")
    df = preprocess_with_support_resistance(df)
    print(f"FINISHED SRChannel calc at {dt.datetime.now()}")
    # df.tail(1000)
except RuntimeError as e:
    print(e)


In [30]:
import os
import pandas as pd
import json
import base64
from openai import OpenAI

# Set your OpenAI API key
api_key = "sk-proj-BTvkAwAGbdgQF19UGfXFT3BlbkFJMEsTpMLsh9RpHkiEvX3m"
client = OpenAI(api_key=api_key)

# Base directory where the files are stored
base_dir = os.path.expanduser("/Users/brandon/Documents/chart_images/20240223145500")

# Your res_format definition (as provided in your code)
res_format = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "symbol": {"type": "string", "description": "The ticker symbol."},
        "date": {"type": "string", "format": "date", "description": "Date in YYYY-MM-DD."},
        "underlying_stock_price": {"type": "number", "description": "Stock price."},
        "option_type": {"type": "string", "enum": ["Call", "Put"], "description": "Option type."},
        "strike_price": {"type": "number", "description": "Strike price."},
        "entry_price": {"type": "number", "description": "Entry price."},
        "exit_price": {"type": "number", "description": "Exit price."},
        "recommendation": {"type": "string", "enum": ["Buy", "Sell", "Hold"], "description": "Recommendation."},
        "confidence_level": {"type": "number", "minimum": 0, "maximum": 100, "description": "Confidence level in %."},
        "support_levels": {"type": "array", "items": {"type": "number"}, "description": "Support levels."},
        "resistance_levels": {"type": "array", "items": {"type": "number"}, "description": "Resistance levels."},
        "indicators_summary": {"type": "string", "description": "Summary of key indicators."},
        "additional_notes": {"type": "string", "description": "Additional notes."}
    },
    "required": ["symbol", "date", "option_type", "strike_price", "recommendation"],
    "additionalProperties": False
}

# Function to extract key metrics from the CSV
def extract_metrics(csv_path):
    df = pd.read_csv(csv_path)
    # Extract or calculate key metrics from the CSV
    metrics = df.describe().to_string()
    return metrics

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def generate_recommendation(indicators, image_path):
    # Encode the image
    base64_image = encode_image(image_path)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""You are a financial analyst. Analyze the chart and provide a recommendation
                            based on the chart and the following indicators. 
                            Use the following JSON format: {json.dumps(res_format, indent=2)} 
                            as a template to fill in for your response with the answers. Please only return the JSON
                            reponse, so I can consistently parse your answers. Only respond with a parsable json object, 
                            and please omit the schema since I already have it."""
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Here are the key indicators:\n{indicators}"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

directories = ["/Users/brandon/Documents/chart_images/20240223145500"]

# Run the analysis for each directory
for directory in directories:
    # Extract indicators from CSV
    indicators = extract_metrics(os.path.join(directory, "indicators_20240223145500.csv"))
    
    # Generate the recommendation
    image_path = os.path.join(directory, "daily_chart_20240223145500.png")
    recommendation = generate_recommendation(indicators, image_path)
    print(recommendation)
    
    # Save the recommendation
    with open(os.path.join(directory, "recommendation.txt"), "w") as f:
        f.write(recommendation)

print("Recommendations generated and saved.")

```json
{
  "symbol": "NVDA",
  "date": "2024-02-23",
  "underlying_stock_price": 78.85,
  "option_type": "Call",
  "strike_price": 80,
  "entry_price": 79.5,
  "exit_price": 80.5,
  "recommendation": "Buy",
  "confidence_level": 75,
  "support_levels": [78.5, 78],
  "resistance_levels": [80, 81],
  "indicators_summary": "The stock is showing signs of potential upward movement with the price approaching resistance levels. The RSI is neutral around 50-60, and MACD shows convergence. Volume is relatively high, suggesting strong market participation.",
  "additional_notes": "Monitor closely for any divergence in RSI or MACD, and ensure to evaluate the market context. Set a stop-loss at 77.5 to mitigate potential downside risk."
}
```
Recommendations generated and saved.


In [12]:
import openai

openai.api_key = api_key


from openai import OpenAI
client = OpenAI(api_key=api_key)

# List available models
models = client.models.list()

# Print the names of the models you have access to
for model in models['data']:
    print(model['id'])

TypeError: 'SyncPage[Model]' object is not subscriptable

In [26]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import polars as pl
import plotly.io as pio
import numpy as np
import os
import shutil
from datetime import datetime, time, timedelta

def generate_and_save_chart(df, ticker, window_start, window_end, filename):
    # Filter the Polars DataFrame to only include data within the specified window
    filtered_df = df.filter(
        (pl.col('ticker') == ticker) & 
        (pl.col('window_start_est') >= pl.lit(window_start)) & 
        (pl.col('window_start_est') <= pl.lit(window_end))
    )

    # Determine crossing points
    cross_above = filtered_df.filter(pl.col('MACD_cross_above_Signal') == 1)['window_start_est']
    cross_below = filtered_df.filter(pl.col('MACD_cross_below_Signal') == 1)['window_start_est']

    # Convert to pandas for Plotly compatibility
    filtered_df_pd = filtered_df.to_pandas()
    
    # Create a subplot with 4 rows
    # fig = make_subplots(
    #     rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.1,  # Increased spacing
    #     row_heights=[0.5, 0.2, 0.15, 0.15],  # Adjust the heights of the subplots
    #     subplot_titles=(
    #         f'{ticker} Stock Price with 2-day and 3-day VWAP',
    #         'MACD & Signal Line',
    #         'adjusted_volume',
    #         'RSI'
    #     )
    # )

    fig = make_subplots(
    rows=4, cols=1, 
    shared_xaxes=True, 
    vertical_spacing=0.1,
    row_heights=[0.5, 0.2, 0.15, 0.15],
    subplot_titles=(
        f'{ticker} Stock Price with VWAPs',
        'MACD & Signal',
        'Volume',
        'RSI'
    ),
    figure=go.Figure(layout=go.Layout(height=800, width=1000))  # Increase figure size
)

    # Add the candlestick chart to the first row
    fig.add_trace(
        go.Candlestick(
            x=filtered_df_pd['window_start_est'], open=filtered_df_pd['adjusted_open'],
            high=filtered_df_pd['adjusted_high'], low=filtered_df_pd['adjusted_low'],
            close=filtered_df_pd['adjusted_close']
        ),
        row=1, col=1
    )

    # Adding the 2-day VWAP to the candlestick chart
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['vwap_2d'],
            mode='lines', name='2-day VWAP', line=dict(width=2)
        ),
        row=1, col=1
    )

    # Adding the 3-day VWAP to the candlestick chart
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['vwap_3d'],
            mode='lines', name='3-day VWAP', line=dict(width=2)
        ),
        row=1, col=1
    )

    # Adding the ema_200 to the candlestick chart
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['ema_200_vwap'],
            mode='lines', name='EMA 200', line=dict(width=2)
        ),
        row=1, col=1
    )

    # Add support and resistance lines to the candlestick chart
    support_levels = filtered_df_pd['support_level'].dropna().unique()
    resistance_levels = filtered_df_pd['resistance_level'].dropna().unique()

    for level in support_levels:
        fig.add_hline(y=level, line=dict(color="green", width=1, dash="dash"), row=1, col=1)

    for level in resistance_levels:
        fig.add_hline(y=level, line=dict(color="red", width=1, dash="dash"), row=1, col=1)

    # Add MACD line to the second row
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['MACD_line'],
            mode='lines', name='MACD Line'
        ),
        row=2, col=1
    )

    # Add Signal line to the second row
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['signal_line'],
            mode='lines', name='Signal Line'
        ),
        row=2, col=1
    )

    # Add vertical lines for MACD crossing above Signal line
    for cross_time in cross_above:
        fig.add_vline(
            x=cross_time, line=dict(color="green", width=2),
            row='all', col=1
        )

    # Add vertical lines for MACD crossing below Signal line
    for cross_time in cross_below:
        fig.add_vline(
            x=cross_time, line=dict(color="red", width=2),
            row='all', col=1
        )

    # Add MACD histogram to the second row (below MACD and Signal Line)
    fig.add_trace(
        go.Bar(
            x=filtered_df_pd['window_start_est'],
            y=filtered_df_pd['MACD_histogram'],
            name='MACD Histogram',
            marker_color=np.where(filtered_df_pd['MACD_histogram'] >= 0, 'green', 'red')  # color bars conditionally
        ),
        row=2, col=1
    )

    # Add horizontal line at y=0 in the MACD plot
    fig.add_hline(y=0, line=dict(color="black", width=2, dash="solid"), row=2, col=1)

    # Add volume bars to the third row
    fig.add_trace(
        go.Bar(
            x=filtered_df_pd['window_start_est'],
            y=filtered_df_pd['adjusted_volume'],
            marker_color='blue',
            name='adjusted_volume'
        ),
        row=3, col=1
    )

    # Add RSI line to the fourth row
    fig.add_trace(
        go.Scatter(
            x=filtered_df_pd['window_start_est'], y=filtered_df_pd['rsi'],
            mode='lines', name='RSI'
        ),
        row=4, col=1
    )

    # Add horizontal lines at RSI levels 30 and 70
    fig.add_hline(y=30, line=dict(color="red", width=1, dash="dash"), row=4, col=1)
    fig.add_hline(y=70, line=dict(color="red", width=1, dash="dash"), row=4, col=1)

    # Updating layout for better readability
    # fig.update_layout(
    #     title=f'{ticker} Stock Price and Technical Indicators on {window_end}',
    #     xaxis_title='Time',
    #     yaxis_title='Price (USD)',
    #     legend_title_text='Indicator',
    #     xaxis_rangeslider_visible=False
    # )

    # # Adjust x-axis and y-axis titles for subplots
    # fig.update_xaxes(title_text="Time", row=1, col=1, tickformat="%H:%M")
    # fig.update_xaxes(title_text="Time", row=2, col=1, tickformat="%H:%M")
    # fig.update_xaxes(title_text="Time", row=3, col=1, tickformat="%H:%M")
    # fig.update_xaxes(title_text="Time", row=4, col=1, tickformat="%H:%M")

    # fig.update_yaxes(title_text="Price (USD)", row=1, col=1)
    # fig.update_yaxes(title_text="MACD", row=2, col=1)
    # fig.update_yaxes(title_text='adjusted_volume', row=3, col=1)

    fig.update_layout(
        title=f'{ticker} Stock Price and Technical Indicators on {window_end.strftime("%Y-%m-%d %H:%M %Z")}',
        xaxis_title='Time',
        yaxis_title='Price (USD)',
        legend_title_text='Indicator',
        xaxis_rangeslider_visible=False,
        font=dict(size=10)  # Reduce overall font size
    )

    # Update x-axes
    for i in range(1, 5):
        fig.update_xaxes(
            title_text="Time" if i == 4 else None,  # Only show "Time" on bottom subplot
            row=i, col=1,
            tickangle=45,  # Rotate labels
            nticks=6  # Reduce number of ticks
        )

    # Update y-axes
    fig.update_yaxes(title_text="Price (USD)", row=1, col=1)
    fig.update_yaxes(title_text="MACD", row=2, col=1)
    fig.update_yaxes(title_text='Vol', row=3, col=1)
    fig.update_yaxes(title_text='RSI', row=4, col=1)

    # Save the plot as an image
    fig.write_image(filename)

# Loop through each minute bar in the dataset and generate the chart up to that minute
trading_start_time = time(9, 30)
trading_end_time = time(16, 0)

# Convert the window start and end to the correct timezone
eastern = pytz.timezone('US/Eastern')


# print(unique_timestamps)

from concurrent.futures import ThreadPoolExecutor

unique_timestamps = filtered_df.select(pl.col('window_start_est')).unique().sort('window_start_est').to_series().to_list()
unique_timestamps = [
    ts for ts in unique_timestamps
    if ts.time() >= time(9, 30) and ts.time() <= time(16, 0)
]

def process_timestamp(timestamp):
    print(f"Processing: {timestamp}")
    # Ensure window_start is set to 9:30 AM Eastern time for the date of the timestamp
    window_start_date = timestamp.date()
    window_start = datetime.combine(window_start_date, time(9, 30), timestamp.tzinfo)
    window_end = timestamp
    # filename = f"/Volumes/WD18TB/chart_images/daily_chart_{timestamp}.png"
    filename = f"/Users/brandon/Documents/chart_images/daily_chart_{timestamp}.png"
    generate_and_save_chart(df, 'NVDA', window_start, window_end, filename)
    print(f"Saved {filename}")

def execute_concurrently(unique_timestamps, max_workers=4, clear_dir=False):
    # Clear the chart_images directory if clear_dir is True
    if clear_dir:
        # dir_path = "/Volumes/WD18TB/chart_images"
        dir_path = "/Users/brandon/Documents/chart_images/"
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)
        print(f"Cleared and recreated directory: {dir_path}")

    # Execute the function concurrently for each timestamp
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(process_timestamp, unique_timestamps)

# Example usage with configurable workers
max_workers = 8
execute_concurrently(unique_timestamps, max_workers, clear_dir=True)

Cleared and recreated directory: /Volumes/WD18TB/chart_images
Processing: 2024-02-01 09:30:00-05:00
Processing: 2024-02-01 09:31:00-05:00
Processing: 2024-02-01 09:32:00-05:00
Processing: 2024-02-01 09:33:00-05:00
Processing: 2024-02-01 09:34:00-05:00
Processing: 2024-02-01 09:35:00-05:00
Processing: 2024-02-01 09:36:00-05:00
Processing: 2024-02-01 09:37:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 09:37:00-05:00.png
Processing: 2024-02-01 09:38:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 09:32:00-05:00.png
Processing: 2024-02-01 09:39:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 09:33:00-05:00.png
Processing: 2024-02-01 09:40:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 09:36:00-05:00.png
Processing: 2024-02-01 09:41:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 09:34:00-05:00.png
Processing: 2024-02-01 09:42:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 0

KeyboardInterrupt: 

Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:34:00-05:00.png
Processing: 2024-02-01 10:42:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:39:00-05:00.png
Processing: 2024-02-01 10:43:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:33:00-05:00.png
Processing: 2024-02-01 10:44:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:37:00-05:00.png
Processing: 2024-02-01 10:45:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:38:00-05:00.png
Processing: 2024-02-01 10:46:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:40:00-05:00.png
Processing: 2024-02-01 10:47:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:36:00-05:00.png
Processing: 2024-02-01 10:48:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:41:00-05:00.png
Processing: 2024-02-01 10:49:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 10:42:00-05:00.png
Pro

In [None]:
################MOVE files into timestamp specific directory

import polars as pl
import os
from datetime import timedelta, datetime
from concurrent.futures import ThreadPoolExecutor
import pytz

# Load the indicators CSV
# df = pl.read_csv("/mnt/data/indicators_240201_240330.csv")

# Ensure window_start_est is treated as EST
df = df.with_columns(pl.col("window_start_est").cast(pl.Datetime).dt.replace_time_zone("America/New_York"))

# Set the directory path
base_dir = os.path.expanduser("/Users/brandon/Documents/chart_images/")

# Function to process each directory
def process_directory(directory):
    # Extract the timestamp from the directory name
    timestamp_str = directory  # directory name is already in the correct format YYYYMMDDHHMMSS
    timestamp = datetime.strptime(timestamp_str, "%Y%m%d%H%M%S")
    est = pytz.timezone('America/New_York')
    timestamp_est = est.localize(timestamp)
    
    # Calculate the date range
    start_date = timestamp_est - timedelta(days=20)
    end_date = timestamp_est + timedelta(days=20)
    
    # Filter the DataFrame
    filtered_df = df.filter((pl.col("window_start_est") >= start_date) & (pl.col("window_start_est") <= end_date))
    
    # Save to CSV in the respective directory
    output_path = os.path.join(base_dir, directory, f"indicators_{timestamp_str}.csv")
    filtered_df.write_csv(output_path)
    print(f"Saved CSV for {timestamp_str}")

# Get the list of directories
directories = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d)) and d.isdigit()]

# Use ThreadPoolExecutor to process directories in parallel
with ThreadPoolExecutor() as executor:
    # Map the function to the directories
    list(executor.map(process_directory, directories))

print("All directories processed.")

In [27]:
# Chart with Support/Resistance Channels

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from pandas import Timestamp
import plotly.io as pio
import numpy as np

# Settings
pd.set_option('display.max_rows', None)
pio.renderers.default = "notebook"

ticker = 'NVDA'

# Convert the Polars DataFrame to pandas
pandas_df = df.filter((pl.col('ticker') == ticker)).to_pandas()
pandas_df['window_start_est'] = pd.to_datetime(pandas_df['window_start_est'])

# Print the hourly counts before timezone conversion
hourly_counts = pandas_df['window_start_est'].dt.hour.value_counts().sort_index()
print("Hourly counts before timezone conversion:")
print(hourly_counts)

# Ensure the 'window_start_est' column is interpreted as UTC initially
# pandas_df['window_start_est'] = pandas_df['window_start_est'].dt.tz_localize('UTC')
# pandas_df['window_start_est'] = pandas_df['window_start_est'].dt.tz_convert('US/Eastern')

# Properly convert the 'window_start_est' column to US/Eastern
pandas_df['window_start_est'] = pandas_df['window_start_est'].dt.tz_convert('US/Eastern')

# Print the hourly counts after timezone conversion
hourly_counts = pandas_df['window_start_est'].dt.hour.value_counts().sort_index()
print("Hourly counts after timezone conversion:")
print(hourly_counts)

# Verify the full range of 'window_start_est' column
print("Full range of 'window_start_est':")
print("Min:", pandas_df['window_start_est'].min())
print("Max:", pandas_df['window_start_est'].max())

# Filter the DataFrame to only include data within the specified window
window_start = Timestamp('2024-05-15 09:30', tz='US/Eastern')
window_end = Timestamp('2024-05-15 16:00', tz='US/Eastern')

window_start = Timestamp('2024-02-02 09:30', tz='US/Eastern')
window_end = Timestamp('2024-02-02 16:00', tz='US/Eastern')

filtered_df = pandas_df[
    (pandas_df['window_start_est'] >= window_start) &
    (pandas_df['window_start_est'] <= window_end)
]

# Print the hourly counts for the filtered data
hourly_counts_filtered = filtered_df['window_start_est'].dt.hour.value_counts().sort_index()
print("Filtered hourly counts:")
print(hourly_counts_filtered)

# Verify the data range after filtering
print("Filtered data range:")
print(filtered_df[['window_start_est', 'adjusted_volume']].describe())
print(filtered_df['window_start_est'].min())
print(filtered_df['window_start_est'].max())

# Check if RSI has been calculated correctly
print(filtered_df[['window_start_est', 'rsi']].dropna().head(10))

# Determine crossing points
cross_above = filtered_df[filtered_df['MACD_cross_above_Signal'] == 1]['window_start_est']
cross_below = filtered_df[filtered_df['MACD_cross_below_Signal'] == 1]['window_start_est']

# Create a subplot with 4 rows
fig = make_subplots(
    rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.1,  # Increased spacing
    row_heights=[0.5, 0.2, 0.15, 0.15],  # Adjust the heights of the subplots
    subplot_titles=(
        f'{ticker} Stock Price with 2-day and 3-day VWAP',
        'MACD & Signal Line',
        'adjusted_volume',
        'RSI'
    )
)

# Add the candlestick chart to the first row
fig.add_trace(
    go.Candlestick(
        x=filtered_df['window_start_est'], open=filtered_df['adjusted_open'],
        high=filtered_df['adjusted_high'], low=filtered_df['adjusted_low'],
        close=filtered_df['adjusted_close']
    ),
    row=1, col=1
)

# Adding the 2-day VWAP to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['vwap_2d'],
        mode='lines', name='2-day VWAP', line=dict(width=2)
    ),
    row=1, col=1
)

# Adding the 3-day VWAP to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['vwap_3d'],
        mode='lines', name='3-day VWAP', line=dict(width=2)
    ),
    row=1, col=1
)

# Adding the ema_200 to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['ema_200_vwap'],
        mode='lines', name='EMA 200', line=dict(width=2)
    ),
    row=1, col=1
)

# Add support and resistance lines to the candlestick chart
support_levels = filtered_df['support_level'].dropna().unique()
resistance_levels = filtered_df['resistance_level'].dropna().unique()

for level in support_levels:
    fig.add_hline(y=level, line=dict(color="green", width=1, dash="dash"), row=1, col=1)

for level in resistance_levels:
    fig.add_hline(y=level, line=dict(color="red", width=1, dash="dash"), row=1, col=1)

# Add MACD line to the second row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['MACD_line'],
        mode='lines', name='MACD Line'
    ),
    row=2, col=1
)

# Add Signal line to the second row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['signal_line'],
        mode='lines', name='Signal Line'
    ),
    row=2, col=1
)

# Add vertical lines for MACD crossing above Signal line
for cross_time in cross_above:
    fig.add_vline(
        x=cross_time, line=dict(color="green", width=2),
        row='all', col=1
    )

# Add vertical lines for MACD crossing below Signal line
for cross_time in cross_below:
    fig.add_vline(
        x=cross_time, line=dict(color="red", width=2),
        row='all', col=1
    )

# Add MACD histogram to the second row (below MACD and Signal Line)
fig.add_trace(
    go.Bar(
        x=filtered_df['window_start_est'],
        y=filtered_df['MACD_histogram'],
        name='MACD Histogram',
        marker_color=np.where(filtered_df['MACD_histogram'] >= 0, 'green', 'red')  # color bars conditionally
    ),
    row=2, col=1
)

# Add horizontal line at y=0 in the MACD plot
fig.add_hline(y=0, line=dict(color="black", width=2, dash="solid"), row=2, col=1)

# Add volume bars to the third row
fig.add_trace(
    go.Bar(
        x=filtered_df['window_start_est'],
        y=filtered_df['adjusted_volume'],
        marker_color='blue',
        name='adjusted_volume'
    ),
    row=3, col=1
)

# Add RSI line to the fourth row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['rsi'],
        mode='lines', name='RSI'
    ),
    row=4, col=1
)

# Add horizontal lines at RSI levels 30 and 70
fig.add_hline(y=30, line=dict(color="red", width=1, dash="dash"), row=4, col=1)
fig.add_hline(y=70, line=dict(color="red", width=1, dash="dash"), row=4, col=1)

# Updating layout for better readability
fig.update_layout(
    title=f'{ticker} Stock Price and Technical Indicators on {window_start.strftime("%Y-%m-%d %H:%M %Z")}',
    xaxis_title='Time',
    yaxis_title='Price (USD)',
    legend_title_text='Indicator',
    xaxis_rangeslider_visible=False
)

# Adjust x-axis and y-axis titles for subplots
fig.update_xaxes(title_text="Time", row=1, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=2, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=3, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=4, col=1, tickformat="%H:%M")

fig.update_yaxes(title_text="Price (USD)", row=1, col=1)
fig.update_yaxes(title_text="MACD", row=2, col=1)
fig.update_yaxes(title_text='adjusted_volume', row=3, col=1)

# Show the plot
pio.renderers.default = "notebook"
fig.show(renderer='browser')

Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:54:00-05:00.pngProcessing: 2024-02-01 13:02:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:55:00-05:00.png
Processing: 2024-02-01 13:03:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:56:00-05:00.png
Processing: 2024-02-01 13:04:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:57:00-05:00.png
Processing: 2024-02-01 13:05:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:58:00-05:00.png
Processing: 2024-02-01 13:06:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 12:59:00-05:00.png
Processing: 2024-02-01 13:07:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 13:00:00-05:00.png
Processing: 2024-02-01 13:08:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 13:01:00-05:00.png
Processing: 2024-02-01 13:09:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 13:02:00-05:00.png
Proc

In [28]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
from pandas import Timestamp
import plotly.io as pio
import numpy as np

# Settings
pd.set_option('display.max_rows', None)
pio.renderers.default = "notebook"

ticker = 'NVDA'

# Convert the Polars DataFrame to pandas
pandas_df = df.filter((pl.col('ticker') == ticker)).to_pandas()
pandas_df['window_start_est'] = pd.to_datetime(pandas_df['window_start_est'])

# Ensure the 'window_start_est' column is interpreted as UTC initially
pandas_df['window_start_est'] = pandas_df['window_start_est'].dt.tz_localize('UTC')

# Properly convert the 'window_start_est' column to US/Eastern
pandas_df['window_start_est'] = pandas_df['window_start_est'].dt.tz_convert('US/Eastern')

# Filter the DataFrame to only include data within the specified window
window_start = Timestamp('2024-05-15 09:30', tz='US/Eastern')
window_end = Timestamp('2024-05-15 16:00', tz='US/Eastern')
window_start = Timestamp('2024-05-07 09:30', tz='US/Eastern')
window_end = Timestamp('2024-05-07 16:00', tz='US/Eastern')
window_start = Timestamp('2024-02-01 09:30', tz='US/Eastern')
window_end = Timestamp('2024-02-01 16:00', tz='US/Eastern')

filtered_df = pandas_df[
    (pandas_df['window_start_est'] >= window_start) &
    (pandas_df['window_start_est'] <= window_end)
]

# Determine crossing points
cross_above = filtered_df[filtered_df['MACD_cross_above_Signal'] == 1]['window_start_est']
cross_below = filtered_df[filtered_df['MACD_cross_below_Signal'] == 1]['window_start_est']

# Create a subplot with 5 rows
fig = make_subplots(
    rows=5, cols=1, shared_xaxes=True, vertical_spacing=0.1,  # Increased spacing
    row_heights=[0.4, 0.2, 0.15, 0.15, 0.1],  # Adjust the heights of the subplots
    subplot_titles=(
        f'{ticker} Stock Price with 2-day and 3-day VWAP',
        'MACD & Signal Line',
        'adjusted_volume',
        'RSI',
        'Hammer Candlesticks'
    )
)

# Add the candlestick chart to the first row
fig.add_trace(
    go.Candlestick(
        x=filtered_df['window_start_est'], open=filtered_df['adjusted_open'],
        high=filtered_df['adjusted_high'], low=filtered_df['adjusted_low'],
        close=filtered_df['adjusted_close']
    ),
    row=1, col=1
)

# Adding the 2-day VWAP to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['vwap_2d'],
        mode='lines', name='2-day VWAP', line=dict(width=2)
    ),
    row=1, col=1
)

# Adding the 3-day VWAP to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['vwap_3d'],
        mode='lines', name='3-day VWAP', line=dict(width=2)
    ),
    row=1, col=1
)

# Adding the ema_200 to the candlestick chart
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['ema_200_vwap'],
        mode='lines', name='EMA 200', line=dict(width=2)
    ),
    row=1, col=1
)

# Add support and resistance lines to the candlestick chart
support_levels = filtered_df['support_level'].dropna().unique()
resistance_levels = filtered_df['resistance_level'].dropna().unique()

for level in support_levels:
    fig.add_hline(y=level, line=dict(color="green", width=1, dash="dash"), row=1, col=1)

for level in resistance_levels:
    fig.add_hline(y=level, line=dict(color="red", width=1, dash="dash"), row=1, col=1)

# Add MACD line to the second row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['MACD_line'],
        mode='lines', name='MACD Line'
    ),
    row=2, col=1
)

# Add Signal line to the second row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['signal_line'],
        mode='lines', name='Signal Line'
    ),
    row=2, col=1
)

# Add vertical lines for MACD crossing above Signal line
for cross_time in cross_above:
    fig.add_vline(
        x=cross_time, line=dict(color="green", width=2),
        row='all', col=1
    )

# Add vertical lines for MACD crossing below Signal line
for cross_time in cross_below:
    fig.add_vline(
        x=cross_time, line=dict(color="red", width=2),
        row='all', col=1
    )

# Add MACD histogram to the second row (below MACD and Signal Line)
fig.add_trace(
    go.Bar(
        x=filtered_df['window_start_est'],
        y=filtered_df['MACD_histogram'],
        name='MACD Histogram',
        marker_color=np.where(filtered_df['MACD_histogram'] >= 0, 'green', 'red')  # color bars conditionally
    ),
    row=2, col=1
)

# Add horizontal line at y=0 in the MACD plot
fig.add_hline(y=0, line=dict(color="black", width=2, dash="solid"), row=2, col=1)

# Add volume bars to the third row
fig.add_trace(
    go.Bar(
        x=filtered_df['window_start_est'],
        y=filtered_df['adjusted_volume'],
        marker_color='blue',
        name='adjusted_volume'
    ),
    row=3, col=1
)

# Add RSI line to the fourth row
fig.add_trace(
    go.Scatter(
        x=filtered_df['window_start_est'], y=filtered_df['rsi'],
        mode='lines', name='RSI'
    ),
    row=4, col=1
)

# Add horizontal lines at RSI levels 30 and 70
fig.add_hline(y=30, line=dict(color="red", width=1, dash="dash"), row=4, col=1)
fig.add_hline(y=70, line=dict(color="red", width=1, dash="dash"), row=4, col=1)

# Create a DataFrame for hammer candles
hammer_df = filtered_df[filtered_df['skrong_bullish_hammer'] == True]

# Add hammer candlestick chart to the fifth row
# fig.add_trace(
#     go.Candlestick(
#         x=hammer_df['window_start_est'], open=hammer_df['adjusted_open'],
#         high=hammer_df['adjusted_high'], low=hammer_df['adjusted_low'],
#         close=hammer_df['adjusted_close'],
#         increasing_line_color='purple', decreasing_line_color='purple',
#         name='Hammer Candlestick'
#     ),
#     row=5, col=1
# )

# Updating layout for better readability
fig.update_layout(
    title=f'{ticker} Stock Price and Technical Indicators on {window_start.strftime("%Y-%m-%d %H:%M %Z")}',
    xaxis_title='Time',
    yaxis_title='Price (USD)',
    legend_title_text='Indicator',
    xaxis_rangeslider_visible=False
)

# Adjust x-axis and y-axis titles for subplots
fig.update_xaxes(title_text="Time", row=1, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=2, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=3, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=4, col=1, tickformat="%H:%M")
fig.update_xaxes(title_text="Time", row=5, col=1, tickformat="%H:%M")

fig.update_yaxes(title_text="Price (USD)", row=1, col=1)
fig.update_yaxes(title_text="MACD", row=2, col=1)
fig.update_yaxes(title_text='adjusted_volume', row=3, col=1)
fig.update_yaxes(title_text='RSI', row=4, col=1)
fig.update_yaxes(title_text='Price (USD)', row=5, col=1)

# Show the plot
pio.renderers.default = "notebook"
fig.show(renderer='browser')


Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:35:00-05:00.png
Processing: 2024-02-01 15:42:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:34:00-05:00.png
Processing: 2024-02-01 15:43:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:37:00-05:00.png
Processing: 2024-02-01 15:44:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:36:00-05:00.png
Processing: 2024-02-01 15:45:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:39:00-05:00.png
Processing: 2024-02-01 15:46:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:38:00-05:00.png
Processing: 2024-02-01 15:47:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:40:00-05:00.pngProcessing: 2024-02-01 15:48:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:41:00-05:00.png
Processing: 2024-02-01 15:49:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:42:00-05:00.png
Proc

TypeError: Already tz-aware, use tz_convert to convert.

Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:59:00-05:00.png
Processing: 2024-02-02 09:41:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-01 15:58:00-05:00.png
Processing: 2024-02-02 09:42:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:35:00-05:00.png
Processing: 2024-02-02 09:43:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:36:00-05:00.png
Processing: 2024-02-02 09:44:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:37:00-05:00.png
Processing: 2024-02-02 09:45:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:38:00-05:00.png
Processing: 2024-02-02 09:46:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:39:00-05:00.pngProcessing: 2024-02-02 09:47:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:40:00-05:00.png
Processing: 2024-02-02 09:48:00-05:00
Saved /Volumes/WD18TB/chart_images/daily_chart_2024-02-02 09:41:00-05:00.png
Proc

In [None]:
#######ADD OPTIONS DATA
import os
import polars as pl

# Directory path where the CSV files are downloaded
directory_path = "/Volumes/WD18TB/us_options/minute_aggs/2024/05/"

# List all CSV files in the directory that end with "14.csv.gz"
# files = [file for file in os.listdir(directory_path) if file.endswith("15.csv.gz")]
files = [file for file in os.listdir(directory_path)]
print(files)

def apply_adjustments(df: pl.DataFrame, splits_df: pl.DataFrame, type: str, price_cols: list, volume_cols: list) -> pl.DataFrame:
    # Ensure splits_df is sorted by execution_date
    splits_df = splits_df.sort('execution_date')

    # Determine the prefix and ticker column based on the type
    prefix = "adjusted_" if type == "stock" else "adjusted_option_"
    ticker_col = "ticker" if type == "stock" else "symbol"

    # Initialize the applied_splits column as an empty list
    df = df.with_columns([pl.lit("").alias('applied_splits')])

    # Create adjusted columns for price and volume
    for col in price_cols + volume_cols:
        df = df.with_columns([pl.col(col).alias(f'{prefix}{col}')])

    for split in splits_df.to_dicts():
        ticker = split['ticker']
        execution_date = dt.datetime.strptime(split['execution_date'], '%Y-%m-%d')
        split_from = split['split_from']
        split_to = split['split_to']
        ratio = split_to / split_from

        # Convert the split dictionary to a string for storage
        split_str = str(split)

        # Apply split to relevant rows
        adjustments = []
        for col in price_cols:
            adjustments.append(
                pl.when((pl.col(ticker_col) == ticker) & (pl.col('window_start_est') < execution_date))
                .then(pl.col(f'{prefix}{col}') / ratio)
                .otherwise(pl.col(f'{prefix}{col}')).alias(f'{prefix}{col}')
            )
        
        for col in volume_cols:
            adjustments.append(
                pl.when((pl.col(ticker_col) == ticker) & (pl.col('window_start_est') < execution_date))
                .then(pl.col(f'{prefix}{col}') * ratio)
                .otherwise(pl.col(f'{prefix}{col}')).alias(f'{prefix}{col}')
            )

        adjustments.append(
            pl.when((pl.col(ticker_col) == ticker) & (pl.col('window_start_est') < execution_date))
            .then(pl.concat_str([pl.col('applied_splits'), pl.lit(f",{split_str}")]))
            .otherwise(pl.col('applied_splits')).alias('applied_splits')
        )

        df = df.with_columns(adjustments)

    return df

# Create an empty list to store DataFrames
dfs = []

# Read each CSV file and append its DataFrame to the list
for file in files:
    op_df = pl.read_csv(os.path.join(directory_path, file))
    dfs.append(op_df)

# Concatenate all DataFrames into a single DataFrame
option_df = pl.concat(dfs)

# Filter the DataFrame for a specific substring in the ticker column
option_df = option_df.filter(pl.col("ticker").str.contains("NVDA"))  # Replace "NVDA" with the desired substring

# Extract symbol, year, month, day, option_type, and strike_price
option_df = option_df.with_columns([
    pl.col("ticker").str.slice(2, 4).alias("symbol"),
    (pl.col("ticker").str.slice(6, 2).cast(pl.Int32) + 2000).cast(pl.Utf8).alias("year"),
    pl.col("ticker").str.slice(8, 2).cast(pl.Utf8).alias("month"),
    pl.col("ticker").str.slice(10, 2).cast(pl.Utf8).alias("day"),
    pl.col("ticker").str.slice(12, 1).alias("option_type"),
    (pl.col("ticker").str.slice(13).cast(pl.Float64) / 1000).alias("strike_price")
])

# Construct expiry date
option_df = option_df.with_columns(
    (pl.col("year") + "-" + pl.col("month") + "-" + pl.col("day")).alias("expiry_date")
)

# Drop intermediate columns used for parsing
option_df = option_df.drop(["year", "month", "day"])

if "symbol" not in df.columns:
    df = df.rename({"ticker": "symbol"})
    
joined_df = df.join(option_df, on=["window_start", "symbol"], how="left")

joined_df = joined_df.filter(pl.col("strike_price").is_not_null())
price_cols = ["strike_price", "open_right", "close_right", "high_right", "low_right"]
joined_split_df = apply_adjustments(joined_df, splits_df, "option", price_cols , ["volume_right"])

# Define a function to add the time_to_expiry column
def add_time_to_expiry(df: pl.DataFrame) -> pl.DataFrame:
    # Ensure expiry_date and transaction_date are in date format
    df = df.with_columns([
        pl.col('expiry_date').str.strptime(pl.Date, "%Y-%m-%d").alias('expiry_date'),
        pl.col('transaction_date').cast(pl.Date).alias('transaction_date')
    ])
    
    # Calculate the time_to_expiry in days
    df = df.with_columns([
        (pl.col('expiry_date') - pl.col('transaction_date')).dt.total_days().alias('time_to_expiry')
    ])
    
    return df

joined_split_df = add_time_to_expiry(joined_split_df)
print("Options Data Added")


In [None]:
# import datetime
# # len(df.filter(pl.col("transaction_date") == datetime.date(2024, 3, 14)))
# count_per_window_start = joined_df.groupby("window_start_est").agg(pl.count().alias("count"))
# count_per_window_start.sort("window_start_est", "strike_price")
the_cols = ['transaction_date', 'expiry_date', 'time_to_expiry']
# joined_split_df.filter(pl.col('time_to_expiry') >= 1)
# joined_split_df
test_df = joined_split_df.filter(pl.col('time_to_expiry') > 1)
test_df.select(the_cols)
len(joined_split_df.filter(pl.col('hammer') == True))

In [None]:
####### ADD RISK FREE RATE FOR BLACK SCHOLES CALC

import polars as pl

# Read the rate DataFrame
rate_df = pl.read_csv("/Users/brandon/Downloads/daily-treasury-rates.csv")

# Convert the Date column to yyyy-mm-dd format and rename it to rate_date
rate_df = rate_df.with_columns([
    pl.col('Date').str.strptime(pl.Date, '%m/%d/%Y').alias('rate_date')
])

# Drop the original Date column
rate_df = rate_df.drop('Date')

# List of terms and corresponding days
terms = ["1 Mo", "2 Mo", "3 Mo", "4 Mo", "6 Mo", "1 Yr", "2 Yr", "3 Yr", "5 Yr"]
term_days = [30, 60, 90, 120, 180, 365, 2*365, 3*365, 5*365]

# Create a mapping from the terms to the desired column names
rename_mapping = {term: f"{days}_term_rate_period" for term, days in zip(terms, term_days)}

# Rename the columns
rate_df = rate_df.rename(rename_mapping)
rate_df = rate_df.drop(["7 Yr", "10 Yr", "20 Yr", "30 Yr"])

# Reorder columns to move rate_date to the first position
cols = ['rate_date'] + [col for col in rate_df.columns if col != 'rate_date']
rate_df = rate_df.select(cols)
# print(rate_df)

# Verify columns in rate_df
# print("Columns in rate_df:", rate_df.columns)

# Verify columns in joined_split_df before the join
# print("Columns in joined_split_df before the join:", joined_split_df.columns)

# Perform the join
joined_split_with_rates_df = joined_split_df.join(rate_df, left_on='transaction_date', right_on='rate_date', how='left')
# joined_split_with_rates_df
# Verify columns in the resulting DataFrame
# print("Columns in joined_split_with_rates_df after the join:", joined_split_with_rates_df.columns)
print("Rate data added")

In [None]:
# joined_split_with_rates_df.select('rate_date')
joined_split_with_rates_df.columns
len(joined_split_with_rates_df)
# rate_df

In [None]:
########## CALCULATE BLACK-SCHOLES OPTION PRICES AND COMPARE TO ACTUAL

import numpy as np
from scipy.stats import norm
import polars as pl

def black_scholes(S, K, T, r, sigma, option_type='C'):
    """
    Calculate Black-Scholes option price.

    Parameters:
    S (float): Current stock price
    K (float): Strike price
    T (float): Time to expiry in years
    r (float): Risk-free rate (annualized)
    sigma (float): Volatility (annualized)
    option_type (str): 'C' for call, 'P' for put

    Returns:
    float: Theoretical option price
    """
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)

    if option_type == 'C':
        price = S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)
    elif option_type == 'P':
        price = K * np.exp(-r * T) * norm.cdf(-d2) - S * norm.cdf(-d1)
    else:
        raise ValueError("option_type must be 'C' for call or 'P' for put")
    
    return price

def compare_actual_vs_theoretical_prices(df):
    """
    Compare actual option prices with theoretical Black-Scholes prices.

    Parameters:
    df (pl.DataFrame): DataFrame containing options data

    Returns:
    pl.DataFrame: DataFrame with added theoretical prices and price differences
    """
    # Ensure necessary columns are present
    required_columns = ['close', 'strike_price', 'time_to_expiry', 
                        '30_term_rate_period', 'hv', 'option_type', 'close_right']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"DataFrame is missing one or more required columns: {required_columns}")

    # Convert time to expiry to years
    df = df.with_columns([
        (pl.col('time_to_expiry') / 365).alias('time_to_expiry_years')
    ])

    # Define the lambda function for calculating theoretical option prices
    bs_lambda = lambda row: black_scholes(
        S=row['close'],
        K=row['strike_price'],
        T=row['time_to_expiry_years'],
        r=row['30_term_rate_period'],
        sigma=row['hv'],
        option_type=row['option_type']
    )

    # Calculate theoretical option prices and add to the DataFrame
    df = df.with_columns([
        pl.struct([
            'close',
            'strike_price',
            'time_to_expiry_years',
            '30_term_rate_period',
            'hv',
            'option_type'
        ]).map_elements(bs_lambda).alias('bs_theoretical_price')
    ])

    # Calculate the difference between actual and theoretical prices
    df = df.with_columns([
        (pl.col('close_right') - pl.col('bs_theoretical_price')).alias('price_difference'),
        (
            pl.when(pl.col('bs_theoretical_price') != 0)
            .then((pl.col('close_right') - pl.col('bs_theoretical_price')) / pl.col('bs_theoretical_price'))
            .otherwise(None)
        ).alias('price_difference_pct')
    ])
    
    return df

# Example usage:
try:
    df_with_price_comparison = compare_actual_vs_theoretical_prices(joined_split_with_rates_df)
    print("Price comparison completed")
except Exception as e:
    print(f"Error in price comparison: {e}")

In [None]:
# cols = ['close', 'strike_price', 'time_to_expiry', 'risk_free_rate', 'hv', 'option_type', 'close_right']
df_with_price_comparison.filter(pl.col("bullish_hammer") == True)

In [None]:
#####APPLY TRADING STRATEGIIES
import os
from datetime import timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants for default signal generation
DEFAULT_STOP_LOSS_PCT = 0.02
DEFAULT_TAKE_PROFIT_PCT = 0.04
EXPIRATION_DAYS = 30
STRIKE_PRICE_FACTOR = 1.02


def process_batch(batch):
    # Function to process each batch of data
    result = generate_all_signals(batch)
    logging.info(f"Processed a batch with {len(batch)} rows.")
    return result

def generate_trade_signals(df: pl.DataFrame, pattern: str, trade_signal: str, 
                           stop_loss_pct: float, take_profit_pct: float) -> pl.DataFrame:
    return df.with_columns([
        pl.when(pl.col(pattern))
        .then(pl.lit(trade_signal))
        .otherwise(None).alias(f"{pattern}_trade_signal"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right"))
        .otherwise(None).alias(f"{pattern}_entry_price"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right") * (1 - stop_loss_pct))
        .otherwise(None).alias(f"{pattern}_stop_loss"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right") * (1 + take_profit_pct))
        .otherwise(None).alias(f"{pattern}_take_profit"),
        pl.when(pl.col(pattern))
        .then(pl.col("expiry_date"))  # Use the extracted expiry date
        .otherwise(None).alias(f"{pattern}_expiration_date"),
        pl.when(pl.col(pattern))
        .then(pl.col("strike_price"))  # Use the extracted strike price
        .otherwise(None).alias(f"{pattern}_strike_price"),
        pl.when(pl.col(pattern))
        .then(pl.col("option_type"))  # Use the extracted option type
        .otherwise(None).alias(f"{pattern}_option_type")
    ])

def generate_all_signals(df: pl.DataFrame) -> pl.DataFrame:
    patterns = [
    {"pattern": "bullish_engulfing", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bearish_engulfing", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "momentum", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "multiple_upper_wicks", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "multiple_lower_wicks", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bullish_hammer", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "shooting_star", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bullish_tweezer", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bearish_tweezer", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bullish_marubozu", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    {"pattern": "bearish_marubozu", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
]
    
    for p in patterns:
        df = generate_trade_signals(df, p["pattern"], p["trade_signal"], p["stop_loss_pct"], p["take_profit_pct"])
    
    return df

def main(df, batch_size=10000, num_workers=None):
    # Split the DataFrame into batches
    batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]
    
    # Determine the number of workers
    if num_workers is None:
        num_workers = os.cpu_count()
    
    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = {executor.submit(process_batch, batch): batch for batch in batches}
        
        results = []
        for future in as_completed(futures):
            logging.info("Started processing a future.")
            results.append(future.result())
            logging.info("Completed processing a future.")
    
    # Combine the results back into a single DataFrame
    combined_df = pl.concat(results)
    logging.info("Completed processing all batches.")
    return combined_df

# Example usage:
# parallel_df = main(joined_split_with_rates_df, num_workers=8)
parallel_df = main(df_with_price_comparison, num_workers=8)



In [None]:
import polars as pl
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from datetime import datetime, timedelta

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def generate_trade_signals(df: pl.DataFrame, pattern: str, trade_signal: str, 
                           stop_loss_pct: float, take_profit_pct: float) -> pl.DataFrame:
    # Ensure the pattern column is Boolean
    df = df.with_columns(pl.col(pattern).cast(pl.Boolean))

    return df.with_columns([
        pl.when(pl.col(pattern))
        .then(pl.lit(trade_signal))
        .otherwise(None).alias(f"{pattern}_trade_signal"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right"))
        .otherwise(None).alias(f"{pattern}_entry_price"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right") * (1 - stop_loss_pct))
        .otherwise(None).alias(f"{pattern}_stop_loss"),
        pl.when(pl.col(pattern))
        .then(pl.col("close_right") * (1 + take_profit_pct))
        .otherwise(None).alias(f"{pattern}_take_profit"),
        pl.when(pl.col(pattern))
        .then(pl.col("expiry_date"))  # Use the extracted expiry date
        .otherwise(None).alias(f"{pattern}_expiration_date"),
        pl.when(pl.col(pattern))
        .then(pl.col("strike_price"))  # Use the extracted strike price
        .otherwise(None).alias(f"{pattern}_strike_price"),
        pl.when(pl.col(pattern))
        .then(pl.col("option_type"))  # Use the extracted option type
        .otherwise(None).alias(f"{pattern}_option_type")
    ])

def generate_all_signals(df: pl.DataFrame) -> pl.DataFrame:
    patterns = [
        {"pattern": "bullish_engulfing", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "bearish_engulfing", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "momentum", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "multiple_upper_wicks", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "multiple_lower_wicks", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "hammer", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "shooting_star", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "bullish_tweezer", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "bearish_tweezer", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "bullish_marubozu", "trade_signal": "buy call option", "option_type": "call", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
        {"pattern": "bearish_marubozu", "trade_signal": "buy put option", "option_type": "put", "stop_loss_pct": DEFAULT_STOP_LOSS_PCT, "take_profit_pct": DEFAULT_TAKE_PROFIT_PCT},
    ]

    for p in patterns:
        df = generate_trade_signals(df, p["pattern"], p["trade_signal"], p["stop_loss_pct"], p["take_profit_pct"])

    return df

def process_batch(batch):
    # Function to process each batch of data
    result = generate_all_signals(batch)
    logging.info(f"Processed a batch with {len(batch)} rows.")
    return result

def main(df, batch_size=10000, num_workers=4):
    # Split DataFrame into batches
    batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]

    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_batch, batch) for batch in batches]
        total_futures = len(futures)
        for i, future in enumerate(as_completed(futures)):
            logging.info(f"Started processing future {i + 1}/{total_futures}.")
            results.append(future.result())
            logging.info(f"Completed processing future {i + 1}/{total_futures}.")

    # Combine the results back into a single DataFrame
    combined_df = pl.concat(results, how='vertical')
    return combined_df

# Example usage:
parallel_df = main(df_with_price_comparison, num_workers=8)


In [None]:
#######NARROW TO SINGLE TRADE(code above executes on all available derivatives for that minute)

import polars as pl

def select_closest_strike_price(df: pl.DataFrame, pattern: str) -> pl.DataFrame:
    # Calculate the absolute difference between the strike price and the asset price
    df = df.with_columns(
        (pl.col(f'{pattern}_strike_price') - pl.col(f'{pattern}_entry_price')).abs().alias('strike_price_diff')
    )
    
    # Group by the pattern and select the trade with the minimum difference
    closest_strike_price_df = (
        df.filter(pl.col(f'{pattern}_trade_signal').is_not_null())
          .sort(['strike_price_diff'])
          .group_by([f'{pattern}_trade_signal', 'window_start_est'], maintain_order=True)
          .first()
    )
    
    # Drop the intermediate column used for calculation
    closest_strike_price_df = closest_strike_price_df.drop('strike_price_diff')
    
    return closest_strike_price_df

def select_closest_strike_prices_for_all_patterns(df: pl.DataFrame) -> pl.DataFrame:
    patterns = [
        # "bullish_engulfing",
        # "bearish_engulfing",
        # "momentum",
        # "multiple_upper_wicks",
        # "multiple_lower_wicks",
        "bullish_hammer",
        # "shooting_star",
        # "bullish_tweezer",
        # "bearish_tweezer",
        # "bullish_marubozu",
        # "bearish_marubozu"
    ]
    
    closest_strike_prices_df_list = []
    for pattern in patterns:
        closest_strike_prices_df = select_closest_strike_price(df, pattern)
        closest_strike_prices_df_list.append(closest_strike_prices_df)
    
    # Concatenate all the dataframes
    combined_df = pl.concat(closest_strike_prices_df_list, how='vertical')
    
    return combined_df

# Example usage
final_df = select_closest_strike_prices_for_all_patterns(parallel_df)
final_df

In [None]:
########EVALUATE TRADE OUTCOMES

import polars as pl
from datetime import datetime, timedelta

def find_trade_outcome_vectorized(df: pl.DataFrame, pattern: str) -> pl.DataFrame:
    outcomes = [None] * len(df)
    exit_dates = [None] * len(df)
    exit_timestamps = [None] * len(df)
    exit_prices = [None] * len(df)
    times_to_exit = [None] * len(df)

    transaction_dates = df['transaction_date'].to_list()
    window_start_ests = df['window_start_est'].to_list()
    option_closes = df['close_right'].to_list()  # Using option close price
    pattern_flags = df[pattern].to_list()
    stop_losses = df[f"{pattern}_stop_loss"].to_list()
    take_profits = df[f"{pattern}_take_profit"].to_list()
    expiration_dates = df[f"{pattern}_expiration_date"].to_list()

    for i in range(len(df)):
        if pattern_flags[i]:
            entry_price = option_closes[i]  # Option entry price
            stop_loss = stop_losses[i]
            take_profit = take_profits[i]
            expiration_date = expiration_dates[i]
            entry_timestamp = window_start_ests[i]
            
            trade_outcome = "undecided"
            exit_date = None
            exit_ts = None
            exit_price = None
            time_to_exit = None

            for j in range(i + 1, len(df)):
                shifted_transaction_date = transaction_dates[j]
                shifted_exit_ts = window_start_ests[j]
                
                if shifted_exit_ts <= entry_timestamp:
                    continue

                if shifted_transaction_date > expiration_date:
                    break
                
                shifted_exit_ts = window_start_ests[j]
                current_price = option_closes[j]  # Current option price
                
                if current_price <= stop_loss:
                    trade_outcome = "loss"
                    exit_date = shifted_transaction_date
                    exit_ts = shifted_exit_ts
                    exit_price = current_price
                    time_to_exit = (exit_ts - entry_timestamp).total_seconds() / 60
                    break
                if current_price >= take_profit:
                    trade_outcome = "win"
                    exit_date = shifted_transaction_date
                    exit_ts = shifted_exit_ts
                    exit_price = current_price
                    time_to_exit = (exit_ts - entry_timestamp).total_seconds() / 60
                    break

            if trade_outcome == "undecided":
                trade_outcome = "expired"
                exit_date = expiration_date
                exit_ts = window_start_ests[j]
                exit_price = option_closes[j]  # Option price at expiration
                time_to_exit = (exit_ts - entry_timestamp).total_seconds() / 60

            outcomes[i] = trade_outcome
            exit_dates[i] = exit_date
            exit_timestamps[i] = exit_ts
            exit_prices[i] = exit_price
            times_to_exit[i] = time_to_exit


    df = df.with_columns([
        pl.Series(f"{pattern}_outcome", outcomes),
        pl.Series(f"{pattern}_exit_date", exit_dates).cast(pl.Date),
        pl.Series(f"{pattern}_exit_ts", exit_timestamps).cast(pl.Datetime),
        pl.Series(f"{pattern}_exit_price", exit_prices).cast(pl.Float64),
        pl.Series(f"{pattern}_time_to_exit", times_to_exit).cast(pl.Float64),
    ])

    df = df.with_columns([
        ((pl.col(f"{pattern}_exit_price") - pl.col(f"{pattern}_entry_price")) * 100).alias(f"{pattern}_trade_profit")
    ])

    return df

def find_all_trade_outcomes(df: pl.DataFrame) -> pl.DataFrame:
    patterns = [
        "bullish_hammer",
        # Add other patterns here as needed
    ]
    
    for pattern in patterns:
        start_time = datetime.now().time()
        print(f"***** {start_time} Starting evaluating {pattern} trade results")
        
        df = find_trade_outcome_vectorized(df, pattern)
        
        end_time = datetime.now().time()
        print(f"***** {end_time} Finished evaluating {pattern} trade results")
    
    return df

# Example usage:
vectorized_outcomes_df = find_all_trade_outcomes(final_df)

In [None]:
vectorized_outcomes_df.select(cols)
vectorized_outcomes_df.select(cols).filter(
   # (pl.col(col) == 'loss') &
    (pl.col('bullish_hammer_option_type') == 'C') &
    (pl.col('time_to_expiry') >= 8) & 
    (pl.col('time_to_expiry') <= 60)
).sort('window_start_est')


In [None]:
col = 'bullish_hammer_outcome'
cols = [ 
    'window_start_est',
    'transaction_date',
    'ticker',
    'close',
    'close_right',
    'time_to_expiry',
    'bullish_hammer_trade_signal',
    'bullish_hammer_outcome',
 'bullish_hammer_entry_price',
    'bullish_hammer_exit_price',
    'bullish_hammer_trade_profit',
 'bullish_hammer_stop_loss',
 'bullish_hammer_take_profit',
    
 'bullish_hammer_time_to_exit',
 'bullish_hammer_strike_price',
'bullish_hammer_exit_ts',
 'bullish_hammer_option_type',
'bullish_hammer_expiration_date',
 'bullish_hammer_exit_date',

 
    #'hammer_time_to_exit'
]
# vectorized_outcomes_df = vectorized_outcomes_df.filter(pl.col('time_to_expiry') > 1)
# vectorized_outcomes_df = vectorized_outcomes_df.filter(
#     (pl.col('time_to_expiry') >= 30) & (pl.col('time_to_expiry') <= 60)
# )
# vectorized_outcomes_df = vectorized_outcomes_df.filter(
#     ((pl.col('strike_price') / pl.col('close')).alias('moneyness')).is_between(0.9, 1.1)
# )

# # vectorized_outcomes_df.select(cols)
vectorized_outcomes_df.select(cols).filter(pl.col(col) == 'win').filter(pl.col('bullish_hammer_option_type') == 'C')
vectorized_outcomes_df.filter(pl.col(col) == 'win').filter(pl.col('bullish_hammer_option_type') == 'C')

vectorized_outcomes_df.select(cols) 

In [None]:
# df.tail(10000)
df.columns

In [None]:
# df.filter(pl.col("bullish_engulfing") == True).filter(pl.col("outcome") == "loss")
df.tail(10000)
col = 'hammer'
df.filter(pl.col(col) == True)
# len(df.filter(pl.col(col) == True))

In [None]:
vectorized_outcomes_df.select(cols).filter(pl.col(col) == 'expired')

In [None]:
outcomes_df.write_parquet("/Users/brandon/Documents/stock_backtest_outcomes.pq")

In [None]:
# Function to calculate win/loss ratio and overall profit
def calculate_strategy_performance(df: pl.DataFrame, strategies: list) -> pl.DataFrame:
    performance_data = []

    for strategy in strategies:
        # Filter data for the strategy
        strategy_df = df.filter(pl.col(f"{strategy}_trade_signal").is_not_null())
        
        # Calculate wins and losses
        wins = strategy_df.filter(pl.col(f"{strategy}_outcome") == "win").shape[0]
        losses = strategy_df.filter(pl.col(f"{strategy}_outcome") == "loss").shape[0]
        total_trades = wins + losses
        
        # Calculate win/loss ratio
        win_loss_ratio = wins / total_trades if total_trades > 0 else 0
        
        # Calculate overall profit
        profit = strategy_df.filter(pl.col(f"{strategy}_outcome") == "win").with_columns(
            (pl.col(f"{strategy}_exit_price") - pl.col(f"{strategy}_entry_price")).alias(f"{strategy}_profit")
        ).select(pl.col(f"{strategy}_profit").sum()).item()
        
        performance_data.append((strategy, total_trades, win_loss_ratio, profit))

    # Create a new DataFrame with performance metrics
    performance_df = pl.DataFrame(performance_data, schema=["Strategy", "Total Trades", "Win/Loss Ratio", "Overall Profit"])
    return performance_df

# List of strategies
strategies = [
    # "bullish_engulfing",
    # "bearish_engulfing",
    # "momentum", 
    # "multiple_upper_wicks",
    # "multiple_lower_wicks",
    "hammer",
    # "shooting_star",
    # "bullish_tweezer", 
    # "bearish_tweezer", 
    # "bullish_marubozu",
    # "bearish_marubozu"
]

# Example usage
performance_df = calculate_strategy_performance(outcomes_df, strategies)
performance_df

In [None]:
col = 'hammer'
# df.filter(pl.col(col) == True)
parallel_df.filter(pl.col(col) == True)

In [None]:
columns = ['hammer',
           'hammer_trade_signal',
         'hammer_entry_price',
         'hammer_stop_loss',
         'hammer_take_profit',
         'hammer_expiration_date',
         'hammer_strike_price',
         'hammer_option_type',
         'hammer_outcome',
         'hammer_exit_date',
         'hammer_exit_ts',
         'hammer_exit_price',
        ]

col = 'hammer'

vectorized_outcomes_df.select(columns).filter(pl.col(col) == True)

vectorized_outcomes_df.filter(pl.col(col) == 'loss').filter(pl.col('hammer_option_type') == 'C')


In [None]:

%history -g 2