In [63]:
import yfinance as yf
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.optimize import minimize
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.api import VAR
from datetime import datetime, timedelta
import logging
import pickle
import databento as db
import os
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)

In [65]:
# DATA_FOLDER = "./data/stock/"
# # NOTE: Decrease num_months_traing low if running too slow
# num_months_train = 11
# train_files = sorted_files[:num_months_train]   # 11 months as training set
# test_files = sorted_files[-1:]    # choose last month as the test set

# df = pd.DataFrame()

# for file_name in tqdm(train_files, desc="Loading Training files"):
#     path = DATA_FOLDER + file_name
#     stored_data = db.DBNStore.from_file(path)
#     month_options = stored_data.to_df()
#     df = pd.concat([df, month_options])

# train_prices = df.pivot_table(index='ts_event', columns='symbol', values='close', aggfunc='first')
# train_prices

In [66]:
test_prices= pd.read_csv('/Users/kavitakar/Downloads/mse244-main/data/stock/1m_fullyear_equities.csv')


print(test_prices["AAPL"], test_prices["MSFT"])

0        169.220
1        169.220
2        169.220
3        169.220
4        169.220
          ...   
98428    174.020
98429    174.135
98430    174.130
98431    174.150
98432    174.115
Name: AAPL, Length: 98433, dtype: float64 0           NaN
1           NaN
2           NaN
3           NaN
4        307.31
          ...  
98428    397.48
98429    397.44
98430    397.37
98431    397.37
98432    397.18
Name: MSFT, Length: 98433, dtype: float64


In [67]:
# df_test = pd.DataFrame()

# for file_name in tqdm(test_files, desc="Loading Testing files"):
#     path = DATA_FOLDER + file_name
#     stored_data = db.DBNStore.from_file(path)
#     month_options = stored_data.to_df()
#     df_test = pd.concat([df_test, month_options])

# test_prices = df_test.pivot_table(index='ts_event', columns='symbol', values='close', aggfunc='first')
# test_prices

In [76]:
import pandas as pd
import numpy as np
from tqdm import tqdm  # For displaying progress bars
import logging  # For logging information
from statsmodels.tsa.api import VAR  # Vector Autoregression model
from statsmodels.tsa.stattools import coint  # Cointegration test
from scipy.stats import norm  # For statistical functions

class StraddleSelector:
    def __init__(self, equities, lag_period, window_size=30):
        """
        Initialize the StraddleSelector class with the list of equities, lag period, and window size for rolling calculations.

        Parameters:
        equities (list): List of equity tickers to analyze.
        lag_period (int): The window size for rolling calculations.
        window_size (int): The window size for rolling correlation (default is 30 minutes).
        """
        self.equities = equities
        self.lag_period = lag_period
        self.window_size = window_size
        self.var_models = {}  # Dictionary to store VAR models for each pair of equities
        self.results = []
        self.p_values_df = pd.DataFrame(columns=['Ticker1', 'Ticker2', 'P_Value', 'Correlation_Significant', 'Params', "Best_Lag"])
        self.initial_calculation_done = False  # Flag to indicate if initial calculations are done
        self.trade_signals = []
        self.all_trades_df = pd.DataFrame()

    def fetch_historical_data(self, ticker):
        """
        Fetch historical data for a given ticker.

        Parameters:
        ticker (str): The equity ticker.

        Returns:
        pandas.Series: Historical adjusted close prices for the ticker.
        """
        return test_prices[ticker]

    def calculate_returns(self, historical_data):
        """
        Calculate logarithmic returns from historical price data.

        Parameters:
        historical_data (pandas.Series): Historical adjusted close prices.

        Returns:
        pandas.Series: Logarithmic returns.
        """
        return np.log(historical_data / historical_data.shift(1)).dropna()

    def calculate_rolling_correlation(self, returns1, returns2):
        """
        Calculate rolling correlation between two sets of returns.

        Parameters:
        returns1 (pandas.Series): Returns of the first equity.
        returns2 (pandas.Series): Returns of the second equity.

        Returns:
        pandas.Series: Rolling correlation.
        """
        return returns1.rolling(window=self.window_size).corr(returns2)

    def fit_var_model(self, returns1, returns2, ticker1, ticker2):
        """
        Fit a Vector Autoregression (VAR) model to the returns of two equities.

        Parameters:
        returns1 (pandas.Series): Returns of the first equity.
        returns2 (pandas.Series): Returns of the second equity.
        ticker1 (str): Ticker of the first equity.
        ticker2 (str): Ticker of the second equity.

        Returns:
        VARResults: Fitted VAR model.
        """
        model_data = pd.concat([returns1, returns2], axis=1).dropna()
        model_data.columns = [ticker1, ticker2]
        var_model = VAR(model_data)
        return var_model.fit(maxlags=15, ic='aic')

    def initial_calculations(self):
        """
        Perform initial calculations including fetching data, calculating returns, and fitting VAR models.
        """
        for i, ticker1 in enumerate(tqdm(self.equities)):
            for ticker2 in self.equities[i + 1:]:
                logging.info(f"Initial analysis for pair: {ticker1}, {ticker2}")

                # Fetch historical data for the pair of equities
                hist1 = self.fetch_historical_data(ticker1)
                hist2 = self.fetch_historical_data(ticker2)

                # Skip pairs with no data
                if hist1.empty or hist2.empty:
                    logging.warning(f"No data for one or both tickers: {ticker1}, {ticker2}")
                    continue

                merged_hist = pd.concat([hist1, hist2], axis=1).dropna()
                if merged_hist.empty:
                    logging.warning(f"No data for pair: {ticker1}, {ticker2}")
                    continue

                # Calculate returns for the pair
                returns1 = self.calculate_returns(hist1)
                returns2 = self.calculate_returns(hist2)

                # Calculate rolling correlation and test its significance
                rolling_corr = self.calculate_rolling_correlation(returns1, returns2)
                significant = test_significance(rolling_corr.dropna())

                if not significant:
                    continue

                # Fit VAR model and store results
                var_result = self.fit_var_model(returns1, returns2, ticker1, ticker2)
                self.store_p_values_and_models(ticker1, ticker2, var_result)
                print(f"Stored p values and models for {ticker1}, {ticker2}")

        self.initial_calculation_done = True

    def store_p_values_and_models(self, ticker1, ticker2, var_result):
        """
        Store p-values and VAR model parameters for a pair of equities.

        Parameters:
        ticker1 (str): Ticker of the first equity.
        ticker2 (str): Ticker of the second equity.
        var_result (VARResults): Fitted VAR model.
        """
        p_value = coint(var_result.endog[:, 0], var_result.endog[:, 1])[1]
        correlation_significant = p_value < 0.05
        best_lag = var_result.k_ar
        self.var_models[(ticker1, ticker2)] = var_result
        new_row = pd.DataFrame({
            'Ticker1': [ticker1],
            'Ticker2': [ticker2],
            'P_Value': [p_value],
            'Correlation_Significant': [correlation_significant],
            'Params': [var_result.params],
            'Best_Lag': [best_lag]
        })
        self.p_values_df = pd.concat([self.p_values_df, new_row], ignore_index=True)

    def forecast_volatility_change(self, var_model, test_data, ticker1, ticker2):
        """
        Forecast changes in volatility using the VAR model.

        Parameters:
        var_model (VARResults): Fitted VAR model.
        test_data (pandas.DataFrame): Test data for forecasting.
        ticker1 (str): Ticker of the first equity.
        ticker2 (str): Ticker of the second equity.

        Returns:
        pandas.DataFrame: Predicted volatility changes.
        """
        lag_order = var_model.k_ar
        if len(test_data) == 0 or lag_order == 0:
            logging.warning(f"Not enough data points to perform forecasting for {ticker1}, {ticker2}")
            return None

        predicted_data = pd.DataFrame(columns=[ticker1, ticker2], index=test_data.index[lag_order:])
        returns1 = self.calculate_returns(test_data[ticker1])
        returns2 = self.calculate_returns(test_data[ticker2])
        returns_concat = pd.concat([returns1, returns2], axis=1).dropna()

        for i in range(0, len(returns_concat) - lag_order):
            window = returns_concat.iloc[i:i + lag_order].values
            forecast_results = var_model.forecast(y=window, steps=1)
            predicted_data.loc[returns_concat.index[i + lag_order], ticker1] = forecast_results[0][0]
            predicted_data.loc[returns_concat.index[i + lag_order], ticker2] = forecast_results[0][1]

        final_predicted_data = pd.merge(returns_concat, predicted_data, left_index=True, right_index=True, suffixes=('_actual', '_predicted'))
        window_size = 20

        for ticker in [ticker1, ticker2]:
            final_predicted_data[f'vol_{ticker}'] = final_predicted_data[f'{ticker}_predicted'].rolling(window=window_size).std()
            final_predicted_data[f'shifted_vol_{ticker}'] = final_predicted_data[f'vol_{ticker}'].shift(1)
            final_predicted_data[f'vol_change_{ticker}'] = final_predicted_data[f'vol_{ticker}'] - final_predicted_data[f'shifted_vol_{ticker}']
            final_predicted_data[f'vol_std_{ticker}'] = final_predicted_data[f'vol_{ticker}'].rolling(window=window_size).std()
            final_predicted_data.drop(columns=[f'shifted_vol_{ticker}'], inplace=True)

        return final_predicted_data

    def output_trades(self, final_predicted_data, ticker1, ticker2):
        """
        Generate trade signals based on predicted data.

        Parameters:
        final_predicted_data (pandas.DataFrame): Predicted volatility changes.
        ticker1 (str): Ticker of the first equity.
        ticker2 (str): Ticker of the second equity.

        Returns:
        pandas.DataFrame: DataFrame containing trade signals.
        """
        if final_predicted_data is None:
            return None

        for ticker in [ticker1, ticker2]:
            percentile_25 = final_predicted_data[f'{ticker}_predicted'].quantile(0.25)
            percentile_75 = final_predicted_data[f'{ticker}_predicted'].quantile(0.75)
            final_predicted_data[f'{ticker}_trade'] = np.where(
                final_predicted_data[f'{ticker}_predicted'] > percentile_75, 'C',
                np.where(final_predicted_data[f'{ticker}_predicted'] < percentile_25, 'P', np.nan)
            )

        trades = final_predicted_data[[f'{ticker1}_trade', f'{ticker2}_trade']].dropna().reset_index()
        trades.columns = ['date', 'trade_ticker1', 'trade_ticker2']
        trades['ticker1'] = ticker1
        trades['ticker2'] = ticker2
        self.trade_signals.append(trades)

        return trades

    def update_decisions(self, test_prices):
        """
        Update trading decisions based on new test prices.

        Parameters:
        test_prices (pandas.DataFrame): New test prices for equities.

        Returns:
        pandas.DataFrame: Updated DataFrame with all trades.
        """
        if not self.initial_calculation_done:
            logging.info("Initial calculations not done yet.")
            return None

        for (ticker1, ticker2), var_result in self.var_models.items():
            logging.info(f"Updating decisions for pair: {ticker1}, {ticker2}")
            test_data = test_prices[[ticker1, ticker2]].dropna()
            final_predicted_data = self.forecast_volatility_change(var_result, test_data, ticker1, ticker2)
            trades_df = self.output_trades(final_predicted_data, ticker1, ticker2)
            if trades_df is not None:
                self.all_trades_df = pd.concat([self.all_trades_df, trades_df])

        return self.all_trades_df

    def run_iterations(self, num_iterations, test_prices):
        """
        Run multiple iterations to update trading decisions.

        Parameters:
        num_iterations (int): Number of iterations to run.
        test_prices (pandas.DataFrame): Test prices for equities.
        """
        for i in range(num_iterations):
            logging.info(f"Running iteration {i + 1}/{num_iterations}")
            self.update_decisions(test_prices)
            logging.info(f"Iteration {i + 1} completed")

def rolling_correlation(data1, data2, window=30):
    """
    Calculate rolling correlation between two data series.

    Parameters:
    data1 (pandas.Series): First data series.
    data2 (pandas.Series): Second data series.
    window (int): Rolling window size.

    Returns:
    pandas.Series: Rolling correlation.
    """
    return data1.rolling(window).corr(data2)

def test_significance(correlation_series, alpha=0.05):
    """
    Test the significance of the rolling correlation series.

    Parameters:
    correlation_series (pandas.Series): Rolling correlation series.
    alpha (float): Significance level.

    Returns:
    bool: True if correlation is significant, False otherwise.
    """
    mean_corr = correlation_series.mean()
    std_corr = correlation_series.std()
    t_stat = mean_corr / (std_corr / np.sqrt(len(correlation_series)))
    p_value = 2 * (1 - norm.cdf(np.abs(t_stat)))  # Two-tailed test
    return p_value < alpha

def bayesian_inference(S, returns, window_size, n=10000):
    """
    Perform Bayesian inference to predict future prices.

    Parameters:
    S (float): Current stock price.
    returns (pandas.Series): Historical returns.
    window_size (int): Rolling window size for correlation.
    n (int): Number of samples.

    Returns:
    tuple: Mean and standard deviation of predicted future prices.
    """
    rolling_corr = returns.rolling(window=window_size).corr().dropna()
    vol_change = rolling_corr.mean()
    std_change = rolling_corr.std()
    predicted_changes = np.random.normal(loc=vol_change, scale=std_change, size=n)
    future_prices = S * (1 + predicted_changes)
    return future_prices.mean(), future_prices.std()

def binomial_tree_american(S, K, T, r, sigma, option_type='call', steps=100):
    """
    Calculate the price of an American option using the binomial tree model.

    Parameters:
    S (float): Current stock price.
    K (float): Strike price.
    T (float): Time to maturity (in years).
    r (float): Risk-free interest rate.
    sigma (float): Volatility of the stock.
    option_type (str): Type of the option ('call' or 'put').
    steps (int): Number of steps in the binomial tree.

    Returns:
    float: Option price.
    """
    dt = T / steps
    u = np.exp(sigma * np.sqrt(dt))
    d = 1 / u
    p = (np.exp(r * dt) - d) / (u - d)
    
    option_values = np.zeros((steps + 1, steps + 1))
    
    for i in range(steps + 1):
        if option_type == 'call':
            option_values[i, steps] = max(0, S * (u ** (steps - i)) * (d ** i) - K)
        elif option_type == 'put':
            option_values[i, steps] = max(0, K - S * (u ** (steps - i)) * (d ** i))
    
    for j in range(steps - 1, -1, -1):
        for i in range(j + 1):
            option_value_if_held = np.exp(-r * dt) * (p * option_values[i, j + 1] + (1 - p) * option_values[i + 1, j + 1])
            if option_type == 'call':
                option_value_if_exercised = S * (u ** i) * (d ** (j - i)) - K
            elif option_type == 'put':
                option_value_if_exercised = K - S * (u ** i) * (d ** (j - i))
            
            option_values[i, j] = max(option_value_if_held.item(), option_value_if_exercised.item())

    return option_values[0, 0]

def straddle_profit(S, K, T, r, sigma, returns, window_size, steps=100):
    """
    Calculate the profit of a straddle option strategy.

    Parameters:
    S (float): Current stock price.
    K (float): Strike price of the options.
    T (float): Time to maturity (in years).
    r (float): Risk-free interest rate.
    sigma (float): Volatility of the stock.
    returns (pandas.Series): Historical returns.
    window_size (int): Rolling window size for correlation.
    steps (int): Number of steps in the binomial tree (default is 100).

    Returns:
    dict: A dictionary containing potential profits for upward, downward, and no price movement scenarios.
    """
    # Ensure inputs are valid
    if S <= 0 or K <= 0 or T <= 0 or sigma <= 0 or steps <= 0:
        raise ValueError("All input parameters must be positive.")

    # Incorporate Bayesian inference to adjust the initial stock price
    S = bayesian_inference(S, returns, window_size, n=10000)[0]

    # Calculate call and put option prices using the binomial tree model
    call_price = binomial_tree_american(S, K, T, r, sigma, 'call', steps)
    put_price = binomial_tree_american(S, K, T, r, sigma, 'put', steps)
    total_cost = call_price + put_price

    # Expected stock prices based on volatility
    S_up = S * np.exp(sigma * np.sqrt(T))
    S_down = S * np.exp(-sigma * np.sqrt(T))

    # Calculate profits for different scenarios
    profit_up = max(S_up - K, 0) + max(K - S_up, 0) - total_cost
    profit_down = max(K - S_down, 0) + max(S_down - K, 0) - total_cost
    profit_no_move = -total_cost

    return {
        'profit_up': profit_up,
        'profit_down': profit_down,
        'profit_no_move': profit_no_move,
        'total_cost': total_cost
    }

def objective(K, S, T, r, sigma, returns, window_size, steps=100):
    """
    Objective function to optimize the straddle profit.

    Parameters:
    K (float): Strike price.
    S (float): Current stock price.
    T (float): Time to maturity.
    r (float): Risk-free interest rate.
    sigma (float): Volatility.
    returns (pandas.Series): Historical returns.
    window_size (int): Rolling window size for correlation.
    steps (int): Number of steps in the binomial tree.

    Returns:
    float: Negative of the sum of potential profits.
    """
    profit = straddle_profit(S, K, T, r, sigma, returns, window_size, steps)
    return - (profit['profit_up'] + profit['profit_down'] + profit['profit_no_move'])

def sharpe_ratio(profits, risk_free_rate):
    """
    Calculate the Sharpe ratio for a series of profits.

    Parameters:
    profits (list or numpy.ndarray): Series of profits.
    risk_free_rate (float): Risk-free interest rate.

    Returns:
    float: Sharpe ratio.
    """
    expected_return = np.mean(profits)
    std_dev = np.std(profits)
    return (expected_return - risk_free_rate) / std_dev




In [77]:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    equities = ['AAPL', 'MSFT']
    lag_period = 30
    straddle_selector = StraddleSelector(equities, lag_period)

    straddle_selector.initial_calculations()

    test_prices = pd.DataFrame()
    for ticker in equities:
        test_prices[ticker] = yf.Ticker(ticker).history(period="1y")['Close']

    all_trades_df = straddle_selector.update_decisions(test_prices)
    print(all_trades_df)

    num_iterations = 20
    straddle_selector.run_iterations(num_iterations, test_prices)
    print(straddle_selector.all_trades_df)

  0%|          | 0/2 [00:00<?, ?it/s]INFO:root:Initial analysis for pair: AAPL, MSFT
  self._init_dates(dates, freq)
  self.p_values_df = pd.concat([self.p_values_df, new_row], ignore_index=True)
100%|██████████| 2/2 [00:00<00:00, 75.79it/s]


Stored p values and models for AAPL, MSFT


INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Running iteration 1/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 1 completed
INFO:root:Running iteration 2/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 2 completed
INFO:root:Running iteration 3/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 3 completed
INFO:root:Running iteration 4/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 4 completed
INFO:root:Running iteration 5/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 5 completed
INFO:root:Running iteration 6/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 6 completed
INFO:root:Running iteration 7/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 7 completed
INFO:root:Running iteration 8/20
INFO:root:Updating decisions for pair: AAPL, MSFT


                         date trade_ticker1 trade_ticker2 ticker1 ticker2
0   2023-06-09 00:00:00-04:00           nan           nan    AAPL    MSFT
1   2023-06-12 00:00:00-04:00           nan           nan    AAPL    MSFT
2   2023-06-13 00:00:00-04:00             C             P    AAPL    MSFT
3   2023-06-14 00:00:00-04:00             P             P    AAPL    MSFT
4   2023-06-15 00:00:00-04:00           nan           nan    AAPL    MSFT
..                        ...           ...           ...     ...     ...
246 2024-06-03 00:00:00-04:00           nan           nan    AAPL    MSFT
247 2024-06-04 00:00:00-04:00             C             C    AAPL    MSFT
248 2024-06-05 00:00:00-04:00           nan           nan    AAPL    MSFT
249 2024-06-06 00:00:00-04:00           nan             P    AAPL    MSFT
250 2024-06-07 00:00:00-04:00             P           nan    AAPL    MSFT

[251 rows x 5 columns]


INFO:root:Iteration 8 completed
INFO:root:Running iteration 9/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 9 completed
INFO:root:Running iteration 10/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 10 completed
INFO:root:Running iteration 11/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 11 completed
INFO:root:Running iteration 12/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 12 completed
INFO:root:Running iteration 13/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 13 completed
INFO:root:Running iteration 14/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 14 completed
INFO:root:Running iteration 15/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 15 completed
INFO:root:Running iteration 16/20
INFO:root:Updating decisions for pair: AAPL, MSFT
INFO:root:Iteration 16 completed
INFO:root:Running iteration 17/20


                         date trade_ticker1 trade_ticker2 ticker1 ticker2
0   2023-06-09 00:00:00-04:00           nan           nan    AAPL    MSFT
1   2023-06-12 00:00:00-04:00           nan           nan    AAPL    MSFT
2   2023-06-13 00:00:00-04:00             C             P    AAPL    MSFT
3   2023-06-14 00:00:00-04:00             P             P    AAPL    MSFT
4   2023-06-15 00:00:00-04:00           nan           nan    AAPL    MSFT
..                        ...           ...           ...     ...     ...
246 2024-06-03 00:00:00-04:00           nan           nan    AAPL    MSFT
247 2024-06-04 00:00:00-04:00             C             C    AAPL    MSFT
248 2024-06-05 00:00:00-04:00           nan           nan    AAPL    MSFT
249 2024-06-06 00:00:00-04:00           nan             P    AAPL    MSFT
250 2024-06-07 00:00:00-04:00             P           nan    AAPL    MSFT

[5271 rows x 5 columns]


In [71]:
straddle_selector.all_trades_df.to_csv('straddle_trades.csv')