In [2]:
import pandas as pd
import wrds
import yfinance as yf
import datetime as dt
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Open the WRDS session used by get_wrds_data(); the library prompts for this
# account's credentials when the statement runs.
conn = wrds.Connection(wrds_username='dreamspartan')

# Date window shared by the WRDS query and the Yahoo Finance download
# (both helper functions read these module-level names).
start_date = '2010-01-01'
end_date = '2020-01-01'

def get_wrds_data(tickers):
    """Fetch daily option quotes and Greeks from WRDS OptionMetrics per ticker.

    OptionMetrics keeps option prices in one table per calendar year
    (``optionm.opprcdYYYY``) keyed by ``secid`` rather than by ticker, so the
    query walks every year covered by the module-level ``start_date`` /
    ``end_date`` and resolves the ticker through ``optionm.securd``.  The
    vendor column names (``best_bid``, ``best_offer``, ``impl_volatility``)
    are aliased to the ``bid`` / ``ask`` / ``implied_volatility`` names used
    by the rest of this notebook.

    NOTE(review): table and column names follow the OptionMetrics (IvyDB US)
    layout as published by WRDS -- confirm with
    ``conn.list_tables(library='optionm')`` and ``conn.describe_table`` for
    your subscription.  The result holds one row per option contract per
    day, which can be very large for a multi-year window.

    Args:
        tickers: Iterable of ticker symbols.

    Returns:
        dict mapping each ticker to a DataFrame with the columns ``date``,
        ``bid``, ``ask``, ``implied_volatility``, ``delta``, ``gamma``,
        ``vega`` and ``theta``.
    """
    first_year = pd.Timestamp(start_date).year
    last_year = pd.Timestamp(end_date).year

    data = {}
    for ticker in tickers:
        yearly_frames = []
        for year in range(first_year, last_year + 1):
            # Only the integer year is interpolated into the SQL text; the
            # ticker and dates are sent as bound parameters.
            query = f"""
            SELECT o.date,
                   o.best_bid AS bid,
                   o.best_offer AS ask,
                   o.impl_volatility AS implied_volatility,
                   o.delta, o.gamma, o.vega, o.theta
            FROM optionm.opprcd{year} AS o
            JOIN optionm.securd AS s ON s.secid = o.secid
            WHERE s.ticker = %(ticker)s
              AND o.date BETWEEN %(start)s AND %(end)s
            """
            yearly_frames.append(
                conn.raw_sql(
                    query,
                    params={'ticker': ticker, 'start': start_date, 'end': end_date},
                )
            )
        # An inverted date window yields no yearly tables at all.
        data[ticker] = (
            pd.concat(yearly_frames, ignore_index=True)
            if yearly_frames
            else pd.DataFrame()
        )
    return data

def get_stock_data(tickers):
    """Download price history for ``tickers`` from Yahoo Finance.

    The date window comes from the module-level ``start_date`` and
    ``end_date``; whatever ``yf.download`` returns is passed back unchanged.
    """
    return yf.download(tickers, start=start_date, end=end_date)

# Example universe of biotech tickers used for both data pulls below.
tickers = ['AMGN', 'BIIB', 'GILD']  # Swap in the actual high-volatility biotech names to study

# Options data from WRDS: a dict mapping each ticker to its query result.
options_data = get_wrds_data(tickers)

# Price data for the same tickers from Yahoo Finance.
stock_data = get_stock_data(tickers)

def preprocess_data(stock_data):
    """Fill gaps in price data and add daily returns and rolling volatility.

    Works on both layouts a price download can have:

    * flat columns (single ticker): ``'Returns'`` and ``'Volatility'`` are
      added as plain columns;
    * two-level columns (several tickers, field on the outer level): one
      ``('Returns', ticker)`` and one ``('Volatility', ticker)`` column is
      added per ticker.

    Args:
        stock_data: Price DataFrame containing an ``'Adj Close'`` field, or a
            ``'Close'`` field when adjusted prices are not provided
            separately.

    Returns:
        A new DataFrame (the input is not modified) with missing values
        forward- then backward-filled and the derived columns appended.
        Volatility is the 20-row rolling standard deviation of returns scaled
        by ``sqrt(252)``.
    """
    # .ffill()/.bfill() replace fillna(method=...), which pandas has deprecated.
    # NOTE(review): the backward fill copies later prices into earlier gaps.
    stock_data = stock_data.ffill().bfill()

    # Downloads that already adjust prices expose only 'Close'.
    price_field = 'Adj Close' if 'Adj Close' in stock_data.columns else 'Close'
    prices = stock_data[price_field]

    returns = prices.pct_change()
    volatility = returns.rolling(window=20).std() * np.sqrt(252)

    if isinstance(stock_data.columns, pd.MultiIndex):
        # `prices` is a per-ticker frame here; assigning it to a single
        # column would raise, so add the derived fields as a new outer level.
        derived = pd.concat({'Returns': returns, 'Volatility': volatility}, axis=1)
        derived.columns.names = stock_data.columns.names
        return pd.concat([stock_data, derived], axis=1)

    stock_data['Returns'] = returns
    stock_data['Volatility'] = volatility
    return stock_data

# Fill gaps in the downloaded prices and attach the derived 'Returns' and
# 'Volatility' columns.
preprocessed_stock_data = preprocess_data(stock_data)

def combine_data(stock_data, options_data):
    """Left-join every ticker's option data onto the stock data by date.

    The bid/ask spread is computed per ticker before merging.  The first
    ticker's option columns keep their plain names (``bid``, ``ask``, ...,
    ``Option_Spread``); columns of later tickers that would collide are
    suffixed with ``_<ticker>`` (e.g. ``bid_BIIB``, ``Option_Spread_BIIB``).
    Merging the frames one after another with default suffixes renamed the
    colliding columns to ``*_x`` / ``*_y``, so looking up ``'ask'`` failed
    from the second ticker on.

    Args:
        stock_data: DataFrame exposing ``'date'`` as a column or index level.
        options_data: dict mapping ticker -> DataFrame with at least the
            columns ``'date'``, ``'bid'`` and ``'ask'``.

    Returns:
        A new DataFrame; neither input is modified.  Because this is a left
        join, a stock row is repeated for every option row sharing its date.
    """
    combined_data = stock_data.copy()
    for ticker, option_df in options_data.items():
        # assign() returns a new frame, leaving the caller's option data intact.
        option_features = option_df.assign(
            Option_Spread=option_df['ask'] - option_df['bid']
        )
        combined_data = pd.merge(
            combined_data,
            option_features,
            on='date',
            how='left',
            suffixes=('', f'_{ticker}'),
        )

    return combined_data

# Combine stock and options data
combined_data = combine_data(preprocessed_stock_data, options_data)

# Split data into training and test sets for modeling
train_data, test_data = train_test_split(combined_data, test_size=0.2, shuffle=False)

# Standardize the features (important for models like Random Forest)
scaler = StandardScaler()
X_train = scaler.fit_transform(train_data.drop(['Returns'], axis=1))
y_train = train_data['Returns']

# Example: Using a Random Forest model for return prediction
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions on test data
X_test = scaler.transform(test_data.drop(['Returns'], axis=1))
y_pred = rf_model.predict(X_test)

# Calculate performance metrics like MSE
mse = np.mean((y_pred - test_data['Returns'])**2)
print(f'Mean Squared Error: {mse}')

def evaluate_performance(predictions, actual_returns):
    """Return the cumulative return of a prediction-weighted strategy.

    Each prediction is used as the position size for the *following* row's
    realised return, so the last row has no return to earn and is NaN.

    Args:
        predictions: Predicted returns, one per row of ``actual_returns``.
        actual_returns: Series of realised returns.

    Returns:
        Series of compounded strategy returns minus one, indexed like
        ``actual_returns``.
    """
    frame = pd.DataFrame({'Predicted': predictions, 'Actual': actual_returns})

    # Pair today's prediction with tomorrow's realised return.
    next_row_return = actual_returns.shift(-1)
    frame['Strategy_Returns'] = frame['Predicted'] * next_row_return

    growth = frame['Strategy_Returns'].add(1).cumprod()
    return growth.sub(1)

# Cumulative return of the prediction-weighted strategy over the test window.
performance = evaluate_performance(y_pred, test_data['Returns'])
print(performance)


Enter your WRDS username [dreamspartan]:dreamspartan
Enter your password:··········


OperationalError: (psycopg2.OperationalError) connection to server at "wrds-pgdata.wharton.upenn.edu" (165.123.60.118), port 9737 failed: SSL connection has been closed unexpectedly

(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [None]:
import pandas as pd
import numpy as np
import wrds
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from arch import arch_model
from statsmodels.tsa.arima.model import ARIMA
from textblob import TextBlob
from scipy.stats import norm
from scipy.optimize import minimize

# Open the WRDS connection that collect_data() queries; the library asks for
# this account's credentials when the statement runs.
conn = wrds.Connection(wrds_username='dreamspartan')

def collect_data(tickers, start_date, end_date):
    """
    Collect historical price data, options data, and fundamental data for the given tickers and date range.

    Prices come from Yahoo Finance; options and quarterly fundamentals come
    from WRDS through the module-level ``conn``.  Dates and tickers are sent
    to the database as bound parameters instead of being formatted into the
    SQL text, so quoting is handled by the driver.

    NOTE(review): the table ``optionm.opprcd`` and the ``symbol`` filter are
    kept as written -- confirm both against the WRDS schema for your
    subscription.

    Args:
        tickers: Iterable of ticker symbols.
        start_date: First date of the window (inclusive), e.g. ``'2020-01-01'``.
        end_date: Last date of the window; inclusive for the SQL queries.

    Returns:
        Tuple ``(price_data, options_data, fundamental_data)`` of DataFrames.
    """
    price_data = yf.download(tickers, start=start_date, end=end_date)

    # A tuple is rendered by the driver as a parenthesised SQL list for IN.
    query_params = {'start': start_date, 'end': end_date, 'tickers': tuple(tickers)}

    options_data = conn.raw_sql(
        "SELECT * FROM optionm.opprcd "
        "WHERE date BETWEEN %(start)s AND %(end)s AND symbol IN %(tickers)s",
        params=query_params,
    )
    fundamental_data = conn.raw_sql(
        "SELECT * FROM comp.fundq "
        "WHERE datadate BETWEEN %(start)s AND %(end)s AND tic IN %(tickers)s",
        params=query_params,
    )

    return price_data, options_data, fundamental_data

def preprocess_data(price_data, options_data, fundamental_data):
    """
    Preprocess the collected data by cleaning, merging, and scaling.

    Args:
        price_data: DataFrame joinable on ``'date'`` and ``'symbol'``.
        options_data: DataFrame joinable on ``'date'`` and ``'symbol'``.
        fundamental_data: DataFrame joinable on ``'datadate'`` and ``'tic'``
            (the merged price/options frame must expose those keys too).

    Returns:
        numpy array with the min-max scaled numeric columns of the merged
        data.  Non-numeric columns, such as the ticker keys used for the
        merges, are left out because they cannot be ranked or scaled.
    """
    # Clean and merge the data
    merged_data = pd.merge(price_data, options_data, on=['date', 'symbol'])
    merged_data = pd.merge(merged_data, fundamental_data, on=['datadate', 'tic'])

    # ffill() replaces the deprecated fillna(method='ffill').
    merged_data = merged_data.ffill()

    # Quantiles and scaling are only defined for numeric columns.
    numeric_data = merged_data.select_dtypes(include='number')

    # Keep values strictly inside the 1st-99th percentile band of their column.
    # NOTE(review): values outside the band become NaN (no rows are removed)
    # and this happens after the forward fill, so the scaled output can still
    # contain NaN -- confirm whether clipping or row removal was intended.
    lower_bound = numeric_data.quantile(0.01)
    upper_bound = numeric_data.quantile(0.99)
    numeric_data = numeric_data[(numeric_data > lower_bound) & (numeric_data < upper_bound)]

    # Scale the data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(numeric_data)

    return scaled_data

def sentiment_analysis(news_data):
    """
    Score each news text with TextBlob and return the polarity values.

    Args:
        news_data: Iterable of news texts.

    Returns:
        List with one TextBlob polarity score per input text, in input order.
    """
    return [TextBlob(text).sentiment.polarity for text in news_data]

def volatility_forecasting(price_data):
    """
    Estimate volatility from closing prices with a GARCH(1,1) and an ARMA(1,1) model.

    Args:
        price_data: DataFrame with a ``'Close'`` column holding one price
            series.

    Returns:
        Tuple ``(garch_volatility, sv_volatility)``:

        * ``garch_volatility`` -- conditional volatility series of the fitted
          GARCH(1,1) model, indexed like the return series.
        * ``sv_volatility`` -- a single number: the unconditional standard
          deviation of returns implied by the fitted ARMA(1,1) model.  This
          is a constant-volatility estimate, not a true stochastic-volatility
          model; the name is kept for callers.
    """
    returns = price_data['Close'].pct_change().dropna()

    # GARCH model
    garch_model = arch_model(returns, vol='GARCH', p=1, q=1)
    garch_results = garch_model.fit()
    garch_volatility = garch_results.conditional_volatility

    # ARMA(1,1) on returns.  The fitted parameters are named 'const' (the
    # process mean), 'ar.L1', 'ma.L1' and 'sigma2' (innovation variance), so
    # they are read by name: the previous positional formula divided the mean
    # by (1 - ar), which is not a variance and can be negative.
    sv_model = ARIMA(returns, order=(1, 0, 1))
    sv_results = sv_model.fit()
    fitted = sv_results.params
    phi = fitted['ar.L1']
    theta = fitted['ma.L1']
    innovation_variance = fitted['sigma2']

    # Unconditional variance of an ARMA(1,1) process:
    #   sigma2 * (1 + 2*phi*theta + theta**2) / (1 - phi**2)
    sv_volatility = np.sqrt(
        innovation_variance * (1 + 2 * phi * theta + theta ** 2) / (1 - phi ** 2)
    )

    return garch_volatility, sv_volatility

def _bs_call_terms(S, K, r, T, sigma):
    """Return the Black-Scholes ``(d1, d2)`` terms; inputs may be scalars or arrays."""
    sqrt_T = np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma ** 2) * T) / (sigma * sqrt_T)
    return d1, d1 - sigma * sqrt_T


def _implied_call_volatility(S, K, r, T, price, low=1e-4, high=5.0):
    """Solve the Black-Scholes call price for sigma; NaN if no root lies in [low, high]."""
    from scipy.optimize import brentq

    # Comparisons with NaN are False, so missing inputs are rejected here too.
    if not (S > 0 and K > 0 and T > 0 and price > 0 and np.isfinite(r)):
        return np.nan

    def pricing_error(sigma):
        d1, d2 = _bs_call_terms(S, K, r, T, sigma)
        return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2) - price

    try:
        return brentq(pricing_error, low, high)
    except (ValueError, RuntimeError):
        # No sign change on the bracket (price outside the attainable range)
        # or the solver failed to converge.
        return np.nan


def options_analysis(options_data):
    """
    Analyze options data to derive insights into market sentiment and potential trading opportunities.

    Every row is treated as a European call, priced with Black-Scholes.
    Implied volatility is found by root-finding on the pricing error over
    the bracket [0.0001, 5.0] (i.e. up to 500% volatility) instead of a
    bounded least-squares search capped at 100%, which silently returned the
    cap for high-volatility names.  Rows whose price admits no solution in
    the bracket get NaN.  Greeks are the analytic Black-Scholes call Greeks
    evaluated at the implied volatility.

    Args:
        options_data: DataFrame with the columns ``'underlying_price'``,
            ``'strike_price'``, ``'risk_free_rate'``, ``'time_to_maturity'``
            and ``'price'``.

    Returns:
        The same DataFrame object, with the columns ``'implied_volatility'``,
        ``'delta'``, ``'gamma'``, ``'vega'`` (per 1.00 change in volatility)
        and ``'theta'`` (per unit of the time-to-maturity measure) added in
        place.
    """
    S = options_data['underlying_price'].to_numpy(dtype=float)
    K = options_data['strike_price'].to_numpy(dtype=float)
    r = options_data['risk_free_rate'].to_numpy(dtype=float)
    T = options_data['time_to_maturity'].to_numpy(dtype=float)
    price = options_data['price'].to_numpy(dtype=float)

    # Calculate implied volatility row by row (root-finding is not vectorized).
    sigma = np.array(
        [_implied_call_volatility(*row) for row in zip(S, K, r, T, price)],
        dtype=float,
    )
    options_data['implied_volatility'] = sigma

    # Calculate greeks; rows without an implied volatility propagate NaN.
    with np.errstate(divide='ignore', invalid='ignore'):
        d1, d2 = _bs_call_terms(S, K, r, T, sigma)
        sqrt_T = np.sqrt(T)
        density = norm.pdf(d1)

        options_data['delta'] = norm.cdf(d1)
        options_data['gamma'] = density / (S * sigma * sqrt_T)
        options_data['vega'] = S * density * sqrt_T
        options_data['theta'] = (
            -S * density * sigma / (2 * sqrt_T)
            - r * K * np.exp(-r * T) * norm.cdf(d2)
        )

    return options_data

def develop_strategies(price_data, options_data, sentiment_data, volatility_data):
    """
    Build three boolean signal columns from price, options, sentiment and volatility inputs.

    * Strategy1: close above its 50-row mean, implied volatility below 0.3
      and sentiment above 0.5.
    * Strategy2: GARCH volatility above its 30-row mean and delta above 0.6.
    * Strategy3: close below its 100-row mean, sentiment below -0.5 and vega
      above 0.4.

    The inputs are combined element-wise, so they are presumably expected to
    share the index of ``price_data`` -- verify against the caller.

    Returns:
        DataFrame indexed like ``price_data`` with the columns
        ``'Strategy1'``, ``'Strategy2'`` and ``'Strategy3'``.
    """
    close = price_data['Close']
    garch_vol = volatility_data['garch_volatility']

    above_50_mean = close > close.rolling(window=50).mean()
    vol_above_30_mean = garch_vol > garch_vol.rolling(window=30).mean()
    below_100_mean = close < close.rolling(window=100).mean()

    signals = pd.DataFrame(
        index=price_data.index,
        columns=['Strategy1', 'Strategy2', 'Strategy3'],
    )
    signals['Strategy1'] = (
        above_50_mean
        & (options_data['implied_volatility'] < 0.3)
        & (sentiment_data > 0.5)
    )
    signals['Strategy2'] = vol_above_30_mean & (options_data['delta'] > 0.6)
    signals['Strategy3'] = (
        below_100_mean
        & (sentiment_data < -0.5)
        & (options_data['vega'] > 0.4)
    )

    return signals

def backtest_strategies(price_data, strategy_signals, initial_capital=10000):
    """
    Backtest the developed strategies and evaluate their performance.

    A strategy is fully invested (position 1) on the row after its signal is
    true and flat (position 0) otherwise; the one-row lag avoids trading on
    the same close that produced the signal.

    Args:
        price_data: DataFrame with a ``'Close'`` column.
        strategy_signals: DataFrame with the boolean columns ``'Strategy1'``,
            ``'Strategy2'`` and ``'Strategy3'``, one row per row of
            ``price_data``.
        initial_capital: Starting equity for each strategy.

    Returns:
        DataFrame indexed like ``price_data`` holding each strategy's equity
        curve.  The first row equals ``initial_capital``: rows without a
        position or a price change earn a zero return instead of NaN.
    """
    strategies = ['Strategy1', 'Strategy2', 'Strategy3']

    # Identical for every strategy, so compute it once.
    daily_returns = price_data['Close'].pct_change()

    backtest_results = pd.DataFrame(index=price_data.index, columns=strategies, dtype=float)

    for strategy in strategies:
        position = pd.Series(
            np.where(strategy_signals[strategy], 1, 0),
            index=price_data.index,
        ).shift(1)
        # The lagged position and the first price change are both undefined
        # on the first row; treat that (and any missing price change) as flat.
        strategy_returns = (position * daily_returns).fillna(0)
        backtest_results[strategy] = initial_capital * (1 + strategy_returns).cumprod()

    return backtest_results

def evaluate_performance(backtest_results):
    """
    Summarise each strategy's equity curve with a Sharpe ratio and a maximum drawdown.

    Args:
        backtest_results: DataFrame of equity curves with the columns
            ``'Strategy1'``, ``'Strategy2'`` and ``'Strategy3'``.

    Returns:
        DataFrame with one column per strategy and the rows
        ``'Sharpe Ratio'`` (mean over standard deviation of row-to-row
        returns, scaled by ``sqrt(252)``) and ``'Max Drawdown'`` (largest
        relative fall from a running peak, as a non-positive number).
    """
    strategy_names = ['Strategy1', 'Strategy2', 'Strategy3']
    performance_metrics = pd.DataFrame(columns=strategy_names)
    annualization = np.sqrt(252)

    for name in strategy_names:
        equity = backtest_results[name]
        period_returns = equity.pct_change()
        running_peak = equity.cummax()

        performance_metrics.loc['Sharpe Ratio', name] = (
            annualization * period_returns.mean() / period_returns.std()
        )
        performance_metrics.loc['Max Drawdown', name] = (equity / running_peak - 1).min()

    return performance_metrics

def main(news_data=None):
    """
    Run the pipeline end to end: collect data, derive signals, backtest and print the results.

    Args:
        news_data: Optional sequence of news texts, one per row of the
            downloaded price data (the sentiment scores are compared
            element-wise with price-indexed series).  When omitted, a neutral
            sentiment of 0.0 is used for every row so the pipeline still runs;
            the sentiment-gated strategies (1 and 3) then never fire.

    Raises:
        ValueError: If ``news_data`` is given but its length differs from the
            number of price rows.
    """
    tickers = ['AMGN', 'GILD', 'REGN']  # Example biotech tickers
    start_date = '2020-01-01'
    end_date = '2021-12-31'

    price_data, options_data, fundamental_data = collect_data(tickers, start_date, end_date)
    # The scaled feature matrix is computed but not consumed by the steps below.
    preprocessed_data = preprocess_data(price_data, options_data, fundamental_data)

    # `news_data` used to be read from an undefined global, and the plain list
    # of scores could not be compared element-wise; wrap it in a Series
    # aligned with the price index.
    if news_data is None:
        sentiment_data = pd.Series(0.0, index=price_data.index)
    else:
        sentiment_scores = sentiment_analysis(news_data)
        if len(sentiment_scores) != len(price_data.index):
            raise ValueError(
                f"news_data has {len(sentiment_scores)} items but the price data "
                f"has {len(price_data.index)} rows"
            )
        sentiment_data = pd.Series(sentiment_scores, index=price_data.index)

    garch_volatility, sv_volatility = volatility_forecasting(price_data)
    volatility_data = pd.DataFrame({'garch_volatility': garch_volatility, 'sv_volatility': sv_volatility}, index=price_data.index)

    options_data = options_analysis(options_data)
    strategy_signals = develop_strategies(price_data, options_data, sentiment_data, volatility_data)
    backtest_results = backtest_strategies(price_data, strategy_signals)
    performance_metrics = evaluate_performance(backtest_results)

    print("Backtest Results:")
    print(backtest_results)
    print("\nPerformance Metrics:")
    print(performance_metrics)

if __name__ == '__main__':
    main()