In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import warnings
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Suppress specific warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)


In [15]:
def monte_carlo_split(data_length, n_splits=30, test_size=5):
    """Monte Carlo Time Series Validation."""
    indices = np.arange(data_length)
    splits = []
    for _ in range(n_splits):
        if len(indices) <= test_size:
            return [(indices, indices)]
        start = np.random.randint(0, len(indices)-test_size)
        splits.append((indices[:start], indices[start:start+test_size]))
    return splits

def calculate_position_size(probability):
    """Calculate position size based on prediction confidence."""
    risk = 0.02  # 2% per trade
    confidence = abs(probability - 0.5) * 2
    return risk * confidence


In [16]:
logger.info("Loading and preparing data...")
# Read the data
nifty_df = pd.read_csv('nifty_1min_data.csv')
vix_df = pd.read_csv('vix_1min_data.csv')

# Data Preparation & Cleaning
for df in [nifty_df, vix_df]:
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    df.set_index('Datetime', inplace=True)
    df.sort_index(inplace=True)
    
# Filter market hours only (9:15 AM to 3:30 PM IST)
nifty_df = nifty_df.between_time('09:15', '15:30')
vix_df = vix_df.between_time('09:15', '15:30')

# Get unique dates using pandas datetime index
unique_dates = pd.Series(nifty_df.index.date).nunique()
logger.info(f"Total trading days in data: {unique_dates}")

if unique_dates < 100:
    logger.warning("WARNING: Dataset contains less than 100 trading days. Results may be unreliable.")

2025-02-23 15:49:49 - INFO - Loading and preparing data...
2025-02-23 15:49:49 - INFO - Total trading days in data: 18


In [17]:
def get_first_5min_features(df):
    """Extract features from first 5 minutes and previous day's last 30 minutes."""
    def process_group(group):
        if len(group) >= 5:
            first_5_min = group.iloc[:5]
            
            # Get previous day's data
            prev_day = group.index[0].date() - pd.Timedelta(days=1)
            prev_day_data = df[df.index.date == prev_day].between_time('15:00', '15:30')
            
            features = {
                # Current day first 5 min features
                'vix_open': first_5_min['Open'].iloc[0],
                'vix_high': first_5_min['High'].max(),
                'vix_low': first_5_min['Low'].min(),
                'vix_close': first_5_min['Close'].iloc[-1],
                'vix_volatility': first_5_min['Close'].std(),
                'vix_pct_change': (first_5_min['Close'].iloc[-1] / first_5_min['Close'].iloc[0] - 1) * 100,
                'vix_range': (first_5_min['High'].max() - first_5_min['Low'].min()) / first_5_min['Open'].iloc[0] * 100,
                'vix_oc_ratio': (first_5_min['Close'].iloc[-1] - first_5_min['Open'].iloc[0]) / first_5_min['Open'].iloc[0]
            }
            
            # Add previous day's last 30 min features if available
            if len(prev_day_data) >= 25:  # At least 25 minutes of data
                features.update({
                    'prev_day_close': prev_day_data['Close'].iloc[-1],
                    'prev_day_vwap': (prev_day_data['Close'] * prev_day_data['Volume']).sum() / prev_day_data['Volume'].sum(),
                    'prev_day_volatility': prev_day_data['Close'].std(),
                    'prev_day_trend': (prev_day_data['Close'].iloc[-1] / prev_day_data['Close'].iloc[0] - 1) * 100,
                    'prev_day_high': prev_day_data['High'].max(),
                    'prev_day_low': prev_day_data['Low'].min(),
                    'prev_day_range': (prev_day_data['High'].max() - prev_day_data['Low'].min()) / prev_day_data['Open'].iloc[0] * 100,
                    'gap_pct': (first_5_min['Open'].iloc[0] / prev_day_data['Close'].iloc[-1] - 1) * 100
                })
            else:
                # If previous day data not available, use neutral values
                features.update({
                    'prev_day_close': first_5_min['Open'].iloc[0],  # Use current day open
                    'prev_day_vwap': first_5_min['Open'].iloc[0],
                    'prev_day_volatility': 0,
                    'prev_day_trend': 0,
                    'prev_day_high': first_5_min['Open'].iloc[0],
                    'prev_day_low': first_5_min['Open'].iloc[0],
                    'prev_day_range': 0,
                    'gap_pct': 0
                })
            
            return pd.Series(features)
        return None
    
    return df.groupby(pd.Grouper(freq='D')).apply(process_group).dropna()

In [18]:
def get_nifty_target(df):
    """Calculate 3-hour return for NIFTY."""
    def calculate_return(group):
        if len(group) >= 180:  # 3 hours = 180 minutes
            start_price = group['Open'].iloc[0]
            end_price = group['Close'].iloc[179]  # 180th minute
            return (end_price / start_price - 1) * 100
        return None
    
    return df.groupby(pd.Grouper(freq='D')).apply(calculate_return).dropna().to_frame('nifty_3hr_pct_change')


In [19]:
logger.info("Generating features...")
# Generate VIX features
vix_features = get_first_5min_features(vix_df)
logger.info(f"Generated features for {len(vix_features)} trading days")

# Generate NIFTY target
nifty_target = get_nifty_target(nifty_df)
logger.info(f"Generated targets for {len(nifty_target)} trading days")

# Ensure data alignment
common_dates = vix_features.index.intersection(nifty_target.index)
vix_features = vix_features.loc[common_dates]
nifty_target = nifty_target.loc[common_dates]

# Merge features and target
merged_df = pd.concat([vix_features, nifty_target], axis=1).dropna()
logger.info(f"Final dataset size: {len(merged_df)} trading days")

if len(merged_df) < 10:
    raise ValueError("Insufficient data: Need at least 10 trading days")

# Create binary trend target (1 for positive returns, 0 for negative)
merged_df['trend'] = (merged_df['nifty_3hr_pct_change'] > 0).astype(int)
class_balance = merged_df['trend'].mean() * 100
logger.info(f"Positive trend days: {merged_df['trend'].sum()} ({class_balance:.1f}%)")

if not (40 <= class_balance <= 60):
    logger.warning(f"WARNING: Significant class imbalance detected ({class_balance:.1f}% positive)")

logger.info("Performing feature engineering...")
# Additional Feature Engineering
merged_df['vix_nifty_correlation'] = merged_df['vix_pct_change'].rolling(3, min_periods=1).corr(merged_df['nifty_3hr_pct_change'])
merged_df['vix_acceleration'] = merged_df['vix_pct_change'].diff()
merged_df['vol_ratio'] = merged_df['vix_volatility'] / merged_df['vix_volatility'].rolling(2, min_periods=1).mean()
merged_df['day_of_week'] = merged_df.index.dayofweek

# Add moving averages with smaller windows
merged_df['vix_ma3'] = merged_df['vix_close'].rolling(3, min_periods=1).mean()
merged_df['vix_ma5'] = merged_df['vix_close'].rolling(5, min_periods=1).mean()

# Fill NaN values with appropriate methods
for col in merged_df.columns:
    if col not in ['trend', 'nifty_3hr_pct_change']:
        if 'ratio' in col or 'correlation' in col:
            merged_df[col] = merged_df[col].fillna(0)
        else:
            merged_df[col] = merged_df[col].fillna(method='ffill').fillna(method='bfill')

logger.info(f"Dataset size after feature engineering: {len(merged_df)} trading days")

logger.info("Analyzing feature correlations...")
# Visual Analysis
plt.figure(figsize=(12, 8))
corr_matrix = merged_df.corr()
sns.heatmap(corr_matrix[['nifty_3hr_pct_change']], annot=True, fmt='.2f')
plt.title('Feature Correlation with Target')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')
plt.close()

# Display correlation results
logger.info("\nFeature Correlations with NIFTY 3-hour Returns:")
correlations = corr_matrix['nifty_3hr_pct_change'].sort_values(ascending=False)
for feature, corr in correlations.items():
    if feature not in ['nifty_3hr_pct_change', 'trend']:
        logger.info(f"{feature:20} : {corr:+.3f}")

logger.info("\nGenerating pair plots...")
# Select most important features for pair plot
important_features = ['vix_pct_change', 'vix_volatility', 'vix_range', 'nifty_3hr_pct_change']
sns.pairplot(merged_df[important_features], diag_kind='kde')
plt.savefig('pair_plots.png')
plt.close()

logger.info("Building and evaluating model...")
# Model Building - Explicitly exclude target variables from features
features = [col for col in merged_df.columns if col not in ['nifty_3hr_pct_change', 'trend']]
X = merged_df[features].values  # Convert to numpy array
y = merged_df['trend'].values   # Convert to numpy array

# Handle class imbalance with SMOTE
if len(X) >= 10:
    logger.info("Applying SMOTE to handle class imbalance...")
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    logger.warning("Dataset too small for SMOTE. Using original data.")
    X_resampled, y_resampled = X, y

# Use Monte Carlo cross-validation for small datasets
n_splits = max(1, len(merged_df) // 5)  # 5 samples per test set
splits = monte_carlo_split(len(X_resampled), n_splits=min(30, n_splits), test_size=5)

model = GradientBoostingClassifier(
    loss='log_loss',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=0.8,
    random_state=42
)

# Model Evaluation
logger.info("\nModel Performance Across Monte Carlo Splits:")
logger.info("-" * 50)
fold_accuracies = []
fold_probas = []

for fold, (train_index, test_index) in enumerate(splits, 1):
    X_train, X_test = X_resampled[train_index], X_resampled[test_index]
    y_train, y_test = y_resampled[train_index], y_resampled[test_index]
    
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    probas = model.predict_proba(X_test)[:, 1]
    
    fold_accuracies.append(accuracy)
    fold_probas.extend(probas)
    
    logger.info(f"\nFold {fold}:")
    logger.info(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
    logger.info(f"Accuracy: {accuracy:.2f}")
    
    report = classification_report(y_test, model.predict(X_test), zero_division=0)
    logger.info("\nClassification Report:")
    logger.info(report)

logger.info(f"\nAverage Model Accuracy: {np.mean(fold_accuracies):.2f}")
logger.info(f"Std Dev of Accuracy: {np.std(fold_accuracies):.2f}")
logger.info(f"Average Prediction Confidence: {np.mean(np.abs(np.array(fold_probas) - 0.5)) * 2:.2f}")

logger.info("\nAnalyzing feature importance...")
# Feature Importance Analysis
plt.figure(figsize=(12, 8))
importance = pd.Series(model.feature_importances_, index=features)
importance_sorted = importance.sort_values(ascending=True)
importance_sorted.plot(kind='barh')
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Display feature importance
logger.info("\nTop 10 Most Important Features:")
for feature, importance in importance_sorted.nlargest(10).items():
    logger.info(f"{feature:20} : {importance:.3f}")

2025-02-23 15:49:49 - INFO - Generating features...
  'prev_day_vwap': (prev_day_data['Close'] * prev_day_data['Volume']).sum() / prev_day_data['Volume'].sum(),
2025-02-23 15:49:49 - INFO - Generated features for 4 trading days
2025-02-23 15:49:49 - INFO - Generated targets for 18 trading days
2025-02-23 15:49:49 - INFO - Final dataset size: 4 trading days


ValueError: Insufficient data: Need at least 10 trading days

In [20]:
def create_features(vix_data, prev_day_data=None):
    """
    Create features from live VIX data for prediction.
    
    Args:
        vix_data (pd.DataFrame): Live VIX data with OHLCV prices for first 5 minutes
        prev_day_data (pd.DataFrame): Previous day's last 30 minutes of data (optional)
        
    Returns:
        list: Feature vector for prediction
    """
    if len(vix_data) < 5:
        raise ValueError("Need at least 5 minutes of VIX data")
        
    first_5min = vix_data.iloc[:5]
    features = {
        'vix_open': first_5min['Open'].iloc[0],
        'vix_high': first_5min['High'].max(),
        'vix_low': first_5min['Low'].min(),
        'vix_close': first_5min['Close'].iloc[-1],
        'vix_volatility': first_5min['Close'].std(),
        'vix_pct_change': (first_5min['Close'].iloc[-1] / first_5min['Close'].iloc[0] - 1) * 100,
        'vix_range': (first_5min['High'].max() - first_5min['Low'].min()) / first_5min['Open'].iloc[0] * 100,
        'vix_oc_ratio': (first_5min['Close'].iloc[-1] - first_5min['Open'].iloc[0]) / first_5min['Open'].iloc[0],
        'vix_acceleration': 0,
        'vol_ratio': 1,
        'day_of_week': vix_data.index[0].dayofweek,
        'vix_ma3': first_5min['Close'].mean(),
        'vix_ma5': first_5min['Close'].mean(),
        'vix_nifty_correlation': 0
    }
    
    # Add previous day features if available
    if prev_day_data is not None and len(prev_day_data) >= 25:
        features.update({
            'prev_day_close': prev_day_data['Close'].iloc[-1],
            'prev_day_vwap': (prev_day_data['Close'] * prev_day_data['Volume']).sum() / prev_day_data['Volume'].sum(),
            'prev_day_volatility': prev_day_data['Close'].std(),
            'prev_day_trend': (prev_day_data['Close'].iloc[-1] / prev_day_data['Close'].iloc[0] - 1) * 100,
            'prev_day_high': prev_day_data['High'].max(),
            'prev_day_low': prev_day_data['Low'].min(),
            'prev_day_range': (prev_day_data['High'].max() - prev_day_data['Low'].min()) / prev_day_data['Open'].iloc[0] * 100,
            'gap_pct': (first_5min['Open'].iloc[0] / prev_day_data['Close'].iloc[-1] - 1) * 100
        })
    else:
        features.update({
            'prev_day_close': first_5min['Open'].iloc[0],
            'prev_day_vwap': first_5min['Open'].iloc[0],
            'prev_day_volatility': 0,
            'prev_day_trend': 0,
            'prev_day_high': first_5min['Open'].iloc[0],
            'prev_day_low': first_5min['Open'].iloc[0],
            'prev_day_range': 0,
            'gap_pct': 0
        })
    
    # Convert to list in the same order as training features
    return [features[feature] for feature in features]

def daily_prediction(live_vix, prev_day_data=None):
    """
    Make predictions using live VIX data.
    
    Args:
        live_vix (pd.DataFrame): First 5 minutes of live VIX data
        prev_day_data (pd.DataFrame): Previous day's last 30 minutes of data (optional)
        
    Returns:
        tuple: (signal, probability, position_size)
    """
    features = create_features(live_vix, prev_day_data)
    proba = model.predict_proba([features])[0][1]
    position_size = calculate_position_size(proba)
    
    if proba > 0.65:
        signal = "STRONG BUY"
    elif proba > 0.55:
        signal = "BUY"
    elif proba < 0.35:
        signal = "STRONG SELL"
    else:
        signal = "NEUTRAL"
        
    return signal, proba, position_size


In [21]:
logger.info("\nAnalysis complete! The following files have been generated:")
logger.info("1. correlation_heatmap.png - Shows feature correlations with target")
logger.info("2. pair_plots.png - Visualizes relationships between key variables")
logger.info("3. feature_importance.png - Shows relative importance of each feature")
logger.info("4. vix_patterns.png - Displays discovered VIX patterns")

# Final warnings and recommendations
if len(merged_df) < 100:
    logger.warning("\nWARNING: Current dataset is too small for reliable predictions.")
    logger.warning("Recommended: Collect at least 6 months of historical data.")
if np.std(fold_accuracies) > 0.15:
    logger.warning("\nWARNING: High variance in model performance detected.")
    logger.warning("Recommended: Use more conservative position sizing.")

2025-02-23 15:49:51 - INFO - 
Analysis complete! The following files have been generated:
2025-02-23 15:49:51 - INFO - 1. correlation_heatmap.png - Shows feature correlations with target
2025-02-23 15:49:51 - INFO - 2. pair_plots.png - Visualizes relationships between key variables
2025-02-23 15:49:51 - INFO - 3. feature_importance.png - Shows relative importance of each feature
2025-02-23 15:49:51 - INFO - 4. vix_patterns.png - Displays discovered VIX patterns


NameError: name 'fold_accuracies' is not defined