## 1. Import Libraries

In [10]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

print("‚úì Libraries imported successfully")

‚úì Libraries imported successfully


## 2. Configuration

In [11]:
# Configuration
DATA_PATH = r'C:\Users\ananyas\OneDrive - Synopsys, Inc\Documents\iisc\subject\semester 1\data science in practice\project\dataset'
CLUSTER_STOCKS = ['ICICIPRULI', 'ICICIGI', 'HDFCAMC']
OPENING_MINUTES = 60

print("‚úì Configuration loaded")
print(f"  Stocks: {', '.join(CLUSTER_STOCKS)}")
print(f"  Opening Minutes: {OPENING_MINUTES}")

‚úì Configuration loaded
  Stocks: ICICIPRULI, ICICIGI, HDFCAMC
  Opening Minutes: 60


## 3. Load Raw Minute Data

In [12]:
# Load minute-by-minute data for each stock
minute_data = {}

for stock in CLUSTER_STOCKS:
    file_path = os.path.join(DATA_PATH, f"{stock}_minute.csv")
    
    if os.path.exists(file_path):
        try:
            df = pd.read_csv(file_path)
            
            # Standardize column names
            df.columns = df.columns.str.lower().str.strip()
            
            # Parse datetime from 'date' column (which contains both date and time)
            df['datetime'] = pd.to_datetime(df['date'])
            df['date_only'] = df['datetime'].dt.date
            
            # Sort by datetime
            df = df.sort_values('datetime').reset_index(drop=True)
            
            minute_data[stock] = df
            print(f"‚úì {stock}: {len(df)} minute records loaded")
        except Exception as e:
            print(f"‚ö†Ô∏è {stock}: Error loading file - {str(e)}")
    else:
        print(f"‚ö†Ô∏è {stock}: File not found at {file_path}")

print(f"\n‚úì Total stocks loaded: {len(minute_data)}")

‚úì ICICIPRULI: 816504 minute records loaded
‚úì ICICIGI: 724352 minute records loaded
‚úì HDFCAMC: 644432 minute records loaded

‚úì Total stocks loaded: 3


## 4. Technical Indicator Functions

In [13]:
def calculate_rsi(prices, period=14):
    """Calculate RSI (Relative Strength Index)"""
    if len(prices) < period + 1:
        return 50.0  # Default neutral value
    
    deltas = np.diff(prices)
    seed = deltas[:period+1]
    up = seed[seed >= 0].sum() / period
    down = -seed[seed < 0].sum() / period
    rs = up / down if down != 0 else 0
    rsi = np.zeros_like(prices)
    rsi[:period] = 100. - 100. / (1. + rs)

    for i in range(period, len(prices)):
        delta = deltas[i - 1]
        if delta > 0:
            upval = delta
            downval = 0.
        else:
            upval = 0.
            downval = -delta

        up = (up * (period - 1) + upval) / period
        down = (down * (period - 1) + downval) / period
        rs = up / down if down != 0 else 0
        rsi[i] = 100. - 100. / (1. + rs)

    return rsi[-1]

def calculate_technical_indicators(opening_data):
    """Calculate various technical indicators"""
    closes = opening_data['close'].values
    highs = opening_data['high'].values
    lows = opening_data['low'].values
    
    # RSI
    rsi = calculate_rsi(closes)
    
    # Bollinger Bands
    bb_period = min(20, len(closes))
    sma = closes[-bb_period:].mean()
    std = closes[-bb_period:].std()
    upper_band = sma + (2 * std)
    lower_band = sma - (2 * std)
    bb_position = (closes[-1] - lower_band) / (upper_band - lower_band) if upper_band != lower_band else 0.5
    
    # Price momentum (rate of change)
    if len(closes) >= 10:
        price_momentum = (closes[-1] - closes[-10]) / closes[-10] if closes[-10] != 0 else 0
    else:
        price_momentum = (closes[-1] - closes[0]) / closes[0] if closes[0] != 0 else 0
    
    # Price acceleration (momentum of momentum)
    if len(closes) >= 20:
        mid_momentum = (closes[-10] - closes[-20]) / closes[-20] if closes[-20] != 0 else 0
        price_acceleration = price_momentum - mid_momentum
    else:
        price_acceleration = 0
    
    return rsi, bb_position, price_momentum, price_acceleration

print("‚úì Technical indicator functions defined")

‚úì Technical indicator functions defined


## 5. Feature Engineering Function

In [14]:
def create_enhanced_features(minute_df, opening_minutes=60):
    """
    Create enhanced features from opening period minute data.
    
    Args:
        minute_df: DataFrame with minute-level stock data
        opening_minutes: Number of opening minutes to use (default 60)
    
    Returns:
        DataFrame with features for each trading day
    """
    features = []
    
    # Group by trading day
    for date, day_data in minute_df.groupby('date_only'):
        # Get opening period data
        opening_data = day_data.head(opening_minutes)
        
        # Skip if insufficient data
        if len(opening_data) < opening_minutes * 0.5:  # At least 50% of expected minutes
            continue
        
        # Basic opening period statistics
        open_price = opening_data['open'].iloc[0]
        open_high = opening_data['high'].max()
        open_low = opening_data['low'].min()
        open_close = opening_data['close'].iloc[-1]
        open_range = open_high - open_low
        open_range_pct = (open_range / open_price * 100) if open_price != 0 else 0
        open_change_pct = ((open_close - open_price) / open_price * 100) if open_price != 0 else 0
        
        # Volatility metrics
        returns = opening_data['close'].pct_change().dropna()
        open_volatility = returns.std() if len(returns) > 0 else 0
        open_returns_std = returns.std() if len(returns) > 0 else 0
        
        # Time-segmented returns (3 equal periods)
        period_size = len(opening_data) // 3
        if period_size > 0:
            first_period = opening_data.iloc[:period_size]
            second_period = opening_data.iloc[period_size:2*period_size]
            third_period = opening_data.iloc[2*period_size:]
            
            first_20min_return = ((first_period['close'].iloc[-1] - first_period['open'].iloc[0]) / 
                                 first_period['open'].iloc[0] * 100) if first_period['open'].iloc[0] != 0 else 0
            second_20min_return = ((second_period['close'].iloc[-1] - second_period['open'].iloc[0]) / 
                                  second_period['open'].iloc[0] * 100) if len(second_period) > 0 and second_period['open'].iloc[0] != 0 else 0
            third_20min_return = ((third_period['close'].iloc[-1] - third_period['open'].iloc[0]) / 
                                 third_period['open'].iloc[0] * 100) if len(third_period) > 0 and third_period['open'].iloc[0] != 0 else 0
        else:
            first_20min_return = second_20min_return = third_20min_return = 0
        
        # Volume features
        total_volume_opening = opening_data['volume'].sum()
        avg_volume_per_min = opening_data['volume'].mean()
        volume_std = opening_data['volume'].std()
        volume_trend = opening_data['volume'].iloc[-1] - opening_data['volume'].iloc[0]
        volume_surge = (opening_data['volume'] > avg_volume_per_min * 1.5).sum() / len(opening_data)
        volume_consistency = 1 / (volume_std / avg_volume_per_min) if avg_volume_per_min > 0 and volume_std > 0 else 0
        
        # Position features
        high_in_first_half = 1 if opening_data['high'].idxmax() <= len(opening_data) // 2 else 0
        low_in_first_half = 1 if opening_data['low'].idxmin() <= len(opening_data) // 2 else 0
        price_above_open = (opening_data['close'] > open_price).sum() / len(opening_data)
        
        # Technical indicators
        opening_rsi, bb_position, price_momentum, price_acceleration = calculate_technical_indicators(opening_data)
        
        # Additional features
        hl_ratio = (open_high - open_low) / open_low if open_low != 0 else 0
        upper_shadow_avg = ((opening_data['high'] - opening_data[['open', 'close']].max(axis=1)) / 
                           opening_data['close']).mean()
        lower_shadow_avg = ((opening_data[['open', 'close']].min(axis=1) - opening_data['low']) / 
                           opening_data['close']).mean()
        trend_strength = abs(opening_data['close'].iloc[-1] - opening_data['close'].iloc[0]) / open_range if open_range != 0 else 0
        
        # Target variables (day's actual high and low)
        day_high = day_data['high'].max()
        day_low = day_data['low'].min()
        
        # Store features
        features.append({
            'date': date,
            'open_price': open_price,
            'open_high': open_high,
            'open_low': open_low,
            'open_close': open_close,
            'open_range': open_range,
            'open_range_pct': open_range_pct,
            'open_change_pct': open_change_pct,
            'open_volatility': open_volatility,
            'open_returns_std': open_returns_std,
            'first_20min_return': first_20min_return,
            'second_20min_return': second_20min_return,
            'third_20min_return': third_20min_return,
            'avg_volume_per_min': avg_volume_per_min,
            'volume_trend': volume_trend,
            'high_in_first_half': high_in_first_half,
            'low_in_first_half': low_in_first_half,
            'price_above_open': price_above_open,
            'total_volume_opening': total_volume_opening,
            'price_momentum': price_momentum,
            'price_acceleration': price_acceleration,
            'opening_rsi': opening_rsi,
            'bb_position': bb_position,
            'volume_surge': volume_surge,
            'volume_consistency': volume_consistency,
            'hl_ratio': hl_ratio,
            'upper_shadow_avg': upper_shadow_avg,
            'lower_shadow_avg': lower_shadow_avg,
            'trend_strength': trend_strength,
            'day_high': day_high,
            'day_low': day_low
        })
    
    return pd.DataFrame(features)

print("‚úì Feature engineering function defined")

‚úì Feature engineering function defined


## 6. Calculate Features for All Stocks

In [15]:
print("="*80)
print("CALCULATING FEATURES")
print("="*80)

feature_data = {}

for stock in CLUSTER_STOCKS:
    print(f"\nProcessing {stock}...")
    feature_data[stock] = create_enhanced_features(minute_data[stock], OPENING_MINUTES)
    print(f"  ‚úì {stock}: {len(feature_data[stock])} trading days processed")
    print(f"  ‚úì Features: {len(feature_data[stock].columns) - 1} (excluding date)")

print("\n" + "="*80)
print("‚úÖ FEATURE ENGINEERING COMPLETE!")
print("="*80)

CALCULATING FEATURES

Processing ICICIPRULI...
  ‚úì ICICIPRULI: 2188 trading days processed
  ‚úì Features: 30 (excluding date)

Processing ICICIGI...
  ‚úì ICICIGI: 1941 trading days processed
  ‚úì Features: 30 (excluding date)

Processing HDFCAMC...
  ‚úì HDFCAMC: 1727 trading days processed
  ‚úì Features: 30 (excluding date)

‚úÖ FEATURE ENGINEERING COMPLETE!


## 7. Display Sample Features

In [16]:
# Display sample features for first stock
sample_stock = CLUSTER_STOCKS[0]
print(f"Sample features for {sample_stock}:")
print("="*80)
print(feature_data[sample_stock].head())

print("\nFeature Statistics:")
print(feature_data[sample_stock].describe())

Sample features for ICICIPRULI:
         date  open_price  open_high  open_low  open_close  open_range  \
0  2016-09-29      330.00     333.60    315.50      321.55       18.10   
1  2016-09-30      297.40     304.60    293.30      304.60       11.30   
2  2016-10-03      312.90     313.65    307.45      309.35        6.20   
3  2016-10-04      308.95     309.50    304.70      305.45        4.80   
4  2016-10-05      305.45     306.25    304.10      304.95        2.15   

   open_range_pct  open_change_pct  open_volatility  open_returns_std  ...  \
0        5.484848        -2.560606         0.003619          0.003619  ...   
1        3.799597         2.420982         0.002392          0.002392  ...   
2        1.981464        -1.134548         0.001492          0.001492  ...   
3        1.553649        -1.132869         0.001223          0.001223  ...   
4        0.703880        -0.163693         0.000698          0.000698  ...   

   opening_rsi  bb_position  volume_surge  volume_cons

## 8. Save Features to CSV

In [17]:
print("="*80)
print("SAVING FEATURES TO CSV FILES")
print("="*80)

for stock in CLUSTER_STOCKS:
    output_file = os.path.join(DATA_PATH, f"{stock}_opening_{OPENING_MINUTES}min_features.csv")
    feature_data[stock].to_csv(output_file, index=False)
    print(f"‚úì {stock}: Saved {len(feature_data[stock])} records to {output_file}")

print("\n" + "="*80)
print("‚úÖ ALL FEATURE FILES SAVED SUCCESSFULLY!")
print("="*80)

SAVING FEATURES TO CSV FILES
‚úì ICICIPRULI: Saved 2188 records to C:\Users\ananyas\OneDrive - Synopsys, Inc\Documents\iisc\subject\semester 1\data science in practice\project\dataset\ICICIPRULI_opening_60min_features.csv
‚úì ICICIGI: Saved 1941 records to C:\Users\ananyas\OneDrive - Synopsys, Inc\Documents\iisc\subject\semester 1\data science in practice\project\dataset\ICICIGI_opening_60min_features.csv
‚úì HDFCAMC: Saved 1727 records to C:\Users\ananyas\OneDrive - Synopsys, Inc\Documents\iisc\subject\semester 1\data science in practice\project\dataset\HDFCAMC_opening_60min_features.csv

‚úÖ ALL FEATURE FILES SAVED SUCCESSFULLY!


## 9. Feature Summary

In [18]:
print("="*80)
print("FEATURE ENGINEERING SUMMARY")
print("="*80)

print("\nüìä Features Calculated (28 total):")
print("\n1. Opening Period Statistics (8):")
print("   - open_price, open_high, open_low, open_close")
print("   - open_range, open_range_pct, open_change_pct, open_volatility")

print("\n2. Time-Segmented Returns (4):")
print("   - first_20min_return, second_20min_return, third_20min_return")
print("   - open_returns_std")

print("\n3. Volume Features (6):")
print("   - avg_volume_per_min, volume_trend, total_volume_opening")
print("   - volume_surge, volume_consistency")

print("\n4. Position Features (3):")
print("   - high_in_first_half, low_in_first_half, price_above_open")

print("\n5. Technical Indicators (4):")
print("   - opening_rsi, bb_position, price_momentum, price_acceleration")

print("\n6. Additional Features (5):")
print("   - hl_ratio, upper_shadow_avg, lower_shadow_avg, trend_strength")

print("\n7. Target Variables (2):")
print("   - day_high, day_low")

print("\nüìà Stock Coverage:")
for stock in CLUSTER_STOCKS:
    print(f"   {stock}: {len(feature_data[stock])} trading days")

print("\n" + "="*80)
print("‚úÖ Feature engineering pipeline complete!")
print("   These CSV files can now be used for model training.")
print("="*80)

FEATURE ENGINEERING SUMMARY

üìä Features Calculated (28 total):

1. Opening Period Statistics (8):
   - open_price, open_high, open_low, open_close
   - open_range, open_range_pct, open_change_pct, open_volatility

2. Time-Segmented Returns (4):
   - first_20min_return, second_20min_return, third_20min_return
   - open_returns_std

3. Volume Features (6):
   - avg_volume_per_min, volume_trend, total_volume_opening
   - volume_surge, volume_consistency

4. Position Features (3):
   - high_in_first_half, low_in_first_half, price_above_open

5. Technical Indicators (4):
   - opening_rsi, bb_position, price_momentum, price_acceleration

6. Additional Features (5):
   - hl_ratio, upper_shadow_avg, lower_shadow_avg, trend_strength

7. Target Variables (2):
   - day_high, day_low

üìà Stock Coverage:
   ICICIPRULI: 2188 trading days
   ICICIGI: 1941 trading days
   HDFCAMC: 1727 trading days

‚úÖ Feature engineering pipeline complete!
   These CSV files can now be used for model training.
