# Reading Dataset

In [102]:
import pandas as pd

In [103]:
# Read the CSV file into a DataFrame
df = []
df = pd.read_csv('new_data_set_all_indicator.csv')
# Optional: display the first few rows

In [104]:
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Close,High,Low,Open,Volume,Symbol,Prev_Open,Prev_Close,...,MACDs_12_26_9,rsi,BBL_3_2.0,BBM_3_2.0,BBU_3_2.0,BBB_3_2.0,BBP_3_2.0,atr,obv,vwap
0,0,1980-12-12,0.098834,0.099264,0.098834,0.098834,469033600,AAPL,,,...,,,,,,,,,469033600.0,
1,1,1980-12-15,0.093678,0.094108,0.093678,0.094108,175884800,AAPL,0.098834,0.098834,...,,,,,,,,,293148800.0,
2,2,1980-12-16,0.086802,0.087232,0.086802,0.087232,105728000,AAPL,0.094108,0.093678,...,,,0.083248,0.093105,0.102962,21.1749,0.180314,,187420800.0,0.096075
3,3,1980-12-17,0.088951,0.089381,0.088951,0.088951,86441600,AAPL,0.087232,0.086802,...,,23.808075,0.084066,0.089811,0.095555,12.791741,0.425172,0.004478,273862400.0,0.090736
4,4,1980-12-18,0.09153,0.091959,0.09153,0.09153,73449600,AAPL,0.088951,0.088951,...,,46.669571,0.085229,0.089094,0.09296,8.676562,0.815045,0.003868,347312000.0,0.088952


In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1964328 entries, 0 to 1964327
Data columns (total 31 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   Date               object 
 2   Close              float64
 3   High               float64
 4   Low                float64
 5   Open               float64
 6   Volume             int64  
 7   Symbol             object 
 8   Prev_Open          float64
 9   Prev_Close         float64
 10  Doji               bool   
 11  Hammer             bool   
 12  Shooting_Star      bool   
 13  Bullish_Engulfing  bool   
 14  Bearish_Engulfing  bool   
 15  Sector             object 
 16  ema_12             float64
 17  ema_26             float64
 18  ema_short          float64
 19  MACD_12_26_9       float64
 20  MACDh_12_26_9      float64
 21  MACDs_12_26_9      float64
 22  rsi                float64
 23  BBL_3_2.0          float64
 24  BBM_3_2.0          float64
 25  BBU_3_2.0         

# Scaling Data

In [106]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import numpy as np

In [107]:
def normalize_features(df, numerical_cols):
    """Normalize numerical columns to [0, 1] range."""
    scaler = MinMaxScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Example usage:
# numerical_cols = ['Close', 'rsi', 'MACD_12_26_9', ...]
# df = normalize_features(df, numerical_cols)
def encode_categorical(df, categorical_cols):
    """One-hot encode categorical columns (e.g., Sector)."""
    encoder = OneHotEncoder(sparse=False)  # Use `sparse=False` for older scikit-learn
    encoded = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(categorical_cols))
    return pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

# Example usage:
# categorical_cols = ['Sector', 'Doji', 'Hammer', ...]
# df = encode_categorical(df, categorical_cols)

def create_target(df, window=3, threshold=0.01):
    """Create binary target: 1 if price rises > threshold in next `window` days."""
    df['future_close'] = df.groupby('Symbol')['Close'].shift(-window)
    df['target'] = (df['future_close'] > df['Close'] * (1 + threshold)).astype(int)
    return df.dropna(subset=['target'])

# Example usage:
# df = create_target(df, window=3, threshold=0.01)

# Image Conversion Logic

In [108]:
def create_ohlc_channel(df_group, window=3):
    """Convert OHLC data to a 4xW matrix (channel 1)."""
    ohlc = df_group[['Open', 'High', 'Low', 'Close']].values.T  # Shape: (4, W)
    return ohlc

def create_indicator_channel(df_group, window=3, indicators=['rsi', 'MACD_12_26_9']):
    """Convert indicators to a NxW matrix (channel 2)."""
    indicator_data = df_group[indicators].values.T  # Shape: (N_indicators, W)
    return indicator_data

def tabular_to_image(df, window=3):
    """Convert each sample to a multi-channel image."""
    images, tabular_data = [], []
    symbols = df['Symbol'].unique()
    symbolCount = 0
    for symbol in symbols:
        symbolCount = symbolCount + 1
        print("PROCESSING: {} - {} of {} symbols...".format(symbol, symbolCount, len(symbols)))
        symbol_df = df[df['Symbol'] == symbol].reset_index(drop=True)
        print("Records in this Symbol: {}...".format(len(symbol_df) - window + 1))     
        for i in range(len(symbol_df) - window + 1):
            # Channel 1: OHLC
            ohlc = create_ohlc_channel(symbol_df.iloc[i:i+window])
            # Channel 2: Indicators
            indicators = create_indicator_channel(symbol_df.iloc[i:i+window])
            # Combine channels
            img = np.vstack([ohlc, indicators])  # Shape: (4 + N_indicators, W)
            images.append(img[..., np.newaxis])  # Add channel dim
            # Tabular features (e.g., Volume, Sector)
            tabular_data.append(symbol_df.iloc[i][['Volume', 'obv']].values)
    
    return np.array(images), np.array(tabular_data)  # Shapes: (B, H, W, 1), (B, N_tabular)

# Example usage:
# images, tabular_data = tabular_to_image(df, window=3)


# Model Setup

In [109]:
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model

# RUNNER - Driver Code

In [110]:
# Pipeline
numerical_cols = [
    'Close', 'High', 'Low', 'Open', 'Volume', 'Prev_Open', 'Prev_Close',
    'ema_12', 'ema_26', 'ema_short', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9',
    'rsi', 'BBL_3_2.0', 'BBM_3_2.0', 'BBU_3_2.0', 'BBB_3_2.0', 'BBP_3_2.0',
    'atr', 'obv', 'vwap'
]

categorical_cols = [
    'Sector',  # Object type
    'Doji', 'Hammer', 'Shooting_Star', 'Bullish_Engulfing', 'Bearish_Engulfing'  # Boolean flags
]

df = normalize_features(df, numerical_cols)
df = encode_categorical(df, categorical_cols)
df = create_target(df, window=3)

In [111]:
images, tabular_data = tabular_to_image(df, window=3)
targets = df['target'].values[2:]  # Align with window

PROCESSING: AAPL - 1 of 220 symbols...
Records in this Symbol: 11100...
PROCESSING: ABBV - 2 of 220 symbols...
Records in this Symbol: 3015...
PROCESSING: ABT - 3 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: ACN - 4 of 220 symbols...
Records in this Symbol: 5895...
PROCESSING: ADBE - 5 of 220 symbols...
Records in this Symbol: 9668...
PROCESSING: ADI - 6 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: ADM - 7 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: ADP - 8 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: AEP - 9 of 220 symbols...
Records in this Symbol: 15853...
PROCESSING: AIG - 10 of 220 symbols...
Records in this Symbol: 13108...
PROCESSING: ALGN - 11 of 220 symbols...
Records in this Symbol: 6013...
PROCESSING: AMAT - 12 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: AMD - 13 of 220 symbols...
Records in this Symbol: 11288...
PROCESSING: AMGN - 14 of 220 symbols...
Records in this Symbol: 10465...

PROCESSING: LUV - 116 of 220 symbols...
Records in this Symbol: 11340...
PROCESSING: LYV - 117 of 220 symbols...
Records in this Symbol: 4783...
PROCESSING: MA - 118 of 220 symbols...
Records in this Symbol: 4677...
PROCESSING: MCD - 119 of 220 symbols...
Records in this Symbol: 14718...
PROCESSING: MCK - 120 of 220 symbols...
Records in this Symbol: 7582...
PROCESSING: MCO - 121 of 220 symbols...
Records in this Symbol: 7590...
PROCESSING: MDLZ - 122 of 220 symbols...
Records in this Symbol: 5920...
PROCESSING: MDT - 123 of 220 symbols...
Records in this Symbol: 13025...
PROCESSING: MELI - 124 of 220 symbols...
Records in this Symbol: 4373...
PROCESSING: META - 125 of 220 symbols...
Records in this Symbol: 3170...
PROCESSING: MET - 126 of 220 symbols...
Records in this Symbol: 6219...
PROCESSING: MGM - 127 of 220 symbols...
Records in this Symbol: 9234...
PROCESSING: MMC - 128 of 220 symbols...
Records in this Symbol: 13074...
PROCESSING: MMM - 129 of 220 symbols...
Records in this Sy

In [112]:
def evaluate_model(model, X_test_img, X_test_tab, y_test, nan_strategy='mean'):
    import numpy as np
    from sklearn.metrics import classification_report

    # Convert all to np arrays
    X_test_img = np.asarray(X_test_img, dtype=np.float32)
    X_test_tab = np.asarray(X_test_tab, dtype=np.float32)
    y_test = np.asarray(y_test)

    # Ensure all inputs have same length before continuing
    min_len = min(X_test_img.shape[0], X_test_tab.shape[0], y_test.shape[0])
    X_test_img = X_test_img[:min_len]
    X_test_tab = X_test_tab[:min_len]
    y_test = y_test[:min_len]

    # Handle NaNs
    combined_mask = ~(
        np.isnan(X_test_img).any(axis=(1, 2, 3)) |
        np.isnan(X_test_tab).any(axis=1)
    )

    dropped_rows = np.sum(~combined_mask)
    if dropped_rows > 0:
        print(f"Warning: {dropped_rows} rows dropped due to NaNs")
        X_test_img = X_test_img[combined_mask]
        X_test_tab = X_test_tab[combined_mask]
        y_test = y_test[combined_mask]

    # Final shape check
    assert X_test_img.shape[0] == X_test_tab.shape[0] == y_test.shape[0], \
        f"Shape mismatch: img={X_test_img.shape}, tab={X_test_tab.shape}, y={y_test.shape}"

    # Predict and evaluate
    y_pred = (model.predict([X_test_img, X_test_tab], verbose=0) > 0.5).astype(int)
    return classification_report(y_test, y_pred)


def build_cnn(input_shape=(6, 3, 1)):
    """CNN for processing financial 'images'."""
    inputs = Input(shape=input_shape)
    x = Conv2D(32, (2, 2), activation='relu')(inputs)
    x = MaxPooling2D((1, 1))(x)
    x = Flatten()(x)
    return Model(inputs, x, name='cnn_branch')

def build_tabular_net(input_dim):
    """Dense network for tabular features."""
    inputs = Input(shape=(input_dim,))
    x = Dense(64, activation='relu')(inputs)
    return Model(inputs, x, name='tabular_branch')

def build_hybrid_model(cnn_input_shape, tabular_input_dim):
    """Combine CNN and tabular branches."""
    # Branches
    cnn_model = build_cnn(cnn_input_shape)
    tabular_model = build_tabular_net(tabular_input_dim)
    
    # Combined
    combined = Concatenate()([cnn_model.output, tabular_model.output])
    z = Dense(32, activation='relu')(combined)
    z = Dropout(0.5)(z)
    outputs = Dense(1, activation='sigmoid')(z)
    
    # Full model
    return Model(inputs=[cnn_model.input, tabular_model.input], outputs=outputs)

# Example usage:
# model = build_hybrid_model(cnn_input_shape=(7, 3, 1), tabular_input_dim=2)

import numpy as np
from tensorflow.keras.utils import to_categorical

def handle_nans(array, strategy='mean'):
    """Handle NaN values in numpy arrays with specified strategy."""
    if not np.isnan(array).any():
        return array
        
    print(f"Warning: NaN values detected ({np.isnan(array).sum()} elements)")
    
    if strategy == 'mean':
        fill_value = np.nanmean(array)
    elif strategy == 'median':
        fill_value = np.nanmedian(array)
    elif strategy == 'zero':
        fill_value = 0
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
        
    return np.nan_to_num(array, nan=fill_value)

def train_model(model, X_train_img, X_train_tab, y_train, epochs=10, batch_size=32, 
               nan_strategy='mean', verbose=1):
    """Robust training function with NaN handling and validation.
    
    Args:
        model: Compiled Keras model
        X_train_img: Image data (n_samples, height, width, channels)
        X_train_tab: Tabular data (n_samples, n_features)
        y_train: Target values
        epochs: Training epochs
        batch_size: Batch size
        nan_strategy: How to handle NaNs ('mean', 'median', 'zero', or 'error')
        verbose: Verbosity level
        
    Returns:
        Training history
    """
    # Convert inputs to numpy arrays
    X_train_img = np.asarray(X_train_img, dtype='float32')
    X_train_tab = np.asarray(X_train_tab, dtype='float32')
    y_train = np.asarray(y_train)
    
    # Handle NaN values
    if nan_strategy == 'error':
        for name, arr in [('Image data', X_train_img), 
                         ('Tabular data', X_train_tab),
                         ('Target values', y_train)]:
            if np.isnan(arr).any():
                raise ValueError(f"{name} contains {np.isnan(arr).sum()} NaN values")
    else:
        X_train_img = handle_nans(X_train_img, nan_strategy)
        X_train_tab = handle_nans(X_train_tab, nan_strategy)
        y_train = handle_nans(y_train, nan_strategy)
    
    # Validate shapes
    if X_train_img.ndim != 4:
        raise ValueError(f"Image data must be 4D (got {X_train_img.shape})")
    if X_train_tab.ndim != 2:
        raise ValueError(f"Tabular data must be 2D (got {X_train_tab.shape})")
    
    # Prepare targets
    if y_train.ndim == 1 or y_train.shape[1] == 1:
        if len(np.unique(y_train)) > 2:
            y_train = to_categorical(y_train)
        else:
            y_train = y_train.astype('float32')
    
    # Compile and train
    model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])
    
    history = model.fit(
        [X_train_img, X_train_tab],
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=0.1,
        verbose=verbose
    )
    return history

# Saving Data after Image Generation for Future Use

In [114]:
import os
# --- Save data to thesis_data directory ---
os.makedirs("thesis_data", exist_ok=True)
np.save("thesis_data/images.npy", images)
np.save("thesis_data/tabular_data.npy", tabular_data)
np.save("thesis_data/targets.npy", targets)

#np.save("thesis_data/images.npy", images.astype(np.float64))  # Save as float64 to ensure precision
#np.save("thesis_data/tabular_data.npy", tabular_data.astype(np.float64))
#np.save("thesis_data/targets.npy", targets.astype(np.float64))


In [115]:
images_loaded = np.load("thesis_data/images.npy", allow_pickle=True)
tabular_data_loaded = np.load("thesis_data/tabular_data.npy", allow_pickle=True)
targets_loaded = np.load("thesis_data/targets.npy", allow_pickle=True)


# Compare to ensure data integrity
print("Images match:", np.array_equal(images, images_loaded))
print("Tabular data match:", np.array_equal(tabular_data, tabular_data_loaded))
print("Targets match:", np.array_equal(targets, targets_loaded))
print("Images approximately match:", np.allclose(images, images_loaded))

Images match: False
Tabular data match: True
Targets match: True
Images approximately match: False


# Testing And Validation

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def train_test_split_ts(images, tabular_data, targets, test_size=0.2):
    """Time-series aware split (no shuffling)."""
    split_idx = int(len(images) * (1 - test_size))
    X_train_img, X_test_img = images[:split_idx], images[split_idx:]
    X_train_tab, X_test_tab = tabular_data[:split_idx], tabular_data[split_idx:]
    y_train, y_test = targets[:split_idx], targets[split_idx:]
    return (X_train_img, X_train_tab, y_train), (X_test_img, X_test_tab, y_test)

In [117]:
(X_train_img, X_train_tab, y_train), (X_test_img, X_test_tab, y_test) = \
    train_test_split_ts(images, tabular_data, targets)


In [118]:
model = build_hybrid_model(cnn_input_shape=(6, 3, 1), tabular_input_dim=2)


In [119]:
history = train_model(model, X_train_img, X_train_tab, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [120]:
print(evaluate_model(model, X_test_img, X_test_tab, y_test))



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.64      1.00      0.78    250509
           1       0.00      0.00      0.00    141244

    accuracy                           0.64    391753
   macro avg       0.32      0.50      0.39    391753
weighted avg       0.41      0.64      0.50    391753



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
