In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from xgboost import XGBClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import warnings
import copy
import os
import joblib


# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Label column, and columns that are kept out of the feature lists.
TARGET_COL = 'target'
COLS_TO_DROP = ['date']

# Sequence length fed to the GRU and the training hyper-parameters.
LOOKBACK_DAYS, BATCH_SIZE = 5, 8
EPOCHS, PATIENCE = 50, 5
LEARNING_RATE = 1e-3

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


def load_data():
    """Read the three pre-split CSV files and separate features from the target.

    Returns:
        A 6-tuple ``(X_train, y_train, X_test, y_test, X_val, y_val)``.
        If any of the CSV files is missing, an error message is printed and
        all six entries are ``None``.
    """
    print("Loading data...")
    try:
        # Files are read in the order train, test, validation.
        frames = [
            pd.read_csv(csv_name)
            for csv_name in ('train_data.csv', 'test_data.csv', 'validation_data.csv')
        ]
    except FileNotFoundError as e:
        print(f"Error: {e}. Did you run 'process_market_data.py' first?")
        return (None,) * 6

    # For each split emit the feature frame first, then the target column.
    pieces = []
    for frame in frames:
        pieces.append(frame.drop(columns=[TARGET_COL]))
        pieces.append(frame[TARGET_COL])
    return tuple(pieces)

def define_feature_cols(df: pd.DataFrame) -> (list, list):
    """
    Split the columns of ``df`` into numerical and categorical feature lists.

    The categorical list is fixed (``day_of_week`` and ``fiscal_week``); every
    other column that is not listed in ``COLS_TO_DROP`` is reported as
    numerical, in the order it appears in ``df``.

    Returns:
        ``(numerical_cols, categorical_cols)``
    """
    categorical_cols = ['day_of_week', 'fiscal_week']

    # Anything dropped or categorical is excluded from the numerical list.
    not_numerical = [*COLS_TO_DROP, *categorical_cols]
    numerical_cols = [name for name in df.columns if name not in not_numerical]

    for kind, cols in (("numerical", numerical_cols), ("categorical", categorical_cols)):
        print(f"Identified {len(cols)} {kind} features.")

    return numerical_cols, categorical_cols

def build_preprocessing_pipeline(numerical_cols: list, categorical_cols: list) -> ColumnTransformer:
    """
    Build the (unfitted) feature preprocessor.

    - ``numerical_cols`` go through a ``StandardScaler``.
    - ``categorical_cols`` go through a dense ``OneHotEncoder`` that ignores
      categories it did not see during fitting.
    - Any column not named in either list is dropped.
    """
    scale_step = ('scaler', StandardScaler())
    encode_step = ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))

    branches = [
        ('num', Pipeline(steps=[scale_step]), numerical_cols),
        ('cat', Pipeline(steps=[encode_step]), categorical_cols),
    ]
    return ColumnTransformer(transformers=branches, remainder='drop')

def evaluate_model(name: str, y_true: pd.Series, y_pred: np.ndarray, y_prob: np.ndarray = None):
    """Print accuracy, optional ROC-AUC and a classification report.

    Args:
        name: Label used in the report header.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional positive-class scores; ROC-AUC is printed only when
            this is given.

    Returns:
        None. If either ``y_true`` or ``y_pred`` is empty, only a notice is
        printed.
    """
    print(f"\n--- Evaluation Report: {name} ---")

    nothing_to_score = len(y_true) == 0 or len(y_pred) == 0
    if nothing_to_score:
        print("Not enough data to evaluate (empty y_true or y_pred).")
        return

    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    if y_prob is not None:
        # roc_auc_score raises ValueError in some cases; report it and carry on.
        try:
            print(f"ROC-AUC:  {roc_auc_score(y_true, y_prob):.4f}")
        except ValueError as e:
            print(f"Could not calculate ROC-AUC: {e}")

    report = classification_report(y_true, y_pred, zero_division=0)
    print("\nClassification Report:")
    print(report)
    print("-" * (30 + len(name)))

def create_sequences(X: np.ndarray, y: np.ndarray, lookback: int) -> (torch.Tensor, torch.Tensor):
    """
    Build sliding windows over the rows of ``X`` for a recurrent model.

    For every row index ``i`` from ``lookback`` up to ``len(X) - 1`` the
    window ``X[i - lookback:i]`` (the ``lookback`` rows before ``i``) is paired
    with the target ``y[i]``.

    Returns:
        ``(X_seq, y_seq)`` as float32 tensors of shape
        ``(len(X) - lookback, lookback, n_features)`` and
        ``(len(X) - lookback,)``.  When ``len(X) <= lookback`` both tensors
        are empty.
    """
    n_rows = len(X)
    if n_rows <= lookback:
        return torch.empty(0, lookback, X.shape[1]), torch.empty(0)

    window_ends = range(lookback, n_rows)
    windows = [X[end - lookback:end, :] for end in window_ends]
    labels = [y[end] for end in window_ends]

    features = torch.from_numpy(np.array(windows, dtype=np.float32))
    targets = torch.from_numpy(np.array(labels, dtype=np.float32))
    return features, targets

class GRUModel(nn.Module):
    """GRU encoder followed by ReLU, dropout and a one-unit linear head.

    ``forward`` returns raw logits (no sigmoid is applied here).
    """

    def __init__(self, input_features, gru_units=32, dropout_rate=0.3):
        """
        Args:
            input_features: Number of features per time step.
            gru_units: Size of the GRU hidden state.
            dropout_rate: Dropout probability applied before the linear head.
        """
        super().__init__()
        self.gru = nn.GRU(input_size=input_features, hidden_size=gru_units, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.dense = nn.Linear(gru_units, 1)

    def forward(self, x):
        """Map a batch-first sequence batch to one logit per sample."""
        # Only the final hidden state is used; the per-step outputs are discarded.
        _, final_hidden = self.gru(x)

        # Drop the leading axis of the hidden state before the head.
        features = final_hidden.squeeze(0)

        return self.dense(self.dropout(self.relu(features)))

def train_pytorch_model(model, train_loader, val_loader, loss_fn, optimizer, epochs, patience, device):
    """Train ``model`` in place with early stopping on the validation loss.

    Args:
        model: Module producing one logit per sample.
        train_loader: Iterable of ``(X_batch, y_batch)`` used for optimisation.
        val_loader: Iterable of ``(X_batch, y_batch)`` used only to monitor
            the loss for early stopping.
        loss_fn: Loss taking ``(logits, targets)`` of identical 1-D shape.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.
        device: Device the batches are moved to.

    Returns:
        None. On exit ``model`` holds the weights of the epoch with the lowest
        average validation loss.

    Raises:
        ValueError: If either loader yields no batches (the per-batch
            averages would otherwise divide by zero).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch.")

    best_val_loss = float('inf')
    patience_counter = 0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()

            # reshape(-1) rather than squeeze(): squeeze() turns a final batch
            # of size 1 into a 0-d tensor, which no longer matches the 1-D
            # target and makes the loss raise.
            y_pred_logits = model(X_batch).reshape(-1)

            loss = loss_fn(y_pred_logits, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch_val, y_batch_val in val_loader:
                X_batch_val, y_batch_val = X_batch_val.to(device), y_batch_val.to(device)

                y_val_logits = model(X_batch_val).reshape(-1)
                val_loss += loss_fn(y_val_logits, y_batch_val).item()

        # Averages are per batch, not per sample.
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Deep copy: state_dict() returns references to the live tensors.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train Loss: {avg_train_loss:.4f}.. "
                  f"Val Loss: {avg_val_loss:.4f}")

        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch+1} with best val loss: {best_val_loss:.4f}")
            break

    # Restore the best weights seen (None only if no epoch improved on inf,
    # e.g. every validation loss was NaN, or epochs == 0).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    print("GRU training complete.")

def predict_pytorch_model(model, loader, device):
    """Run ``model`` over ``loader`` and return hard predictions and probabilities.

    Args:
        model: Module producing one logit per sample.
        loader: Iterable of batches; the inputs are taken from element 0 of
            each batch.
        device: Device the inputs are moved to.

    Returns:
        ``(preds, probs)`` as 1-D NumPy arrays: ``probs`` are the sigmoid of
        the logits and ``preds`` is ``1`` where ``probs > 0.5`` else ``0``.
        Both are empty arrays when ``loader`` yields nothing.
    """
    model.eval()
    all_probs = []
    with torch.no_grad():
        for X_batch in loader:
            X_batch = X_batch[0].to(device)

            y_logits = model(X_batch)
            # reshape(-1) keeps a batch of one as a length-1 vector; squeeze()
            # would collapse it to 0-d and break np.concatenate below.
            y_probs = torch.sigmoid(y_logits).reshape(-1)

            all_probs.append(y_probs.cpu().numpy())

    if not all_probs:
        return np.array([]), np.array([])

    all_probs = np.concatenate(all_probs)
    all_preds = (all_probs > 0.5).astype(int)

    return all_preds, all_probs

def main():
    """Train and evaluate the Logistic Regression, XGBoost and GRU models.

    Steps:
        1. Load the train / test / validation splits.
        2. Fit and evaluate Logistic Regression and XGBoost pipelines on the
           test split.
        3. Build look-back sequences and train the GRU (skipped when the train
           or test split is too short to form a sequence).
        4. Evaluate XGBoost and, when available, the GRU on the validation
           split, which is treated here as the final hold-out.
        5. Write the processed feature count to
           ``final_model_checkpoints/n_features.txt``.

    Returns:
        ``(logit_pipeline, xgb_pipeline, model_gru, preprocessor)`` where
        ``model_gru`` is ``None`` if the GRU was skipped.  Returns ``None``
        when the data files could not be loaded.
    """
    data = load_data()
    if data[0] is None:
        return

    X_train, y_train, X_test, y_test, X_val, y_val = data

    # Bound up front so later stages can test it directly instead of probing locals().
    model_gru = None

    numerical_cols, categorical_cols = define_feature_cols(X_train)
    # NOTE: this single preprocessor instance is shared by both pipelines and
    # refit below for the GRU; every fit uses X_train.
    preprocessor = build_preprocessing_pipeline(numerical_cols, categorical_cols)

    print("\nTraining Model 1: Logistic Regression...")
    logit_pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
    ])

    logit_pipeline.fit(X_train, y_train)

    y_pred_logit = logit_pipeline.predict(X_test)
    y_prob_logit = logit_pipeline.predict_proba(X_test)[:, 1]
    evaluate_model("Logistic Regression (Test Set)", y_test, y_pred_logit, y_prob_logit)

    print("\nTraining Model 2: XGBoost...")
    # Negative-to-positive ratio, used to re-weight the positive class for
    # both XGBoost and the GRU loss; falls back to 1 when there are no positives.
    if (y_train == 1).sum() > 0:
        scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    else:
        scale_pos_weight = 1

    xgb_pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', XGBClassifier(use_label_encoder=False,
                                eval_metric='logloss',
                                random_state=42,
                                scale_pos_weight=scale_pos_weight
                               ))
    ])

    xgb_pipeline.fit(X_train, y_train)

    y_pred_xgb = xgb_pipeline.predict(X_test)
    y_prob_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]
    evaluate_model("XGBoost (Test Set)", y_test, y_pred_xgb, y_prob_xgb)

    print("\nTraining Model 3: GRU Network (PyTorch)...")

    print("Processing data for GRU...")
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)
    X_val_processed = preprocessor.transform(X_val)

    n_features = X_train_processed.shape[1]

    X_train_seq, y_train_seq = create_sequences(X_train_processed, y_train.values, LOOKBACK_DAYS)
    X_test_seq, y_test_seq = create_sequences(X_test_processed, y_test.values, LOOKBACK_DAYS)
    X_val_seq, y_val_seq = create_sequences(X_val_processed, y_val.values, LOOKBACK_DAYS)

    print(f"Created {X_train_seq.shape[0]} training sequences.")
    print(f"Created {X_test_seq.shape[0]} test sequences.")

    if X_train_seq.shape[0] == 0 or X_test_seq.shape[0] == 0:
        print("WARNING: Not enough data to train or test GRU model. Skipping.")
    else:
        train_dataset = TensorDataset(X_train_seq, y_train_seq)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

        # NOTE(review): early stopping monitors the *test* split, so the GRU
        # test-set report below is not fully out-of-sample; the validation
        # split evaluated later is the untouched hold-out.
        val_dataset = TensorDataset(X_test_seq, y_test_seq)
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

        test_pred_dataset = TensorDataset(X_test_seq)
        test_pred_loader = DataLoader(test_pred_dataset, batch_size=BATCH_SIZE, shuffle=False)

        model_gru = GRUModel(input_features=n_features).to(device)

        pos_weight_tensor = torch.tensor(scale_pos_weight, dtype=torch.float32).to(device)

        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
        optimizer = optim.Adam(model_gru.parameters(), lr=LEARNING_RATE)

        train_pytorch_model(
            model_gru,
            train_loader,
            val_loader,
            loss_fn,
            optimizer,
            EPOCHS,
            PATIENCE,
            device
        )

        y_pred_gru, y_prob_gru = predict_pytorch_model(model_gru, test_pred_loader, device)
        evaluate_model("GRU (Test Set)", y_test_seq, y_pred_gru, y_prob_gru)

    print("\n" + "="*50)
    print("--- Final Validation on Holdout Set ---")
    print("This is the 'true' unseen performance of your best model.")

    y_pred_val_xgb = xgb_pipeline.predict(X_val)
    y_prob_val_xgb = xgb_pipeline.predict_proba(X_val)[:, 1]
    evaluate_model("XGBoost (FINAL VALIDATION)", y_val, y_pred_val_xgb, y_prob_val_xgb)

    if model_gru is not None and X_val_seq.shape[0] > 0:
        val_pred_dataset = TensorDataset(X_val_seq)
        val_pred_loader = DataLoader(val_pred_dataset, batch_size=BATCH_SIZE, shuffle=False)

        y_pred_val_gru, y_prob_val_gru = predict_pytorch_model(model_gru, val_pred_loader, device)
        evaluate_model("GRU (FINAL VALIDATION)", y_val_seq, y_pred_val_gru, y_prob_val_gru)
    elif model_gru is not None:
        print("\nWARNING: Not enough data in validation set for GRU sequence. Skipping.")

    MODEL_DIR = 'final_model_checkpoints'
    os.makedirs(MODEL_DIR, exist_ok=True)

    # Best effort: the feature count is written to a text file next to the
    # checkpoints; a failure here only produces a warning.
    try:
        X_train_processed_temp = preprocessor.transform(X_train.head())
        n_features = X_train_processed_temp.shape[1]
        print(f"\nModel has {n_features} input features.")
        with open(os.path.join(MODEL_DIR, 'n_features.txt'), 'w') as f:
            f.write(str(n_features))
        print(f"Saved feature count ({n_features}) to {MODEL_DIR}/n_features.txt")
    except Exception as e:
        print(f"Warning: Could not save n_features: {e}")

    print("Returning trained models...")
    return logit_pipeline, xgb_pipeline, model_gru, preprocessor

# Entry point when this code is executed as the main module.
# NOTE(review): the file looks like a notebook export ("In [None]:" markers);
# if this cell runs with __name__ == "__main__", training executes here and
# again in the following cell, which calls main() itself -- confirm intended.
if __name__ == "__main__":
    main()

In [None]:
# Train all models via main() and persist the resulting artifacts to MODEL_DIR.
print("Starting model training...")

MODEL_DIR = 'final_model_checkpoints'
os.makedirs(MODEL_DIR, exist_ok=True)

# main() returns None when the data splits could not be loaded; unpacking
# that directly would raise a TypeError, so check before unpacking.
training_result = main()

if training_result is None:
    print("\nTraining did not run (data could not be loaded). Nothing to save.")
else:
    logit_pipeline, xgb_pipeline, model_gru, preprocessor = training_result

    print(f"\nTraining complete. Saving models to {MODEL_DIR}...")

    try:
        joblib.dump(logit_pipeline, os.path.join(MODEL_DIR, 'logit_pipeline.joblib'))
        joblib.dump(xgb_pipeline, os.path.join(MODEL_DIR, 'xgb_pipeline.joblib'))
        joblib.dump(preprocessor, os.path.join(MODEL_DIR, 'preprocessor.joblib'))
        print("Saved Logit, XGBoost, and Preprocessor pipelines.")

        # model_gru is None when the GRU stage was skipped for lack of data.
        if model_gru is not None:
            torch.save(model_gru.state_dict(), os.path.join(MODEL_DIR, 'gru_model.pth'))
            print("Saved GRU model state dictionary.")
        else:
            print("GRU model was not trained, skipping save.")

        print("\nAll models and artifacts saved successfully.")

    except Exception as e:
        # Best effort: report the failure without aborting the session.
        print(f"\nAn error occurred while saving models: {e}")

In [None]:
import pandas as pd
import numpy as np
import joblib
import os
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

# Directory the trained artifacts are read from.
MODEL_DIR = 'final_model_checkpoints'
# Sequence length and batch size used when building GRU inputs in this cell.
LOOKBACK_DAYS = 5 
BATCH_SIZE = 8
# Name of the label column in the CSV splits.
TARGET_COL = 'target' 
# Destination project and dataset for the BigQuery upload.
GCP_PROJECT_ID = 'pivotal-glider-472219-r7' 
BIGQUERY_DATASET = 'market_data'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# NOTE(review): if client creation fails only a message is printed and
# `bq_client` stays undefined, so the functions below that use it will raise
# NameError when called -- confirm this is acceptable.
try:
    bq_client = bigquery.Client(project=GCP_PROJECT_ID)
    print(f"BigQuery client initialized for project {GCP_PROJECT_ID}.")
except Exception as e:
    print(f"ERROR: Could not initialize BigQuery client. {e}")

def create_table_if_not_exists(table_name, schema, partition_field):
    """Create a day-partitioned BigQuery table unless it already exists.

    Args:
        table_name: Table name inside ``GCP_PROJECT_ID.BIGQUERY_DATASET``.
        schema: Schema passed to ``bigquery.Table``.
        partition_field: Column used for daily time partitioning.

    Raises:
        Exception: Any error raised while creating the table is printed and
            re-raised.  Errors from the existence check other than
            ``NotFound`` propagate unchanged.
    """
    full_table_id = ".".join((GCP_PROJECT_ID, BIGQUERY_DATASET, table_name))
    try:
        bq_client.get_table(full_table_id)
    except NotFound:
        print(f"Table {full_table_id} not found. Creating...")
        try:
            new_table = bigquery.Table(full_table_id, schema=schema)
            daily_partitioning = bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY,
                field=partition_field,
            )
            new_table.time_partitioning = daily_partitioning
            bq_client.create_table(new_table)
            print(f"  Success. Table {full_table_id} created.")
        except Exception as e:
            print(f"  --- ERROR creating table {table_name} ---: {e}")
            raise

def upload_to_bigquery(df, table_name, schema, partition_field='date'):
    """Replace the contents of a BigQuery table with the rows of ``df``.

    The table is created first if it does not exist.  Columns are coerced
    according to ``schema`` before a load job with ``WRITE_TRUNCATE`` is
    submitted.  Failures are printed; nothing is raised to the caller.

    Args:
        df: DataFrame to upload; it is copied, not modified.
        table_name: Table name inside ``GCP_PROJECT_ID.BIGQUERY_DATASET``.
        schema: Sequence of ``bigquery.SchemaField`` describing the table.
        partition_field: Column converted with ``pd.to_datetime`` and used for
            daily partitioning.

    Returns:
        None.
    """
    table_id = f"{GCP_PROJECT_ID}.{BIGQUERY_DATASET}.{table_name}"
    try:
        create_table_if_not_exists(table_name, schema, partition_field)
    except Exception as e:
        # The helper has already printed the underlying error.
        print(f"Aborting upload to {table_name} due to table creation error.")
        return
    
    df_upload = df.copy()
    # Note: the emptiness check runs after the table has been ensured above.
    if df_upload.empty:
        print(f"No data provided for {table_name}, skipping upload.")
        return

    df_upload[partition_field] = pd.to_datetime(df_upload[partition_field])

    # Coerce columns whose schema type is NUMERIC or INTEGER; columns in the
    # schema that are absent from the frame are skipped.
    for col_schema in schema:
        col_name = col_schema.name
        if col_name not in df_upload.columns:
            continue
        if col_schema.field_type == 'NUMERIC':
            df_upload[col_name] = pd.to_numeric(df_upload[col_name]).round(4)
        elif col_schema.field_type == 'INTEGER':
             # Via float first, then pandas' nullable integer dtype.
             df_upload[col_name] = df_upload[col_name].astype(float).astype('Int64')
             
    # Normalise missing values before the load.
    # NOTE(review): the three chained steps overlap in purpose and the
    # resulting dtypes depend on the pandas version -- confirm before changing.
    df_upload = df_upload.fillna(pd.NA).where(pd.notna(df_upload), None).replace({np.nan: None})
    
    print(f"Uploading {len(df_upload)} rows to {table_name} (replacing all data)...")
    # WRITE_TRUNCATE: existing table data is replaced by this load.
    job_config = bigquery.LoadJobConfig(
        schema=schema,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        time_partitioning=bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=partition_field,
        ),
    )
    
    try:
        job = bq_client.load_table_from_dataframe(df_upload, table_id, job_config=job_config)
        # Block until the load job finishes (raises on job failure).
        job.result()
        print(f"Success. Loaded {job.output_rows} rows into {table_id}.")
    except Exception as e:
        # Reported but not re-raised.
        print(f"--- FATAL ERROR uploading {table_name} ---")
        print(f"  {e}")

def load_data():
    """Read the train/test/validation CSV splits and split off the target.

    Returns:
        ``(X_train, y_train, X_test, y_test, X_val, y_val)``.  If any CSV file
        is missing, an error is printed and six ``None`` values are returned.
    """
    try:
        # The generator is consumed inside the try, reading train, test and
        # validation in that order.
        train_df, test_df, val_df = (
            pd.read_csv(csv_name)
            for csv_name in ('train_data.csv', 'test_data.csv', 'validation_data.csv')
        )
    except FileNotFoundError as e:
        print(f"Error: {e}. Cannot load data splits.")
        return None, None, None, None, None, None

    def _features_and_target(frame):
        # Features are every column except the target.
        return frame.drop(columns=[TARGET_COL]), frame[TARGET_COL]

    X_train, y_train = _features_and_target(train_df)
    X_test, y_test = _features_and_target(test_df)
    X_val, y_val = _features_and_target(val_df)

    return X_train, y_train, X_test, y_test, X_val, y_val

class GRUModel(nn.Module):
    """GRU -> ReLU -> dropout -> one-unit linear layer; outputs raw logits.

    The attribute names (``gru``, ``dense``) determine the ``state_dict`` keys
    of a saved checkpoint.
    """

    def __init__(self, input_features, gru_units=32, dropout_rate=0.3):
        """Create the layers for ``input_features`` inputs per time step."""
        super().__init__()
        recurrent_kwargs = dict(
            input_size=input_features,
            hidden_size=gru_units,
            batch_first=True,
        )
        self.gru = nn.GRU(**recurrent_kwargs)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.dense = nn.Linear(gru_units, 1)

    def forward(self, x):
        """Return one logit per sample computed from the final hidden state."""
        # The per-step outputs are not needed, only the last hidden state.
        _, h_last = self.gru(x)
        activated = self.relu(h_last.squeeze(0))
        dropped = self.dropout(activated)
        return self.dense(dropped)

def create_sequences(X: np.ndarray, y: np.ndarray, lookback: int):
    """Turn row-wise features into look-back windows paired with targets.

    Row ``i`` (for ``i >= lookback``) yields the window ``X[i - lookback:i]``
    and the target ``y[i]``.  Returns two float32 tensors; both are empty when
    ``len(X) <= lookback``.
    """
    total_rows = len(X)
    if total_rows <= lookback:
        return torch.empty(0, lookback, X.shape[1]), torch.empty(0)

    stops = range(lookback, total_rows)
    stacked_windows = np.array([X[stop - lookback:stop, :] for stop in stops], dtype=np.float32)
    stacked_targets = np.array([y[stop] for stop in stops], dtype=np.float32)
    return torch.from_numpy(stacked_windows), torch.from_numpy(stacked_targets)

def predict_pytorch_model(model, loader, device):
    """Return ``(preds, probs)`` for every sample yielded by ``loader``.

    ``probs`` is the sigmoid of the model's logits and ``preds`` is 1 where
    ``probs > 0.5`` and 0 otherwise.  Inputs are read from element 0 of each
    batch.  Two empty arrays are returned when ``loader`` yields nothing.
    """
    model.eval()
    per_batch = []
    with torch.no_grad():
        for batch in loader:
            inputs = batch[0].to(device)
            squeezed = torch.sigmoid(model(inputs)).squeeze()
            # A batch of one squeezes to 0-d; lift it back to length 1 so the
            # concatenation below works.
            per_batch.append(np.atleast_1d(squeezed.cpu().numpy()))

    if len(per_batch) == 0:
        return np.array([]), np.array([])

    all_probs = np.concatenate(per_batch)
    return (all_probs > 0.5).astype(int), all_probs

# Inference driver: load the saved artifacts, predict over all three data
# splits combined, and upload the per-date advice to BigQuery.
print("\nLoading all models and preprocessor...")
# NOTE(review): every artifact, including the GRU weights, is loaded in one
# try block; if any of them is missing the except branch prints FATAL and the
# else branch (all predictions and the upload) is skipped.
try:
    logit_pipeline = joblib.load(os.path.join(MODEL_DIR, 'logit_pipeline.joblib'))
    xgb_pipeline = joblib.load(os.path.join(MODEL_DIR, 'xgb_pipeline.joblib'))
    preprocessor = joblib.load(os.path.join(MODEL_DIR, 'preprocessor.joblib'))
    
    # Feature count stored as text; used to size the GRU input layer.
    with open(os.path.join(MODEL_DIR, 'n_features.txt'), 'r') as f:
        n_features = int(f.read())
        
    print(f"Loading GRU model with {n_features} input features.")
    model_gru = GRUModel(input_features=n_features).to(device)
    model_gru.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'gru_model.pth'), map_location=device))
    model_gru.eval()
    
    print("All models loaded.")
except Exception as e:
    print(f"FATAL: Could not load models. {e}")

# Runs only when every artifact above loaded without error.
else:
    print("Loading all data splits...")
    data = load_data()
    if data[0] is None:
        print("Could not load data. Aborting.")
    else:
        X_train, y_train, X_test, y_test, X_val, y_val = data
        
        # Re-attach the target so the three splits can be concatenated and sorted together.
        train_df_full = X_train.assign(target=y_train)
        test_df_full = X_test.assign(target=y_test)
        val_df_full = X_val.assign(target=y_val)
        
        all_df = pd.concat([train_df_full, test_df_full, val_df_full], axis=0)
        
        # NOTE(review): load_data() reads the CSVs without date parsing, so this
        # presumably sorts strings -- chronological only for ISO-style dates; confirm.
        all_df = all_df.sort_values(by='date').reset_index(drop=True)
        
        X_all = all_df.drop(columns=[TARGET_COL])
        y_all = all_df[TARGET_COL]
        
        all_dates = X_all['date']
        print(f"Loaded {len(X_all)} total data points.")

        # The combined frame includes the training split, so these predictions
        # are partly in-sample.
        print("Running Logistic Regression predictions...")
        y_pred_logit = logit_pipeline.predict(X_all)
        
        print("Running XGBoost predictions...")
        y_pred_xgb = xgb_pipeline.predict(X_all)
        
        print("Running GRU predictions...")
        X_all_processed = preprocessor.transform(X_all)
        X_all_seq, y_all_seq_truth = create_sequences(X_all_processed, y_all.values, LOOKBACK_DAYS)
        
        all_pred_dataset = TensorDataset(X_all_seq)
        all_pred_loader = DataLoader(all_pred_dataset, batch_size=BATCH_SIZE, shuffle=False)
        
        # Only the hard predictions are kept; the probabilities are discarded.
        y_pred_gru, _ = predict_pytorch_model(model_gru, all_pred_loader, device)

        print("Assembling results dataframe...")
        df_results = pd.DataFrame({
            'date': all_dates,
            'ground_truth_value': y_all
        })
        
        df_results['logit_pred'] = y_pred_logit
        df_results['xgboost_pred'] = y_pred_xgb
        
        # create_sequences yields no output for the first LOOKBACK_DAYS rows, so
        # the GRU predictions are aligned to the remaining rows; the leading
        # rows get a missing value in 'gru_pred'.
        gru_preds_series = pd.Series(y_pred_gru, index=df_results.index[LOOKBACK_DAYS:])
        df_results['gru_pred'] = gru_preds_series
        
        # Class label -> advice string; missing predictions map to no advice.
        advice_map = {0: 'Sell', 1: 'Buy', np.nan: None}
        
        df_results['ground_truth_advice'] = df_results['ground_truth_value'].map(advice_map)
        df_results['logit_advice'] = df_results['logit_pred'].map(advice_map)
        df_results['xgboost_advice'] = df_results['xgboost_pred'].map(advice_map)
        df_results['gru_advice'] = df_results['gru_pred'].map(advice_map)

        # Keep only the columns that appear in the upload schema below.
        df_results_final = df_results[[
            'date',
            'ground_truth_value',
            'ground_truth_advice',
            'logit_advice',
            'xgboost_advice',
            'gru_advice'
        ]]
        
        print("\nPrediction Results (head):")
        print(df_results_final.head(10))
        print("\nPrediction Results (tail):")
        print(df_results_final.tail())

        print("\nUploading results to BigQuery table 'model_results'...")
        
        results_schema = [
            bigquery.SchemaField('date', 'DATE'),
            bigquery.SchemaField('ground_truth_value', 'INTEGER'),
            bigquery.SchemaField('ground_truth_advice', 'STRING'),
            bigquery.SchemaField('logit_advice', 'STRING'),
            bigquery.SchemaField('xgboost_advice', 'STRING'),
            bigquery.SchemaField('gru_advice', 'STRING'),
        ]
        
        # upload_to_bigquery reports failures by printing, so the completion
        # message below is printed regardless of the upload outcome.
        upload_to_bigquery(
            df_results_final,
            'model_results',
            results_schema,
            partition_field='date'
        )
        
        print("\n--- Process Complete ---")