# ScoreCard ML Pipeline Demo

This notebook demonstrates the refactored ScoreCard package with multi-horizon prediction support.

**Horizons:**
- **H1**: Predict the next scorecard (1 step ahead)
- **H2**: Predict the scorecard after next (2 steps ahead)

## 1. Imports and Setup

In [5]:
# NOTE(review): scratch/debug cell. Its execution count (5) is out of sequence
# with the cells that follow, so it was run after the pipeline cell rather than
# first. It does nothing for the demo; consider deleting it.
print("hi")

hi


In [1]:
# Standard imports
import warnings
# Blanket filter: with no category argument this suppresses every warning for
# the rest of the kernel session.
# NOTE(review): consider narrowing this to specific categories/modules so that
# genuine problems raised by later cells are not hidden.
warnings.filterwarnings('ignore')

# Import the scorecard package
from scorecard import (
    ScoreCardConfig,
    ScoreCardState,
    ConnectionManager,
    ScoreCardTextPrep,
    ScoreCardModeling,
    ScoreCardPipeline,
    ScoreCardRag,
    Horizon,
    SUPPORTED_HORIZONS,
    run_pipeline,
)

# Confirm the package imported and list the horizons it exposes as H<n> labels.
horizon_labels = [f"H{int(h)}" for h in SUPPORTED_HORIZONS]
print("All imports successful!")
print(f"Supported horizons: {horizon_labels}")

All imports successful!
Supported horizons: ['H1', 'H2']


## 2. Configuration

Configure the pipeline settings. You can customize paths, enable/disable stages, and set model parameters.

**Note:** File paths are automatically resolved relative to the package location, so you can run this notebook from any directory.

In [2]:
# Build the pipeline configuration.
# Both horizons use the same predetermined model key, so it is defined once.
PRESET_MODEL_KEY = "complete_main_words_only | no_downsample_weighted | count | {0: 0.5, 1: 1.35, 2: 1.15}"


def _preview_model_key(model_key, width=50):
    """Return a truncated preview of a model key for display.

    Args:
        model_key: The configured key string, or None when no key is preset.
        width: Number of leading characters to keep.

    Returns:
        The first `width` characters followed by "...", or a placeholder
        string when `model_key` is None (slicing None would raise TypeError).
    """
    if model_key is None:
        return "None (grid search)"
    return f"{model_key[:width]}..."


config = ScoreCardConfig(
    # === Data Sources ===
    sql_download=True,          # Set False to load from Elasticsearch instead

    # === Pipeline Stages ===
    enable_nlp=True,            # Run spaCy NLP enrichment
    build_models=True,          # Train ML models
    run_predictions=True,       # Generate predictions
    build_rag=True,             # Build RAG embeddings for GPT

    # === Model Configuration ===
    # Use predetermined model keys (fast) or set to None for grid search
    default_model_key_h1=PRESET_MODEL_KEY,
    default_model_key_h2=PRESET_MODEL_KEY,

    # === Training Settings ===
    training_length=5,          # Minimum notes needed per SID for training
)

print("Configuration created:")
print(f"  - SQL Download: {config.sql_download}")
print(f"  - Build Models: {config.build_models}")
print(f"  - Model Matrix JSON: {config.model_matrix_json}")
# Previews go through the helper so a key set to None does not crash the cell.
print(f"  - H1 Model Key: {_preview_model_key(config.default_model_key_h1)}")
print(f"  - H2 Model Key: {_preview_model_key(config.default_model_key_h2)}")

Configuration created:
  - SQL Download: True
  - Build Models: True
  - Model Matrix JSON: /home/jovyan/silver-iguana/spark_jan/prompts/model_matrix.json
  - H1 Model Key: complete_main_words_only | no_downsample_weighted ...
  - H2 Model Key: complete_main_words_only | no_downsample_weighted ...


## 3. Verify File Paths

Check that all required config files are accessible.

In [3]:
import os

# (label, path) pairs for the config files whose paths are read from `config`.
files_to_check = [
    ("Model Matrix JSON", config.model_matrix_json),
    ("SQL Query File", config.sql_query_file),
    ("GPT Prompt", config.gpt_prompt_location),
]

print("File Path Verification:")
print("=" * 60)
missing_files = []
for name, path in files_to_check:
    # isfile() rejects directories, and the truthiness guard keeps an unset
    # (None or empty) path from raising TypeError inside os.path.
    exists = bool(path) and os.path.isfile(path)
    status = "✓" if exists else "✗ MISSING"
    print(f"  {status} {name}: {path}")
    if not exists:
        missing_files.append(name)

# Kept as a named flag so later cells can still consult it.
all_ok = not missing_files

if all_ok:
    print("\nAll files found! Ready to run pipeline.")
else:
    print("\n⚠️  Some files are missing. Check your prompts/ folder.")

File Path Verification:
  ✓ Model Matrix JSON: /home/jovyan/silver-iguana/spark_jan/prompts/model_matrix.json
  ✓ SQL Query File: /home/jovyan/silver-iguana/spark_jan/prompts/sql_query.txt
  ✓ GPT Prompt: /home/jovyan/silver-iguana/spark_jan/prompts/GPT_Prompt.txt

All files found! Ready to run pipeline.


## 4. Run the Full Pipeline

The `run_pipeline()` function handles all initialization and execution.

In [4]:
# End-to-end run. Depending on the stage flags, this will:
#   1. Pull data from SQL (or read it back from ES)
#   2. Enrich the notes with spaCy NLP
#   3. Build sliding windows for each SID
#   4. Train models for the H1 and H2 horizons
#   5. Produce predictions
#   6. Build RAG embeddings (optional)
#
# NOTE(review): only the five stage flags are forwarded; the other fields set
# on `config` (model keys, training_length) are not passed here. Confirm that
# run_pipeline picks them up some other way.
STAGE_FLAGS = ("sql_download", "enable_nlp", "build_models", "run_predictions", "build_rag")
stage_kwargs = {flag: getattr(config, flag) for flag in STAGE_FLAGS}

state, pipeline, rag = run_pipeline(**stage_kwargs)

[INIT] 	Initializing ScoreCardState...
[INIT] 	Loading model matrices from: /home/jovyan/silver-iguana/spark_jan/prompts/model_matrix.json
[INIT] 	  -> 3 feature sets
[INIT] 	  -> 3 sampling strategies
[INIT] 	  -> 1 vectorization configs
[INIT] 	  -> 3 class weight configs
[INIT] 	Initializing spaCy model: en_core_web_trf
[INIT] 	  -> Activating GPU for spaCy...
[INIT] 	  -> Loading model 'en_core_web_trf'...
[INIT] 	  -> spaCy model loaded in 2.4s
[INIT] 	Loading SQL query from: /home/jovyan/silver-iguana/spark_jan/prompts/sql_query.txt
[INIT] 	  -> SQL query loaded (1475 chars)
[INIT] 	Loading GPT prompt from: /home/jovyan/silver-iguana/spark_jan/prompts/GPT_Prompt.txt
[INIT] 	  -> GPT prompt loaded (2236 chars)
[INIT] 	ScoreCardState initialization complete!
[CONN] 	ConnectionManager initializing...
[CONN] 	----------------------------------------
[CONN] 	Connecting to Elasticsearch: http://localhost:9200
[CONN] 	  -> Elasticsearch connection established!
[CONN] 	Connecting to SQL 

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

[EMBD] 	Token count stats: {
  "count": 29281.0,
  "mean": 190.2069601448038,
  "std": 328.3067443233864,
  "min": 0.0,
  "25%": 32.0,
  "50%": 95.0,
  "75%": 243.0,
  "max": 14083.0
}
[ES] 	Deleted existing index 'scorecard_rag_notes'
[ES] 	Created index 'scorecard_rag_notes' with vector mapping
[ES] 	Indexed 29281 documents to 'scorecard_rag_notes'


## 5. Inspect Results

In [6]:
# Preview the enriched dataframe: shape, first ten column names, first rows.
rule = "=" * 60
print(rule)
print("ENRICHED DATAFRAME")
print(rule)

enriched_notes = state.enriched_df
if enriched_notes is None:
    print("Not available")
else:
    first_columns = list(enriched_notes.columns[:10])
    print(f"Shape: {enriched_notes.shape}")
    print(f"Columns: {first_columns}...")
    display(enriched_notes.head(3))

ENRICHED DATAFRAME
Shape: (29281, 42)
Columns: ['SID', 'Scorecard_Detail_Note_SID', 'Scorecard_Note', 'Note_Year', 'Note_Month', 'PO_Number', 'PO_Contract_Type', 'PO_Complexity_Level', 'PO_Lifecycle_Phase', 'Supplier_Name']...


Unnamed: 0,SID,Scorecard_Detail_Note_SID,Scorecard_Note,Note_Year,Note_Month,PO_Number,PO_Contract_Type,PO_Complexity_Level,PO_Lifecycle_Phase,Supplier_Name,...,Archive_Indicator,sid_key,pre_scrub_text,verbs,adjectives,noun_chunks,main_words,target,next_color_code,note_history
0,1,927,PO Contract value in the General Section does ...,2017,12,HA80E5771,CPFF,1,Production,NORTHROP GRUMMAN SYSTEMS CORPORATION,...,Y,000001.2017.12.000927,PO Contract value in the General Section does ...,reflect,total basic,PO Contract value the General Section the Tota...,reflect total basic po contract value the gene...,0,0,
1,1,1,OLGASim LOS Bias Error Insertion8,2018,2,HA80E5771,CPFF,1,Production,NORTHROP GRUMMAN SYSTEMS CORPORATION,...,Y,000001.2018.02.000001,OLGASim LOS Bias Error Insertion8,,,OLGASim LOS Bias Error Insertion8,olgasim los bias error insertion8,0,0,927
2,1,2,conduct initial delivery 4 MCSB-2 IT&E capabil...,2018,2,HA80E5771,CPFF,1,Production,NORTHROP GRUMMAN SYSTEMS CORPORATION,...,Y,000001.2018.02.000002,conduct initial delivery 4 MCSB-2 IT&E capabil...,conduct,initial complete,initial delivery 4 MCSB-2,conduct initial complete initial delivery 4 mc...,0,0,927;1


In [7]:
# Inspect the per-SID history windows held in state.sid_df.
print("=" * 60)
print("SID HISTORY WINDOWS (Training Data)")
print("=" * 60)
if state.sid_df is not None:
    sid_windows = state.sid_df
    print(f"Shape: {sid_windows.shape}")

    # Show key columns (only those actually present in the frame)
    key_cols = ['sid', 'trainable', 'trainable_h2', 'target', 'target_h2', 'color_set', 'all_green']
    available_cols = [c for c in key_cols if c in sid_windows.columns]
    display(sid_windows[available_cols].head(5))

    # Stats: every horizon flag is checked before use, so a frame without a
    # 'trainable' column no longer raises KeyError.
    print()
    for label, flag_col in (("H1", "trainable"), ("H2", "trainable_h2")):
        if flag_col in sid_windows.columns:
            print(f"{label} Trainable rows: {sid_windows[flag_col].sum()}")
else:
    print("Not available")

SID HISTORY WINDOWS (Training Data)
Shape: (25964, 33)


Unnamed: 0,sid,trainable,trainable_h2,target,target_h2,color_set,all_green
0,1,True,True,0,0,GGGG,1
1,1,True,True,0,0,GGGG,1
2,1,True,True,0,0,GGGG,1
3,1,True,True,0,0,GGGG,1
4,1,True,True,0,0,GGGG,1



H1 Trainable rows: 25048
H2 Trainable rows: 24193


In [8]:
# Report the selected model key and its recorded metrics for each horizon.
print("=" * 60)
print("MODEL RESULTS BY HORIZON")
print("=" * 60)

for h_int, key in state.best_model_key_by_horizon.items():
    print(f"\n--- Horizon H{h_int} ---")
    print(f"Best Model Key: {key}")

    model_info = state.best_model_by_horizon.get(h_int, {})
    if not model_info:
        # Nothing recorded for this horizon; only the key is shown.
        continue

    # Apply the numeric format only when a value exists: formatting the 'N/A'
    # fallback string with ':.4f' raises ValueError.
    accuracy = model_info.get('accuracy')
    accuracy_text = "N/A" if accuracy is None else f"{accuracy:.4f}"
    print(f"Accuracy: {accuracy_text}")
    print(f"False Negatives: {model_info.get('total_false_negatives', 'N/A')}")
    print(f"Class Weights: {model_info.get('class_weights', 'N/A')}")

MODEL RESULTS BY HORIZON

--- Horizon H1 ---
Best Model Key: H1 | complete_main_words_only | no_downsample_weighted | count | Weights {0: 0.5, 1: 1.35, 2: 1.15}
Accuracy: 0.9651
False Negatives: 83
Class Weights: {0: 0.5, 1: 1.35, 2: 1.15}

--- Horizon H2 ---
Best Model Key: H2 | complete_main_words_only | no_downsample_weighted | count | Weights {0: 0.5, 1: 1.35, 2: 1.15}
Accuracy: 0.9731
False Negatives: 57
Class Weights: {0: 0.5, 1: 1.35, 2: 1.15}


In [9]:
# Preview the prediction columns and the predicted-color counts per horizon.
print("=" * 60)
print("PREDICTIONS")
print("=" * 60)

predictions = state.complete_df
if predictions is None:
    print("Not available")
else:
    print(f"Complete DataFrame shape: {predictions.shape}")

    # H1 columns are always requested; the H2 columns are requested only when
    # the H2 predicted color is present.
    wanted_cols = ['sid_key', 'predicted_color', 'prob_green', 'prob_yellow', 'prob_red']
    if 'predicted_color_h2' in predictions.columns:
        wanted_cols += ['predicted_color_h2', 'prob_green_h2', 'prob_yellow_h2', 'prob_red_h2']

    shown_cols = [col for col in wanted_cols if col in predictions.columns]
    display(predictions[shown_cols].head(10))

    # One value_counts table per horizon whose prediction column exists.
    for horizon_label, color_col in (("H1", "predicted_color"), ("H2", "predicted_color_h2")):
        if color_col in predictions.columns:
            print(f"\n{horizon_label} Prediction Distribution:")
            print(predictions[color_col].value_counts())

PREDICTIONS
Complete DataFrame shape: (29281, 61)


Unnamed: 0,sid_key,predicted_color,prob_green,prob_yellow,prob_red,predicted_color_h2,prob_green_h2,prob_yellow_h2,prob_red_h2
0,000001.2017.12.000927,,,,,,,,
1,000001.2018.02.000001,,,,,,,,
2,000001.2018.02.000002,,,,,,,,
3,000001.2018.02.000003,Green,0.981678,0.017538,0.000784,Green,0.996945,0.002598,0.000457
4,000001.2018.02.000004,Green,0.988801,0.0103,0.0009,Green,0.992948,0.006205,0.000846
5,000001.2018.02.000005,Green,0.98819,0.011203,0.000607,Green,0.987267,0.012031,0.000702
6,000001.2018.02.000006,Green,0.976389,0.021143,0.002468,Green,0.976663,0.019566,0.003771
7,000001.2018.06.002229,Green,0.968093,0.029249,0.002658,Green,0.971008,0.025938,0.003054
8,000001.2019.01.001001,Green,0.987039,0.011908,0.001053,Green,0.987257,0.012049,0.000695
9,000001.2019.01.002228,Green,0.971077,0.027122,0.001801,Green,0.963485,0.035288,0.001227



H1 Prediction Distribution:
predicted_color
Green     20864
Yellow     4087
Red        1013
Name: count, dtype: int64

H2 Prediction Distribution:
predicted_color_h2
Green     20831
Yellow     4128
Red        1005
Name: count, dtype: int64


## 6. Manual Step-by-Step Execution (Alternative)

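If you prefer more control, you can run each component manually. The example below is wrapped in a string so it does not execute as-is; copy it into a new cell to run it, keeping in mind that it re-creates `config`, `state`, and `rag`: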

In [None]:
# Manual execution example (commented out - uncomment to use)
#
# The walkthrough below is a bare triple-quoted string, so none of it runs when
# this cell executes; only the final print() does. To use it, remove the
# surrounding triple quotes or paste the contents into a new cell.
#
# NOTE(review): running it rebinds `config`, `state` and `rag`, replacing the
# objects produced by the earlier cells.
# NOTE(review): the anchor_sid_key value looks like a placeholder - substitute
# a sid_key that exists in your data (TODO confirm).
"""
# Step 1: Create config and state
config = ScoreCardConfig(
    sql_download=False,  # Load from ES instead
    build_models=True,
)
state = ScoreCardState(config=config)

# Step 2: Initialize connections
conn = ConnectionManager(config=config, state=state)

# Step 3: Load data from Elasticsearch
state.enriched_df = conn.load_from_es(index_name="scorecard_enriched", id_col="sid_key")
state.sid_df = conn.load_from_es(index_name="scorecard_sid_history")

# Step 4: Train models
modeler = ScoreCardModeling(config=config, state=state, conn=conn)
modeler.build_model_grid()

# Train H1
modeler.find_best_model(horizon=Horizon.H1)
modeler.predict_with_best_model(state.sid_df, horizon=Horizon.H1)

# Train H2
modeler.find_best_model(horizon=Horizon.H2)
modeler.predict_with_best_model(state.sid_df, horizon=Horizon.H2)

# Merge predictions
modeler.merge_data()

# Step 5: Generate GPT justifications (optional)
rag = ScoreCardRag(config=config, state=state, conn=conn)
rag.embed_and_index_notes()
rag.generate_justifications(anchor_sid_key="000123.2024.06.000456", printer=True)
"""
# Printed so the cell shows some output even though the example is inert.
print("Manual execution example is available in comments above.")

## 7. RAG and GPT Justifications

In [None]:
# Pick a sample note for a GPT justification.
# (The RAG index must already be built before generating one.)

complete_frame = state.complete_df
if complete_frame is None or len(complete_frame) == 0:
    print("No data available for RAG demo")
else:
    # Use the first row's sid_key as the sample anchor
    sample_sid_key = complete_frame['sid_key'].iloc[0]
    print(f"Sample sid_key: {sample_sid_key}")

    # Uncomment to generate justification:
    # rag.generate_justifications(anchor_sid_key=sample_sid_key, printer=True)

## 8. Summary

In [None]:
# Final recap: row counts for each state dataframe plus the trained horizons.
print("=" * 60)
print("PIPELINE SUMMARY")
print("=" * 60)

# A dataframe that was never populated (None) is reported as 0 rows.
state_frames = (
    ("Enriched Notes", state.enriched_df),
    ("Training Windows", state.sid_df),
    ("Complete DataFrame", state.complete_df),
)
summary = {label: (0 if frame is None else len(frame)) for label, frame in state_frames}

best_keys = state.best_model_key_by_horizon
summary["Models Trained"] = len(best_keys)
summary["Horizons"] = list(best_keys.keys())

for label, count in summary.items():
    print(f"{label}: {count}")

print("\nPipeline complete!")