<a href="https://colab.research.google.com/github/norman-AI-2025/hackathon-2025/blob/main/run_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- 1. Install Dependencies ---
print("Installing required libraries...")
!pip install import-ipynb -q
!pip install transformers -q
!pip install kagglehub[pandas-datasets] -q
print("Libraries installed: import-ipynb, transformers, kagglehub.")

# --- 2. Load Data using kagglehub ---
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import import_ipynb
import os

# Define the file and dataset path
KAGGLE_DATASET_PATH = "mirzahasnine/loan-data-set"
KAGGLE_FILE_PATH = "loan_test.csv"

# Load order: Kaggle first, then a local copy of the same file, and finally an
# empty DataFrame so that later code can detect "no data" through `df.empty`.
try:
    print(f"\nLoading '{KAGGLE_FILE_PATH}' from Kaggle...")
    df = kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        KAGGLE_DATASET_PATH,
        KAGGLE_FILE_PATH,
    )
    print(f"Successfully loaded {len(df)} rows.")

except Exception as e:
    # Deliberately broad: any Kaggle failure triggers the local fallback.
    # The cause is reported rather than discarded so the failure can be diagnosed.
    print(
        f"ERROR: Could not load data from KaggleHub ({type(e).__name__}: {e}). "
        f"Using local '{KAGGLE_FILE_PATH}' fallback."
    )
    if os.path.exists(KAGGLE_FILE_PATH):
        df = pd.read_csv(KAGGLE_FILE_PATH)
        print(f"Successfully loaded {len(df)} rows from local '{KAGGLE_FILE_PATH}'.")
    else:
        print(f"ERROR: Local '{KAGGLE_FILE_PATH}' not found either.")
        df = pd.DataFrame()


# --- 3. Prepare 'notes' column (Required for text_analysis) ---
# Every row receives one of a fixed set of example notes, assigned in order
# and wrapping around to the first note once the set is exhausted.
if not df.empty:
    sample_notes = [
        "Applicant has a flawless record and high net worth. Highly recommended.",
        "Co-applicant's credit score is alarming, major red flag for this loan.",
        "Standard application, everything appears to be in order.",
        "The client is a top tier applicant, approve immediately.",
        "I have several concerns about the income verification documents.",
        "Credit history is average, loan amount is quite high.",
        "Uncertainty around employment status.",
        "No issues found, excellent candidate.",
    ]

    num_rows = len(df)
    # Whole repetitions of the note list, plus a partial pass for the rest.
    full_passes, leftover = divmod(num_rows, len(sample_notes))
    df['notes'] = sample_notes * full_passes + sample_notes[:leftover]
    print("Data preparation complete. Ready to run models.")

Installing required libraries...
Libraries installed: import-ipynb, transformers, kagglehub.

Loading 'loan_test.csv' from Kaggle...
Using Colab cache for faster access to the 'loan-data-set' dataset.
Successfully loaded 367 rows.
Data preparation complete. Ready to run models.


In [None]:
# --- 4. Import Functions from Model Notebooks (FIXED) ---
import sys
import os
import import_ipynb

print("Verifying current directory and forcing import path...")

# Define the source path in Google Drive
# NOTE(review): nothing visible here mounts Drive; presumably it is mounted
# before this cell runs — confirm, otherwise every file below is reported missing.
model_notebooks_source_path = '/content/drive/MyDrive/Colab Notebooks/'

# Define the destination path (current working directory)
local_destination_path = './'

# Local import: used only by the copy loop below.
import shutil

# Copy the .ipynb files from Google Drive to the current directory
print(f"Copying model notebooks from '{model_notebooks_source_path}' to '{local_destination_path}'...")
required_notebooks = ['numeric_model.ipynb', 'text_analysis.ipynb', 'fusion_model.ipynb']
for notebook_file in required_notebooks:
    source_file = os.path.join(model_notebooks_source_path, notebook_file)
    destination_file = os.path.join(local_destination_path, notebook_file)
    if not os.path.exists(source_file):
        print(f"WARNING: {notebook_file} not found at {source_file}. This might cause import errors.")
        continue
    try:
        # shutil.copy raises on failure, so "Copied" is only printed when the
        # file really was copied (a failing shell `cp` would go unnoticed).
        shutil.copy(source_file, destination_file)
    except OSError as copy_error:
        print(f"WARNING: Failed to copy {notebook_file}: {copy_error}. This might cause import errors.")
    else:
        print(f"Copied {notebook_file}")

# Add the current directory to sys.path to ensure imported notebooks are found
# Even if copied, it's good practice to ensure the current dir is in sys.path
if local_destination_path not in sys.path:
    sys.path.insert(0, local_destination_path)
    print(f"Added '{local_destination_path}' to system search path.")

# Now we run the imports.
# The three model notebooks are imported as modules (via import_ipynb) and the
# functions the pipeline needs are bound to short module-level names.
try:
    print("Attempting imports...")
    import numeric_model
    import text_analysis
    import fusion_model

    # Alias the imported functions
    compute_numeric_risk_scores = numeric_model.compute_numeric_risk_scores
    compute_text_score = text_analysis.compute_text_score
    compute_fusion_risk = fusion_model.compute_fusion_risk

    print("Imports successful. Starting pipeline execution.")

except ModuleNotFoundError as e:
    print(f"\nFATAL ERROR: ModuleNotFoundError: {e}")
    print("\nTroubleshooting: Please ensure ALL of these files are:")
    print(f"  - present in {model_notebooks_source_path} OR successfully copied to {local_destination_path}")
    print(f"  - {required_notebooks}")
    # Re-raise to stop execution here. In a Jupyter kernel `exit()` does not
    # raise: it only requests a kernel shutdown, so the remaining code in the
    # cell would still run and fail on the missing function names.
    raise


# --- 5. Execute the Full Pipeline ---
# The DataFrame 'df' is expected to have been created by Cell 1; without a
# non-empty 'df' nothing is computed and only a hint is printed.
if 'df' not in locals() or df.empty:
    print("Cannot run pipeline because the DataFrame 'df' is empty or not loaded. Check Cell 1.")

else:
    print("\n--- Starting Risk Scoring Pipeline ---")

    # A. Numeric model: its result is the input of the fusion step.
    print("1. Calculating numeric risk scores...")
    df_intermediate = compute_numeric_risk_scores(df)

    # B. Fusion model: receives the text scorer as a callable plus the alpha weight.
    print("\n2. Calculating fusion risk scores (Alpha=0.7)...")
    final_df = compute_fusion_risk(df_intermediate, compute_text_score=compute_text_score, alpha=0.7)

    print("\n--- Pipeline Complete ---\n")

    # --- 6. Display Results for Testing ---
    # Columns shown in the preview table, in display order.
    preview_columns = [
        'Applicant_Income',
        'Credit_History',
        'notes',
        'numeric_score',
        'text_score',
        'risk_score',
        'risk_label',
    ]
    print("Final Results (First 10 rows):")
    preview_table = final_df[preview_columns].head(10).to_markdown(index=False)
    print(preview_table)

Verifying current directory and forcing import path...
Copying model notebooks from '/content/drive/MyDrive/Colab Notebooks/' to './'...
Copied numeric_model.ipynb
Copied text_analysis.ipynb
Copied fusion_model.ipynb
Added './' to system search path.
Attempting imports...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Sentiment classifier loaded successfully.
Imports successful. Starting pipeline execution.

--- Starting Risk Scoring Pipeline ---
1. Calculating numeric risk scores...

2. Calculating fusion risk scores (Alpha=0.7)...
  -> Calculating text scores...
  -> Fusion complete and risk labels assigned.

--- Pipeline Complete ---

Final Results (First 10 rows):
|   Applicant_Income |   Credit_History | notes                                                                   |   numeric_score |   text_score |   risk_score | risk_label   |
|-------------------:|-----------------:|:------------------------------------------------------------------------|----------------:|-------------:|-------------:|:-------------|
|             572000 |                1 | Applicant has a flawless record and high net worth. Highly recommended. |         35.634  |    0.0132442 |      24.9478 | Low Risk     |
|             307600 |                1 | Co-applicant's credit score is alarming, major red flag for this

In [None]:
# List the current working directory; -F appends a type marker to each entry
# (for example '/' after directory names).
!ls -F

drive/	sample_data/
