# 1. Configuration

In [5]:
import pandas as pd
import altair as alt
import joblib
from io import BytesIO
import os
import sys

# Add project root (one level up from notebooks/) to sys.path
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

# ---- FUNCTIONS ----

from src.extract_usage import extract_usage
from src.necessity_index import compute_necessity, index_scaler, qcut_labels
from src.column_detection import detect_freeform_col
from src.shortlist import shortlist_applications
from src.twinkl_originals import find_book_candidates
from src.preprocess_text import normalise_text 
from typing import Tuple

In [18]:
def load_heartfelt_predictor():
    """
    Load the serialised heartfelt pipeline from ``src/models``.

    The model file is located relative to the parent of the current working
    directory, so the kernel is expected to be running one level below the
    project root.

    Returns
    -------
    object
        Whatever ``joblib.load`` deserialises from ``heartfelt_pipeline.joblib``.
    """
    # Parent of the working directory is treated as the project root.
    parent_dir = os.path.join(os.getcwd(), '..')
    root = os.path.abspath(parent_dir)

    pipeline_file = os.path.join(root, "src", "models", "heartfelt_pipeline.joblib")

    # NOTE(review): joblib.load unpickles the file; only load model artefacts
    # from a trusted source.
    predictor = joblib.load(pipeline_file)
    return predictor


In [21]:
def load_and_process(raw_csv) -> Tuple[pd.DataFrame, str]:
    """
    Load an applications CSV and enrich it with scoring and labelling columns.

    Rows whose freeform answer is missing are dropped. The remaining rows gain:

    * ``word_count`` - number of whitespace-separated tokens in the answer,
    * the columns returned by ``compute_necessity`` (including
      ``necessity_index``, which is then rescaled with ``index_scaler``),
    * ``priority`` - label derived from ``necessity_index`` by ``qcut_labels``,
    * ``book_candidates`` - result of ``find_book_candidates``,
    * ``clean_text`` - the answer passed through ``normalise_text``,
    * ``is_heartfelt`` - prediction of the heartfelt pipeline on ``clean_text``.

    AI-powered usage extraction is currently disabled, so no ``Usage`` column
    is produced.

    Parameters
    ----------
    raw_csv
        Anything ``pd.read_csv`` accepts, e.g. a file path or a file-like
        object.

    Returns
    -------
    Tuple[pd.DataFrame, str]
        The processed DataFrame and the name of the detected freeform column.
    """
    # Read uploaded data.
    # NOTE(review): the sample output shows 'â£500' in clean_text, which looks
    # like a mis-decoded '£' - confirm the encoding of the uploaded CSV.
    df_orig = pd.read_csv(raw_csv)

    # Detect freeform column
    freeform_col = detect_freeform_col(df_orig)

    # Keep only rows that have a freeform answer. The explicit copy guarantees
    # the filtered frame owns its data, so adding columns below cannot raise
    # pandas' SettingWithCopyWarning.
    df_orig = df_orig[df_orig[freeform_col].notna()].copy()

    # Word count (missing answers were removed above, so no fill is needed)
    df_orig['word_count'] = df_orig[freeform_col].str.split().str.len()

    # Compute necessity scores and join them back on the row index
    scored = df_orig.join(df_orig[freeform_col].apply(compute_necessity))
    scored['necessity_index'] = index_scaler(scored['necessity_index'].values)
    scored['priority'] = qcut_labels(scored['necessity_index'])

    # Find Twinkl Originals candidates
    scored['book_candidates'] = find_book_candidates(scored, freeform_col)

    # Label heartfelt applications
    scored['clean_text'] = scored[freeform_col].map(normalise_text)
    model = load_heartfelt_predictor()
    scored['is_heartfelt'] = model.predict(scored['clean_text'].astype(str))

    # Usage extraction: AI-powered extraction is disabled for testing.
    # docs = df_orig[freeform_col].to_list()
    # scored['Usage'] = extract_usage(docs)

    return scored, freeform_col



In [22]:
# Run the processing pipeline on the feb-march CSV. The path is relative, so it
# is resolved against the kernel's current working directory.
df, freeform_col = load_and_process('data/feb-march-data.csv')

  is_primary = series.str.contains(pattern_level, case=False, na=False)


In [23]:
# Preview the first three processed rows.
# NOTE(review): the rendered output shows applicants' names, email addresses
# and postal addresses - clear or redact it before sharing the notebook.
df.head(3)

Unnamed: 0,Id,Date/Time Requested,Giveaway Title,Customer Name,Email Address,School Name,Postal Address,Address Line 2,Address City,Postcode,...,Unnamed: 11,word_count,necessity_index,urgency_score,severity_score,vulnerability_score,priority,book_candidates,clean_text,is_heartfelt
0,304399.0,01/03/2025 00:52,March Community Collection,Susan Bushnell,susan.bushnell@googlemail.com,Southfield Junior School,Shrivenham Road,Highworth,Swindon,SN6 7BZ,...,,69,0.25,0.0,0.0,0.0,medium,False,i would love to use it to spread the love of r...,True
1,305004.0,02/03/2025 19:52,March Community Collection,Sarah Arabestani,sarah.a@sandringhamnursery.com,Sandringham Nursery,16 Sandringham Road,Penylan,Cardiff,CF23 5BJ,...,,46,0.0625,0.0,0.0,0.0,low,False,we would like to introduce early years yoga an...,False
2,305493.0,05/03/2025 14:34,March Community Collection,Rebecca Asker,mrsrasker@gmail.com,Newhaven PRU Outreach,Newhaven Gardens,,Greenwich,SE96HR,...,,86,0.09375,0.0,0.0,1.0,low,False,â£500 would enable us to set up a small sensor...,True


In [None]:
# 2. 