<a href="https://colab.research.google.com/github/norman-AI-2025/hackathon-2025/blob/main/text_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# text_analysis.py

from typing import Iterable, List, Optional

import pandas as pd
from transformers import pipeline

# ---------------- TEXT RISK MODEL ---------------- #

# Phrases that signal risky payment behaviour when they show up in free text.
# compute_keyword_risk lowercases the text and checks each entry with a plain
# substring test, so entries are kept lowercase and a short stem such as
# "bankrupt" also hits longer words like "bankruptcy".
RISK_KEYWORDS = [
    "late payment",
    "delay",
    "default",
    "high debt",
    "missed payment",
    "missed invoice",
    "overdue",
    "collection",
    "bankrupt",
    "fraud",
    "scam",
    "dispute",
    "cannot pay",
    "cannot repay",
    "legal action",
    "court case",
    "restructuring",
]

def compute_keyword_risk(
    text: str,
    keywords: Optional[Iterable[str]] = None,
) -> int:
    """
    Score a piece of text by counting which risk keywords occur in it.

    Matching is a case-insensitive substring test, and each keyword entry
    counts at most once no matter how often it repeats in the text.

    Args:
        text: Free-form text to scan. Anything that is not a non-blank string
            (None, NaN, "", whitespace only) scores 0.
        keywords: Phrases to look for. When omitted, the module-level
            RISK_KEYWORDS list is used. Empty entries are ignored.

    Returns:
        10 points per keyword found, capped at 60.
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    # Resolve the default lazily so callers can supply their own vocabulary
    if keywords is None:
        keywords = RISK_KEYWORDS

    lowered = text.lower()
    # Skip empty keywords: "" is a substring of every string and would
    # otherwise count as a hit for any text.
    hits = sum(1 for kw in keywords if kw and kw.lower() in lowered)

    # Cap at 6 hits so the keyword component never exceeds 60 points
    return min(hits, 6) * 10


# Module-level slot holding the sentiment pipeline once it has been built,
# so the model is constructed at most once per process.
_SENTIMENT_CLASSIFIER = None

def get_sentiment_classifier():
    """
    Return the shared sentiment-analysis pipeline, building it on first use.

    The first call constructs a transformers "sentiment-analysis" pipeline and
    stores it in the module-level cache; later calls return that same object.
    """
    global _SENTIMENT_CLASSIFIER

    # Fast path: already built on an earlier call
    if _SENTIMENT_CLASSIFIER is not None:
        return _SENTIMENT_CLASSIFIER

    _SENTIMENT_CLASSIFIER = pipeline("sentiment-analysis")
    return _SENTIMENT_CLASSIFIER


def compute_sentiment_risk(text: str) -> float:
    """
    Turn negative sentiment into a risk contribution of 0–40 points.

    Args:
        text: Free-form text to classify. Anything that is not a non-blank
            string (None, NaN, "", whitespace only) scores 0.0 and the
            classifier is never loaded for it.

    Returns:
        The classifier's score multiplied by 40 when the predicted label
        starts with "NEG" (case-insensitive); 0.0 for any other label.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    classifier = get_sentiment_classifier()

    # The 512-character slice keeps inference cheap, but it limits characters,
    # not tokens: punctuation- or digit-heavy text can tokenize to about one
    # token per character, which together with the special tokens may exceed
    # the model's maximum sequence length. Asking the tokenizer to truncate
    # keeps such inputs from failing at inference time.
    result = classifier(text[:512], truncation=True)[0]
    label = result["label"]
    score = float(result["score"])

    if label.upper().startswith("NEG"):
        # Strong negative sentiment => up to 40 points
        return score * 40.0

    # Positive/neutral → no extra risk from sentiment
    return 0.0


def compute_text_score(text: str) -> float:
    """
    Produce the overall text risk score on a 0–100 scale.

    The score is the keyword points from compute_keyword_risk plus the
    sentiment points from compute_sentiment_risk, clamped to [0, 100].
    """
    combined = compute_keyword_risk(text) + compute_sentiment_risk(text)

    # Clamp from above first, then from below
    capped = min(100.0, combined)
    return max(0.0, capped)


def add_text_scores_to_df(
    df: pd.DataFrame,
    text_col: str = "notes",
    output_col: str = "text_score",
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a text risk score column added.

    Each value in ``text_col`` is passed through compute_text_score and the
    result is stored under ``output_col``. The input frame is not modified.
    """
    scored = df.copy()
    text_values = scored[text_col]
    scored[output_col] = text_values.apply(compute_text_score)
    return scored
