In [None]:
import spacy
from dataclasses import dataclass
from typing import List, Dict, Tuple
import subprocess

# =============================================================================
# DATA STRUCTURE
# =============================================================================
@dataclass
class ExtractedFeatures:
    """
    Bundle of features produced by the upstream extraction steps for one answer.

    Attributes:
        entities: Entity strings (the LLM extraction step produces these).
        key_phrases: Key phrases (the KeyBERT step produces these).
        text: The original, unprocessed answer text.
    """
    entities: List[str]
    key_phrases: List[str]
    text: str

# =============================================================================
# LINGUISTIC HEURISTIC CLASSIFIER (FIXED)
# =============================================================================

class ResearchInspiredClassifier:
    """
    Classify an answer as "actionable" or "informative" with spaCy-based rules.

    Every rule that fires adds one point to one of two counters. The label is
    "actionable" only when the actionable counter is strictly larger, so a tie
    (including an input on which no rule fires) is labelled "informative".
    """

    def __init__(self):
        """Load the ``en_core_web_sm`` pipeline, downloading it if it is missing."""
        import sys  # local import: this cell's import block does not provide it

        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy reports a missing model package as OSError. Catching only
            # that keeps unrelated failures (and Ctrl-C) from being swallowed.
            print("Downloading spaCy model 'en_core_web_sm'...")
            # Use the running interpreter so the model is installed into the
            # environment this code executes in; a bare "python" resolved via
            # PATH may belong to a different environment.
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # If the download did not succeed this raises OSError to the caller.
            self.nlp = spacy.load('en_core_web_sm')

        self.sequential_markers = {'first', 'second', 'then', 'next', 'finally', 'subsequently'}
        self.second_person_pronouns = {'you', 'your', 'yours'}

    def _root_pos(self, fragment: str):
        """Return the coarse POS tag of the root of the first sentence, or None if there is no sentence."""
        sents = list(self.nlp(fragment).sents)
        return sents[0].root.pos_ if sents else None

    @staticmethod
    def _is_subjectless_command(token) -> bool:
        """True for a base-form (VB) ROOT verb that has no subject child, i.e. a likely imperative."""
        if not (token.pos_ == 'VERB' and token.dep_ == 'ROOT' and token.tag_ == 'VB'):
            return False
        return not any(child.dep_ in ('nsubj', 'nsubjpass') for child in token.children)

    def predict(self, extracted: "ExtractedFeatures") -> Tuple[str, Dict[str, int]]:
        """
        Label one answer from its key phrases, entities and full text.

        Args:
            extracted: Object exposing ``key_phrases``, ``entities`` and ``text``.

        Returns:
            ``(label, scores)`` where ``label`` is "actionable" or "informative"
            and ``scores`` holds the integer counters "actionable_score" and
            "informative_score".
        """
        import re  # local import: this cell's import block does not provide it

        scores = {
            "actionable_score": 0,
            "informative_score": 0
        }

        # --- 1. Key phrases: a verb-rooted phrase counts as an action cue ---
        for phrase in extracted.key_phrases:
            if self._root_pos(phrase) == 'VERB':
                scores["actionable_score"] += 1

        # --- 2. Entities: proper-noun roots count as informative, common-noun roots as actionable ---
        for entity in extracted.entities:
            root_pos = self._root_pos(entity)
            if root_pos == 'PROPN':
                scores["informative_score"] += 1
            elif root_pos == 'NOUN':
                scores["actionable_score"] += 1

        # --- 3. Token-level cues over the full text ---
        for token in self.nlp(extracted.text):
            # A) Modal verbs (must, should, can, ...)
            if token.tag_ == 'MD':
                scores["actionable_score"] += 1

            # B) Second-person pronouns
            if token.lower_ in self.second_person_pronouns:
                scores["actionable_score"] += 1

            # C) Imperative verbs (commands)
            if self._is_subjectless_command(token):
                scores["actionable_score"] += 1

            # D) Infinitive marker attached to a verb (e.g. "to file")
            if token.lower_ == 'to' and token.dep_ == 'aux' and token.head.pos_ == 'VERB':
                scores["actionable_score"] += 1

        # E) Sequential markers, one point per distinct marker present.
        # Matched on word boundaries: a plain substring test also fired on
        # "context" (next), "strengthen" (then) or "seconds" (second).
        text_lower = extracted.text.lower()
        for marker in self.sequential_markers:
            if re.search(r'\b' + re.escape(marker) + r'\b', text_lower):
                scores["actionable_score"] += 1

        # --- 4. Decision: ties go to "informative" ---
        if scores["actionable_score"] > scores["informative_score"]:
            label = "actionable"
        else:
            label = "informative"

        return label, scores

# =============================================================================
# EXAMPLE USAGE (HOW TO RUN IT)
# =============================================================================

# 1. Instantiate the classifier (loads the spaCy model, so this can take a moment)
print("Loading classifier...")
classifier = ResearchInspiredClassifier()
print("Classifier loaded successfully.")

# 2. Your sample data
# The typographic quotes/apostrophe in `text` and the emoji in the banner below
# had been stored as mis-decoded UTF-8 (mojibake); they are restored here.
sample1 = ExtractedFeatures(
    entities=['ews category reservations', 'residential plot', 'residential flat', 'ews reservation', 'family definition', 'agricultural land', 'gross annual income', 'ews certificate', 'general candidate'],
    key_phrases=['eligible ews certificate satisfy', 'seeks benefit reservation parents', 'rs lakhs includes', 'land property owned', 'conditions general', 'family gross annual', 'reservations definition', 'different locations clubbed checking', '18', 'dear client'],
    text="""Dear Client,To be eligible for the EWS certificate, you will have to satisfy all the following conditions: 1) You should be a ‘general’ candidate (not covered under reservation for SC, ST, or OBC). 2) Your family’s gross annual income should be below Rs. 8 lakhs. This includes income from all sources such as agriculture, salary, business, etc., for the financial year before you apply for the exam., 3) Your family should not own agricultural land of size 5 acres or more. 4) Your family should not own a residential flat of an area of 1000 square feet or more. 5) Your family should not own a residential plot (in notified municipalities) of an area of 100 square yards or more. 6) Your family should not own a residential plot (other than in notified municipalities) of an area of 200 square yards or more. The land or property owned by the family in different locations should be clubbed while checking the eligibility conditions for EWS category reservations. The definition of Family in EWS reservation means the person who seeks the benefit of reservation, his/her parents and siblings below the age of 18 years, as also his/her spouse and children below the age of 18 years. So, given the conditions of eligibility and the definition of family, when your family owns a residential flat of more than an area of 1000 sq. feet, even if it is in the name of your deceased father, you may not be considered for the issue of EWS certificate."""
)

print("\n" + "=" * 40)
print("🚀 Running Research-Inspired Classifier...")
print("-" * 40)

# --- Process Answer 1 ---
label1, scores1 = classifier.predict(sample1)
print(f"Answer 1 Prediction: {label1.upper()}")
print(f"Scores: {scores1}")

Loading classifier...
Classifier loaded successfully.

🚀 Running Research-Inspired Classifier...
----------------------------------------
Answer 1 Prediction: ACTIONABLE
Scores: {'actionable_score': 33, 'informative_score': 0}


In [None]:
import spacy
from dataclasses import dataclass
from typing import List, Dict, Tuple
import subprocess
import re # We'll use regex to find list items

# =============================================================================
# DATA STRUCTURE
# =============================================================================
@dataclass
class ExtractedFeatures:
    """
    Pre-extracted features for a single answer, as handed to the classifier.

    Attributes:
        entities: Entity strings coming from the LLM extraction step.
        key_phrases: Key phrases coming from KeyBERT.
        text: The answer text exactly as originally written.
    """
    entities: List[str]
    key_phrases: List[str]
    text: str

# =============================================================================
# IMPROVISED LINGUISTIC HEURISTIC V3 (With Weights & Balancing)
# =============================================================================

class ImprovisedClassifier:
    """
    Weighted variant of the linguistic heuristic.

    Each cue contributes its weight from ``self.weights`` to either the
    actionable or the informative score. Explanatory phrases, proper-noun
    entities and enumerated list markers feed the informative side to balance
    the many actionable cues. Ties are labelled "informative".
    """

    # An enumerated-list marker: a single digit or letter that is not the tail
    # of a longer word or abbreviation, followed by ")" or by a "." that ends
    # its token. Matches "1)", "(a)", "2." but not "e.g.", "U.S." or "3.5",
    # all of which the earlier pattern r'\b[0-9a-z][.)]' counted as list items.
    _LIST_ITEM_RE = re.compile(r'(?<![\w.])[0-9a-z](?:\)|\.(?!\S))')

    def __init__(self):
        """Load ``en_core_web_sm`` (downloading it if missing) and define markers and weights."""
        import sys  # local import: this cell's import block does not provide it

        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy reports a missing model package as OSError. Catching only
            # that keeps unrelated failures (and Ctrl-C) from being swallowed.
            print("Downloading spaCy model 'en_core_web_sm'...")
            # Use the running interpreter so the model is installed into the
            # environment this code executes in, not whatever "python" is on PATH.
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # If the download did not succeed this raises OSError to the caller.
            self.nlp = spacy.load('en_core_web_sm')

        # --- Grammatical markers for actionable text ---
        self.sequential_markers = {'first', 'second', 'then', 'next', 'finally', 'subsequently'}
        self.second_person_pronouns = {'you', 'your', 'yours'}

        # --- Markers for informative text, to balance the score ---
        self.explanatory_markers = {
            'because', 'since', 'due to', 'under', 'pursuant',
            'whereas', 'states that', 'means', 'definition of', 'refers to',
            'in the case of', 'as per'
        }

        # --- Per-rule weights ---
        self.weights = {
            "verb_phrase": 1.0,         # key phrase whose root is a verb
            "propn_entity": 1.5,        # entity containing a proper noun (informative)

            # --- Text weights (actionable) ---
            "modal_verb": 1.0,          # "should", "must", ...
            "second_person": 0.5,       # "you" is common, so it is a weak signal
            "imperative_cmd": 3.0,      # a direct command is the strongest action signal
            "action_infinitive": 1.5,   # "to file"
            "sequential_marker": 1.5,   # "first", "then", ...

            # --- Balancing weights (informative) ---
            "explanatory_marker": 1.5,  # "because", "means", ...
            "list_marker": 2.5          # "1)", "(a)", ... added once per marker found
        }

    @staticmethod
    def _markers_present(markers, text_lower: str) -> int:
        """Count how many of ``markers`` occur in ``text_lower`` as whole words or phrases."""
        # Word boundaries stop "understand" counting as "under", "sincerely"
        # as "since", or "context" as "next".
        return sum(
            1 for marker in markers
            if re.search(r'\b' + re.escape(marker) + r'\b', text_lower)
        )

    def predict(self, extracted: "ExtractedFeatures") -> Tuple[str, Dict[str, float]]:
        """
        Label one answer using the weighted, balanced scoring rules.

        Args:
            extracted: Object exposing ``key_phrases``, ``entities`` and ``text``.

        Returns:
            ``(label, scores)`` where ``label`` is "actionable" or "informative"
            and ``scores`` holds "actionable_score" and "informative_score",
            each rounded to two decimals. The label is decided before rounding.
        """
        weights = self.weights
        actionable = 0.0
        informative = 0.0

        # --- 1. Key phrases: verb-rooted phrases are action cues ---
        for phrase in extracted.key_phrases:
            sents = list(self.nlp(phrase).sents)
            if sents and sents[0].root.pos_ == 'VERB':
                actionable += weights['verb_phrase']

        # --- 2. Entities: any proper-noun token makes the entity informative ---
        for entity in extracted.entities:
            if any(token.pos_ == 'PROPN' for token in self.nlp(entity)):
                informative += weights['propn_entity']

        # --- 3. Full text ---
        doc = self.nlp(extracted.text)
        text_lower = extracted.text.lower()

        # A) Enumerated list items, weighted per item found
        informative += len(self._LIST_ITEM_RE.findall(text_lower)) * weights['list_marker']

        # B) Explanatory markers, once per distinct marker present
        informative += self._markers_present(self.explanatory_markers, text_lower) * weights['explanatory_marker']

        # C) Token-level actionable cues
        for token in doc:
            if token.tag_ == 'MD':
                actionable += weights['modal_verb']
            if token.lower_ in self.second_person_pronouns:
                actionable += weights['second_person']
            if token.pos_ == 'VERB' and token.dep_ == 'ROOT' and token.tag_ == 'VB':
                # A base-form root verb without a subject reads as a command.
                if not any(child.dep_ in ('nsubj', 'nsubjpass') for child in token.children):
                    actionable += weights['imperative_cmd']
            if token.lower_ == 'to' and token.dep_ == 'aux' and token.head.pos_ == 'VERB':
                actionable += weights['action_infinitive']

        # D) Sequential markers, once per distinct marker present
        actionable += self._markers_present(self.sequential_markers, text_lower) * weights['sequential_marker']

        # --- 4. Decision: ties go to "informative" ---
        label = "actionable" if actionable > informative else "informative"

        # Round scores for clean output
        scores = {
            "actionable_score": round(actionable, 2),
            "informative_score": round(informative, 2)
        }
        return label, scores

# =============================================================================
# EXAMPLE USAGE (Running the problematic text)
# =============================================================================

# 1. Instantiate the classifier (loads the spaCy model, so this can take a moment)
print("Loading classifier...")
classifier = ImprovisedClassifier()
print("Classifier loaded successfully.")

# 2. Your problematic EWS sample
# The typographic quotes/apostrophe in `text` and the emoji in the banners below
# had been stored as mis-decoded UTF-8 (mojibake); they are restored here.
sample1 = ExtractedFeatures(
    entities=['ews category reservations', 'residential plot', 'residential flat', 'ews reservation', 'family definition', 'agricultural land', 'gross annual income', 'ews certificate', 'general candidate'],
    key_phrases=['eligible ews certificate satisfy', 'seeks benefit reservation parents', 'rs lakhs includes', 'land property owned', 'conditions general', 'family gross annual', 'reservations definition', 'different locations clubbed checking', '18', 'dear client'],
    text="""Dear Client,To be eligible for the EWS certificate, you will have to satisfy all the following conditions: 1) You should be a ‘general’ candidate (not covered under reservation for SC, ST, or OBC). 2) Your family’s gross annual income should be below Rs. 8 lakhs. This includes income from all sources such as agriculture, salary, business, etc., for the financial year before you apply for the exam., 3) Your family should not own agricultural land of size 5 acres or more. 4) Your family should not own a residential flat of an area of 1000 square feet or more. 5) Your family should not own a residential plot (in notified municipalities) of an area of 100 square yards or more. 6) Your family should not own a residential plot (other than in notified municipalities) of an area of 200 square yards or more. The land or property owned by the family in different locations should be clubbed while checking the eligibility conditions for EWS category reservations. The definition of Family in EWS reservation means the person who seeks the benefit of reservation, his/her parents and siblings below the age of 18 years, as also his/her spouse and children below the age of 18 years. So, given the conditions of eligibility and the definition of family, when your family owns a residential flat of more than an area of 1000 sq. feet, even if it is in the name of your deceased father, you may not be considered for the issue of EWS certificate."""
)

print("\n" + "=" * 40)
print("🚀 Running Improvised Classifier V3...")
print("-" * 40)

# --- Process Answer 1 ---
label1, scores1 = classifier.predict(sample1)
print(f"Prediction: {label1.upper()}")
print(f"Scores: {scores1}")

print("\n" + "=" * 40)
print("🚀 Running on a clearly ACTIONABLE text...")
print("-" * 40)

# 3. A clearly actionable sample
sample2 = ExtractedFeatures(
    entities=['Advocate', 'Labour Commissioner', 'Civil Court'],
    key_phrases=['file a civil suit', 'raise an industrial dispute', 'consult an Advocate'],
    text="""...In that scenario, the terminated employee serving a legal notice to the employer can raise an industrial dispute before the concerned Labour Commissioner. Otherwise, he has to file a civil suit before the Civil Court. If required, consult an Advocate experienced in service matters."""
)

# --- Process Answer 2 ---
label2, scores2 = classifier.predict(sample2)
print(f"Prediction: {label2.upper()}")
print(f"Scores: {scores2}")

Loading classifier...
Classifier loaded successfully.

🚀 Running Improvised Classifier V3...
----------------------------------------
Prediction: ACTIONABLE
Scores: {'actionable_score': 20.0, 'informative_score': 19.5}

🚀 Running on a clearly ACTIONABLE text...
----------------------------------------
Prediction: ACTIONABLE
Scores: {'actionable_score': 8.5, 'informative_score': 3.0}


In [None]:
#Cl Robust
import spacy
from dataclasses import dataclass
from typing import List, Dict, Tuple
import subprocess
import re

# =============================================================================
# DATA STRUCTURE
# =============================================================================
@dataclass
class ExtractedFeatures:
    """
    Inputs for one classification call, produced earlier in the pipeline.

    Attributes:
        entities: Entity strings from the LLM extraction step.
        key_phrases: Key phrases from KeyBERT.
        text: Original answer text.
    """
    entities: List[str]
    key_phrases: List[str]
    text: str

# =============================================================================
# IMPROVED LINGUISTIC HEURISTIC CLASSIFIER
# =============================================================================

class ImprovedResearchInspiredClassifier:
    """
    Classify an answer as "actionable" or "informative" from density scores.

    The informative score combines the density of legal references, proper
    nouns and numbers with the share of declarative sentences and the number
    of entities naming a legal term. The actionable score combines the density
    of action cues with the share of imperative sentences. "actionable" wins
    only when its score exceeds the informative score by the threshold used in
    ``predict``; otherwise the answer is labelled "informative".
    """

    # Citation regexes. _count_legal_citations applies them with re.IGNORECASE,
    # so the parts where capitalisation IS the signal are wrapped in (?-i:...);
    # without that, "[A-Z]{2,}\s+Act" also matched plain verbs ("must act").
    _CITATION_PATTERNS = (
        r'\b(section|sec\.?|s\.?)\s*\d+',      # Section 123
        r'\b(article|art\.?)\s*\d+',           # Article 45
        r'\b\d{4}\s*act\b',                    # 2020 Act
        r'(?-i:\b[A-Z][A-Za-z]+\s+Act\b)',     # Income Tax Act, IT Act
        r'\bvs?\.?\s+(?-i:[A-Z])',             # Case citations (A vs B)
    )

    def __init__(self):
        """Load ``en_core_web_sm`` (downloading it if missing) and define the marker sets."""
        import sys  # local import: this cell's import block does not provide it

        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy reports a missing model package as OSError. Catching only
            # that keeps unrelated failures (and Ctrl-C) from being swallowed.
            print("Downloading spaCy model 'en_core_web_sm'...")
            # Use the running interpreter so the model is installed into the
            # environment this code executes in, not whatever "python" is on PATH.
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            # If the download did not succeed this raises OSError to the caller.
            self.nlp = spacy.load('en_core_web_sm')

        # Actionable markers
        self.sequential_markers = {'first', 'second', 'then', 'next', 'finally', 'subsequently', 'lastly'}
        self.second_person_pronouns = {'you', 'your', 'yours', 'yourself'}
        self.action_verbs = {'apply', 'file', 'submit', 'contact', 'visit', 'obtain', 'provide',
                            'ensure', 'check', 'verify', 'prepare', 'complete', 'sign'}

        # Informative markers
        self.legal_terms = {'act', 'section', 'clause', 'article', 'amendment', 'statute',
                           'regulation', 'ordinance', 'provision', 'code', 'law', 'rule',
                           'subsection', 'paragraph', 'schedule', 'chapter'}
        self.citation_patterns = list(self._CITATION_PATTERNS)

    @staticmethod
    def _contains_word(text: str, word: str) -> bool:
        """True when ``word`` occurs in ``text`` delimited by word boundaries."""
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    @staticmethod
    def _is_subjectless_command(token) -> bool:
        """True for a base-form (VB) verb with no subject child, i.e. a likely imperative."""
        if token.pos_ != 'VERB' or token.tag_ != 'VB':
            return False
        return not any(child.dep_ in ('nsubj', 'nsubjpass') for child in token.children)

    def _count_legal_citations(self, text: str) -> int:
        """
        Count legal references in ``text``.

        The result is the number of whole-word occurrences of each entry in
        ``self.legal_terms`` (case-insensitive) plus the number of matches of
        each regex in ``self.citation_patterns``. A phrase such as
        "Section 12" therefore counts twice: once as a term, once as a citation.
        """
        text_lower = text.lower()
        term_hits = sum(
            len(re.findall(r'\b' + re.escape(term) + r'\b', text_lower))
            for term in self.legal_terms
        )
        pattern_hits = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self.citation_patterns
        )
        return term_hits + pattern_hits

    def _analyze_sentence_types(self, doc) -> Dict[str, int]:
        """
        Count imperative, declarative and interrogative sentences in ``doc``.

        Every sentence lands in exactly one bucket: it is interrogative when
        it ends with "?", imperative when its root is a base-form verb without
        a subject, and declarative otherwise. (Previously a sentence whose root
        was a base-form verb WITH a subject, e.g. "You may issue a notice", was
        not counted at all, which could leave every ratio at zero.)
        """
        sentence_types = {
            'imperative': 0,      # Commands
            'declarative': 0,     # Statements
            'interrogative': 0    # Questions
        }

        for sent in doc.sents:
            if sent.text.strip().endswith('?'):
                sentence_types['interrogative'] += 1
            elif self._is_subjectless_command(sent.root):
                sentence_types['imperative'] += 1
            else:
                sentence_types['declarative'] += 1

        return sentence_types

    def _calculate_informative_density(self, doc, text: str) -> float:
        """
        Return weighted informative cues per 100 words.

        Cues are legal references (weight 3), proper-noun tokens other than
        salutations (weight 2) and number-like tokens (weight 1). Words are the
        tokens of ``doc`` that are neither punctuation nor whitespace; the
        result is 0.0 when there are none.
        """
        word_count = len([token for token in doc if not token.is_punct and not token.is_space])
        if word_count == 0:
            return 0.0

        # Legal terms and citation patterns
        legal_count = self._count_legal_citations(text)

        # Proper nouns, skipping salutations that get tagged PROPN
        propn_count = sum(1 for token in doc if token.pos_ == 'PROPN' and
                         token.lower_ not in {'client', 'sir', 'madam'})

        # Numbers (potentially section numbers, amounts, dates)
        num_count = sum(1 for token in doc if token.pos_ == 'NUM' or
                       token.like_num or re.match(r'\d+', token.text))

        # Density per 100 words
        return ((legal_count * 3) + (propn_count * 2) + num_count) / word_count * 100

    def _calculate_actionable_density(self, doc, text: str) -> float:
        """
        Return weighted actionable cues per 100 words.

        Cues are modal verbs (weight 2), second-person pronouns (weight 1),
        tokens whose lemma is in ``self.action_verbs`` (weight 2), subjectless
        base-form ROOT verbs (weight 2) and each distinct sequential marker
        present in ``text`` (weight 1). The result is 0.0 for a doc without words.
        """
        word_count = len([token for token in doc if not token.is_punct and not token.is_space])
        if word_count == 0:
            return 0.0

        action_count = 0
        for token in doc:
            # Modal verbs (should, must, can), weighted more than pronouns
            if token.tag_ == 'MD':
                action_count += 2

            # Second person pronouns
            if token.lower_ in self.second_person_pronouns:
                action_count += 1

            # Action verbs from our list
            if token.lemma_ in self.action_verbs:
                action_count += 2

            # Imperatives
            if token.dep_ == 'ROOT' and self._is_subjectless_command(token):
                action_count += 2

        # Sequential markers, matched as whole words so that "context" does
        # not count as "next" nor "strengthen" as "then".
        text_lower = text.lower()
        action_count += sum(
            1 for marker in self.sequential_markers
            if self._contains_word(text_lower, marker)
        )

        # Density per 100 words
        return action_count / word_count * 100

    def predict(self, extracted: "ExtractedFeatures") -> Tuple[str, Dict[str, float]]:
        """
        Label one answer and return the score breakdown.

        Args:
            extracted: Object exposing ``entities`` and ``text``.

        Returns:
            ``(label, scores)``. ``label`` is "actionable" or "informative".
            ``scores`` contains the two unrounded final scores plus the rounded
            densities, sentence ratios, citation count and entity score that
            went into them.
        """
        doc = self.nlp(extracted.text)

        # Calculate densities
        informative_density = self._calculate_informative_density(doc, extracted.text)
        actionable_density = self._calculate_actionable_density(doc, extracted.text)

        # Sentence-type shares; both stay 0 when the doc has no sentences
        sentence_types = self._analyze_sentence_types(doc)
        total_sentences = sum(sentence_types.values())
        imperative_ratio = sentence_types['imperative'] / total_sentences if total_sentences > 0 else 0
        declarative_ratio = sentence_types['declarative'] / total_sentences if total_sentences > 0 else 0

        # Entities naming a legal term. Whole-word match: a substring test
        # counted "contract" (act) or "lawyer" (law) as legal entities.
        entity_legal_score = 0
        for entity in extracted.entities:
            entity_lower = entity.lower()
            if any(self._contains_word(entity_lower, term) for term in self.legal_terms):
                entity_legal_score += 1

        # Calculate final scores with weighted factors
        scores = {
            "informative_score": (
                informative_density * 2.0 +           # Legal citations are strong signal
                declarative_ratio * 20 +              # Declarative sentences suggest informative
                entity_legal_score * 5                # Legal entities
            ),
            "actionable_score": (
                actionable_density * 1.5 +            # Action markers
                imperative_ratio * 30                 # Imperative sentences are strong signal
            ),
            # Additional breakdown for debugging
            "informative_density": round(informative_density, 2),
            "actionable_density": round(actionable_density, 2),
            "imperative_ratio": round(imperative_ratio, 2),
            "declarative_ratio": round(declarative_ratio, 2),
            "legal_citations": self._count_legal_citations(extracted.text),
            "entity_legal_score": entity_legal_score
        }

        # If scores are close, prefer informative (legal content is default):
        # actionable must be 20% higher to win.
        threshold = 1.2
        if scores["actionable_score"] > scores["informative_score"] * threshold:
            label = "actionable"
        else:
            label = "informative"

        return label, scores


if __name__ == "__main__":
    # Demo: classify one advice-style answer and one statute-explanation answer
    # and print the full score breakdown for each.

    # 1. Instantiate the classifier (loads the spaCy model)
    print("Loading improved classifier...")
    classifier = ImprovedResearchInspiredClassifier()
    print("Classifier loaded successfully.\n")

    # 2. Example 1: advice on issuing a legal notice and filing a recovery suit
    sample1 = ExtractedFeatures(
        entities=['abnormal loss', 'recovery of amount', 'claim amount', 'legal notice', 'file a suit'],
        key_phrases=['left job intimation caused', 'sir issue legal', 'abnormal loss company', 'claim', 'recovery', 'reasonable file', 'left', 'dear sir', 'intimation', 'said'],
        text="""Dear Sir,You may it issue a legal notice to both of them informing them that they have left the job without intimation and caused abnormal loss to the company and you may claim any amount which is reasonable and thereafter file a suit for recovery of said amount."""
    )

    # Example 2: More clearly informative (with legal citations)
    sample2 = ExtractedFeatures(
        entities=['Section 498A', 'Indian Penal Code', 'Supreme Court', 'dowry harassment'],
        key_phrases=['section 498a ipc', 'supreme court ruling', 'cognizable offence',
                    'non-bailable', 'matrimonial cruelty'],
        text="""Section 498A of the Indian Penal Code deals with cruelty by husband or relatives of husband. According to this provision, whoever, being the husband or the relative of the husband of a woman, subjects such woman to cruelty shall be punished with imprisonment which may extend to three years and shall also be liable to fine. The Supreme Court in Rajesh vs State of Haryana (2013) has clarified the scope of this section. It is a cognizable and non-bailable offence. The term 'cruelty' has been defined under the Explanation to include harassment for dowry demands."""
    )

    print("=" * 60)
    print("🚀 Testing Improved Classifier")
    print("=" * 60)

    # --- Process Answer 1 (Should be more actionable) ---
    # The heading used to read "EWS Certificate Eligibility", a leftover from an
    # earlier sample; sample1 here is the legal-notice / recovery-suit answer.
    print("\n📄 ANSWER 1 (Legal Notice and Recovery Suit):")
    print("-" * 60)
    label1, scores1 = classifier.predict(sample1)
    print(f"Prediction: {label1.upper()}")
    print("\nDetailed Scores:")
    for key, value in scores1.items():
        print(f"  {key}: {value}")

    # --- Process Answer 2 (Should be informative) ---
    print("\n\n📄 ANSWER 2 (Section 498A Explanation):")
    print("-" * 60)
    label2, scores2 = classifier.predict(sample2)
    print(f"Prediction: {label2.upper()}")
    print("\nDetailed Scores:")
    for key, value in scores2.items():
        print(f"  {key}: {value}")

    print("\n" + "=" * 60)

Loading improved classifier...
Classifier loaded successfully.

🚀 Testing Improved Classifier

📄 ANSWER 1 (EWS Certificate Eligibility):
------------------------------------------------------------
Prediction: ACTIONABLE

Detailed Scores:
  informative_score: 8.16326530612245
  actionable_score: 24.48979591836735
  informative_density: 4.08
  actionable_density: 16.33
  imperative_ratio: 0
  declarative_ratio: 0
  legal_citations: 0
  entity_legal_score: 0


📄 ANSWER 2 (Section 498A Explanation):
------------------------------------------------------------
Prediction: INFORMATIVE

Detailed Scores:
  informative_score: 119.58333333333334
  actionable_score: 9.375
  informative_density: 44.79
  actionable_density: 6.25
  imperative_ratio: 0.0
  declarative_ratio: 1.0
  legal_citations: 6
  entity_legal_score: 2



In [None]:
#Cl Robust heuristic- THIS IS THE LATEST. USE THIS.
import spacy
from dataclasses import dataclass
from typing import List, Dict, Tuple
import subprocess
import re

@dataclass
class ExtractedFeatures:
    """
    Features for one answer that the classifier consumes.

    Attributes:
        entities: Entity strings extracted from the answer.
        key_phrases: Key phrases extracted from the answer.
        text: The answer text itself.
    """
    entities: List[str]
    key_phrases: List[str]
    text: str


class ImprovedResearchInspiredClassifier:
    """Rule-based classifier that labels an answer "actionable" or "informative".

    The answer text is parsed with spaCy's ``en_core_web_sm`` pipeline and
    scored on two axes:

    * informative: legal terms/citations, proper nouns and numbers per 100
      words, the share of declarative sentences, and the number of extracted
      entities that mention a legal term;
    * actionable: modals, second-person pronouns, action verbs, imperatives
      and sequential markers per 100 words, plus the share of imperative
      sentences.
    """

    def __init__(self):
        # Local import: only needed to locate the running interpreter below.
        import sys

        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy raises OSError when the model package is not installed.
            # Install it with the interpreter that is running this code, not
            # whichever "python" happens to be first on PATH.
            print("Downloading spaCy model 'en_core_web_sm'...")
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            self.nlp = spacy.load('en_core_web_sm')

        # Actionable markers
        self.sequential_markers = {'first', 'second', 'then', 'next', 'finally', 'subsequently', 'lastly'}
        self.second_person_pronouns = {'you', 'your', 'yours', 'yourself'}
        self.action_verbs = {'apply', 'file', 'submit', 'contact', 'visit', 'obtain', 'provide',
                             'ensure', 'check', 'verify', 'prepare', 'complete', 'sign'}

        # Informative markers
        self.legal_terms = {'act', 'section', 'clause', 'article', 'amendment', 'statute',
                            'regulation', 'ordinance', 'provision', 'code', 'law', 'rule',
                            'subsection', 'paragraph', 'schedule', 'chapter'}
        # NOTE: these are applied with re.IGNORECASE, so the [A-Z] classes
        # below also match lower-case letters.
        self.citation_patterns = [
            r'\b(section|sec\.?|s\.?)\s*\d+',  # e.g. Section 123
            r'\b(article|art\.?)\s*\d+',        # e.g. Article 45
            r'\b\d{4}\s*act\b',                  # e.g. 2020 Act
            r'\b[A-Z]{2,}\s+Act\b',             # a word of 2+ letters followed by "Act"
            r'\bvs?\.?\s+[A-Z]',                # case citations (A vs B)
        ]

    @staticmethod
    def _matches_whole_word(word: str, text: str) -> bool:
        """Return True when ``word`` occurs in ``text`` delimited by word boundaries."""
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    @staticmethod
    def _is_imperative_root(token) -> bool:
        """Return True for a base-form verb (tag ``VB``) that has no subject child."""
        if token.pos_ != 'VERB' or token.tag_ != 'VB':
            return False
        return not any(child.dep_ in ('nsubj', 'nsubjpass') for child in token.children)

    def _contains_legal_term(self, text: str) -> bool:
        """Return True when ``text`` contains any legal term as a whole word.

        Whole-word matching mirrors ``_count_legal_citations`` and avoids
        false hits such as "act" inside "contract".
        """
        lowered = text.lower()
        return any(self._matches_whole_word(term, lowered) for term in self.legal_terms)

    def _count_legal_citations(self, text: str) -> int:
        """Count legal-term occurrences plus citation-pattern matches in ``text``.

        A phrase such as "Section 5" contributes twice: once for the term
        "section" and once for the citation pattern.
        """
        lowered = text.lower()
        term_hits = sum(
            len(re.findall(r'\b' + re.escape(term) + r'\b', lowered))
            for term in self.legal_terms
        )
        pattern_hits = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self.citation_patterns
        )
        return term_hits + pattern_hits

    def _analyze_sentence_types(self, doc) -> Dict[str, int]:
        """Tally the sentences of ``doc`` by type.

        Returns a dict with the keys ``imperative``, ``declarative``,
        ``interrogative`` and ``other``. ``other`` holds sentences whose root
        is a base-form verb with an explicit subject (e.g. "You may file a
        suit"). They used to be dropped from the tally altogether, which
        shrank the denominator of the sentence ratios.
        """
        sentence_types = {
            'imperative': 0,
            'declarative': 0,
            'interrogative': 0,
            'other': 0,
        }

        for sent in doc.sents:
            root = sent.root
            if sent.text.strip().endswith('?'):
                sentence_types['interrogative'] += 1
            elif self._is_imperative_root(root):
                sentence_types['imperative'] += 1
            elif root.pos_ == 'VERB' and root.tag_ == 'VB':
                # Base-form verb root with a subject: neither a command nor
                # counted as declarative by this heuristic.
                sentence_types['other'] += 1
            else:
                sentence_types['declarative'] += 1

        return sentence_types

    def _calculate_informative_density(self, doc, text: str) -> float:
        """Return the weighted count of informative cues per 100 words.

        Legal citations weigh 3, proper nouns 2 and number-like tokens 1.
        Returns 0.0 when ``doc`` has no word tokens.
        """
        word_count = sum(1 for token in doc if not token.is_punct and not token.is_space)
        if word_count == 0:
            return 0.0

        legal_count = self._count_legal_citations(text)

        # Proper nouns, ignoring salutations that the tagger may mark as PROPN.
        salutations = {'client', 'sir', 'madam'}
        propn_count = sum(
            1 for token in doc
            if token.pos_ == 'PROPN' and token.lower_ not in salutations
        )

        # Numbers (potentially section numbers, amounts, dates).
        num_count = sum(
            1 for token in doc
            if token.pos_ == 'NUM' or token.like_num or re.match(r'\d+', token.text)
        )

        return ((legal_count * 3) + (propn_count * 2) + num_count) / word_count * 100

    def _calculate_actionable_density(self, doc, text: str) -> float:
        """Return the weighted count of actionable cues per 100 words.

        Modals, action verbs and imperative roots weigh 2; second-person
        pronouns weigh 1; each sequential marker present in ``text`` as a
        whole word adds 1 (once per marker, not per occurrence). Returns 0.0
        when ``doc`` has no word tokens.
        """
        word_count = sum(1 for token in doc if not token.is_punct and not token.is_space)
        if word_count == 0:
            return 0.0

        action_count = 0
        for token in doc:
            if token.tag_ == 'MD':  # modal verbs (should, must, can)
                action_count += 2
            if token.lower_ in self.second_person_pronouns:
                action_count += 1
            if token.lemma_.lower() in self.action_verbs:
                action_count += 2
            if token.dep_ == 'ROOT' and self._is_imperative_root(token):
                action_count += 2

        # Whole-word match so that "then" does not fire on "strengthen".
        text_lower = text.lower()
        action_count += sum(
            1 for marker in self.sequential_markers
            if self._matches_whole_word(marker, text_lower)
        )

        return action_count / word_count * 100

    def predict(self, extracted: "ExtractedFeatures") -> Tuple[str, Dict[str, float]]:
        """Classify one answer as "actionable" or "informative".

        Args:
            extracted: object exposing ``text`` (the answer) and ``entities``
                (list of entity strings).

        Returns:
            ``(label, scores)``. ``scores`` holds the two final scores plus
            the intermediate densities, ratios and counts behind them.
        """
        doc = self.nlp(extracted.text)

        informative_density = self._calculate_informative_density(doc, extracted.text)
        actionable_density = self._calculate_actionable_density(doc, extracted.text)

        sentence_types = self._analyze_sentence_types(doc)
        total_sentences = sum(sentence_types.values())
        if total_sentences > 0:
            imperative_ratio = sentence_types['imperative'] / total_sentences
            declarative_ratio = sentence_types['declarative'] / total_sentences
        else:
            imperative_ratio = 0.0
            declarative_ratio = 0.0

        # Number of extracted entities that mention a legal term.
        entity_legal_score = sum(
            1 for entity in extracted.entities if self._contains_legal_term(entity)
        )

        scores = {
            "informative_score": (
                informative_density * 2.0 +
                declarative_ratio * 20 +
                entity_legal_score * 5
            ),
            "actionable_score": (
                actionable_density * 1.5 +
                imperative_ratio * 30
            ),
            "informative_density": round(informative_density, 2),
            "actionable_density": round(actionable_density, 2),
            "imperative_ratio": round(imperative_ratio, 2),
            "declarative_ratio": round(declarative_ratio, 2),
            "legal_citations": self._count_legal_citations(extracted.text),
            "entity_legal_score": entity_legal_score,
        }

        threshold = 1.2  # Actionable must be 20% higher to win
        if scores["actionable_score"] > scores["informative_score"] * threshold:
            label = "actionable"
        else:
            label = "informative"

        return label, scores


if __name__ == "__main__":
    # Demo: classify two hand-picked answers and print the label together
    # with every intermediate score returned by the classifier.
    print("Loading improved classifier...")
    classifier = ImprovedResearchInspiredClassifier()
    print("Classifier loaded successfully.\n")

    # Sample 1: eligibility conditions for an EWS certificate.
    sample1 = ExtractedFeatures(
        entities=[
            'ews category reservations',
            'residential plot',
            'residential flat',
            'ews reservation',
            'family definition',
            'agricultural land',
            'gross annual income',
            'ews certificate',
            'general candidate',
        ],
        key_phrases=[
            'eligible ews certificate satisfy',
            'seeks benefit reservation parents',
            'rs lakhs includes',
            'land property owned',
            'conditions general',
            'family gross annual',
            'reservations definition',
            'different locations clubbed checking',
            '18',
            'dear client',
        ],
        text="""Dear Client,To be eligible for the EWS certificate, you will have to satisfy all the following conditions: 1) You should be a 'general' candidate (not covered under reservation for SC, ST, or OBC). 2) Your family's gross annual income should be below Rs. 8 lakhs. This includes income from all sources such as agriculture, salary, business, etc., for the financial year before you apply for the exam., 3) Your family should not own agricultural land of size 5 acres or more. 4) Your family should not own a residential flat of an area of 1000 square feet or more. 5) Your family should not own a residential plot (in notified municipalities) of an area of 100 square yards or more. 6) Your family should not own a residential plot (other than in notified municipalities) of an area of 200 square yards or more. The land or property owned by the family in different locations should be clubbed while checking the eligibility conditions for EWS category reservations. The definition of Family in EWS reservation means the person who seeks the benefit of reservation, his/her parents and siblings below the age of 18 years, as also his/her spouse and children below the age of 18 years. So, given the conditions of eligibility and the definition of family, when your family owns a residential flat of more than an area of 1000 sq. feet, even if it is in the name of your deceased father, you may not be considered for the issue of EWS certificate.""",
    )

    # Sample 2: advice to issue a legal notice and file a recovery suit.
    sample2 = ExtractedFeatures(
        entities=[
            'abnormal loss',
            'recovery of amount',
            'claim amount',
            'legal notice',
            'file a suit',
        ],
        key_phrases=[
            'left job intimation caused',
            'sir issue legal',
            'abnormal loss company',
            'claim',
            'recovery',
            'reasonable file',
            'left',
            'dear sir',
            'intimation',
            'said',
        ],
        text="""Dear Sir,You may it issue a legal notice to both of them informing them that they have left the job without intimation and caused abnormal loss to the company and you may claim any amount which is reasonable and thereafter file a suit for recovery of said amount.""",
    )

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule)
    print(" Testing Improved Classifier")
    print(heavy_rule)

    # --- Answer 1 ---
    print("\n ANSWER 1:")
    print(light_rule)
    label1, scores1 = classifier.predict(sample1)
    print(f"Prediction: {label1.upper()}")
    print("\nDetailed Scores:")
    for score_name, score_value in scores1.items():
        print(f"  {score_name}: {score_value}")

    # --- Answer 2 ---
    print("\n\n ANSWER 2:")
    print(light_rule)
    label2, scores2 = classifier.predict(sample2)
    print(f"Prediction: {label2.upper()}")
    print("\nDetailed Scores:")
    for score_name, score_value in scores2.items():
        print(f"  {score_name}: {score_value}")

    print("\n" + heavy_rule)

Loading improved classifier...
Classifier loaded successfully.

 Testing Improved Classifier

 ANSWER 1:
------------------------------------------------------------
Prediction: INFORMATIVE

Detailed Scores:
  informative_score: 48.01556420233463
  actionable_score: 19.260700389105057
  informative_density: 14.01
  actionable_density: 12.84
  imperative_ratio: 0.0
  declarative_ratio: 1.0
  legal_citations: 0
  entity_legal_score: 0


 ANSWER 2:
------------------------------------------------------------
Prediction: ACTIONABLE

Detailed Scores:
  informative_score: 8.16326530612245
  actionable_score: 24.48979591836735
  informative_density: 4.08
  actionable_density: 16.33
  imperative_ratio: 0
  declarative_ratio: 0
  legal_citations: 0
  entity_legal_score: 0



In [None]:
# Run the classifier on the entire dataset and standardize the scores to a 0-100 scale.
import pandas as pd
import spacy
import re
import ast
import subprocess
from dataclasses import dataclass
from typing import List, Dict, Any
from google.colab import drive
import os

# --- PART 1: SETUP ---
# Best-effort mount of Google Drive; a failure is reported but does not stop
# the cell. Catching Exception (not a bare except) lets KeyboardInterrupt and
# SystemExit propagate.
try:
    drive.mount('/content/drive')
except Exception as exc:
    print("Drive already mounted or running locally.")
    print(f"  (drive.mount reported: {exc})")

# --- PART 2: SCORING LOGIC (PRESERVED & ADAPTED) ---
class RobustClassifier:
    """Scores an answer on informative and actionable axes, each mapped to 0-100.

    ``predict`` returns a compact ``"Inf:<n> | Act:<n>"`` string rather than a
    label, so the result can be written straight into a spreadsheet cell.
    """

    def __init__(self):
        # Local import: only needed to locate the running interpreter below.
        import sys

        # Load spaCy for sentence structure analysis (imperative vs declarative)
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            # spaCy raises OSError when the model package is not installed.
            # Install it with the interpreter that is running this code.
            print("Downloading spaCy model...")
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            self.nlp = spacy.load('en_core_web_sm')

        # Actionable markers (the earlier script's sets, plus 'consult' and 'appeal')
        self.sequential_markers = {'first', 'second', 'then', 'next', 'finally', 'subsequently', 'lastly'}
        self.second_person_pronouns = {'you', 'your', 'yours', 'yourself'}
        self.action_verbs = {'apply', 'file', 'submit', 'contact', 'visit', 'obtain', 'provide',
                             'ensure', 'check', 'verify', 'prepare', 'complete', 'sign', 'consult', 'appeal'}

        # Informative markers (the earlier script's set, plus 'court' and 'tribunal')
        self.legal_terms = {'act', 'section', 'clause', 'article', 'amendment', 'statute',
                            'regulation', 'ordinance', 'provision', 'code', 'law', 'rule',
                            'subsection', 'paragraph', 'schedule', 'chapter', 'court', 'tribunal'}

        # Applied with re.IGNORECASE in predict(), so [A-Z] also matches lower case.
        self.citation_patterns = [
            r'\b(section|sec\.?|s\.?)\s*\d+', r'\b(article|art\.?)\s*\d+',
            r'\b\d{4}\s*act\b', r'\b[A-Z]{2,}\s+Act\b', r'\bvs?\.?\s+[A-Z]',
        ]

    @staticmethod
    def _matches_whole_word(word: str, text: str) -> bool:
        """Return True when ``word`` occurs in ``text`` delimited by word boundaries."""
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    def _analyze_sentence_types(self, doc) -> Dict[str, int]:
        """Tally the sentences of ``doc`` by type.

        Returns a dict with the keys ``imperative``, ``declarative``,
        ``interrogative`` and ``other``. ``other`` holds sentences whose root
        is a base-form verb with an explicit subject (e.g. "You may file a
        suit"); they used to be dropped from the tally, which shrank the
        denominator of the sentence ratios.
        """
        sentence_types = {'imperative': 0, 'declarative': 0, 'interrogative': 0, 'other': 0}
        for sent in doc.sents:
            root = sent.root
            if sent.text.strip().endswith('?'):
                sentence_types['interrogative'] += 1
            elif root.pos_ == 'VERB' and root.tag_ == 'VB':
                has_subject = any(child.dep_ in ('nsubj', 'nsubjpass') for child in root.children)
                sentence_types['other' if has_subject else 'imperative'] += 1
            else:
                sentence_types['declarative'] += 1
        return sentence_types

    def normalize_score(self, raw_score: float) -> int:
        """Scale a raw score by 1.6 and clamp the result to the range 0-100."""
        # Heuristic scaling to map density values to 0-100
        scaled = raw_score * 1.6
        return int(max(0.0, min(scaled, 100)))

    def predict(self, text: str, extracted_entities: List[str]) -> str:
        """Score ``text`` and return ``"Inf:<0-100> | Act:<0-100>"``.

        Args:
            text: the answer to score.
            extracted_entities: entity / key-phrase strings extracted upstream
                for this answer; their count feeds the informative density.

        Returns:
            The formatted score string; ``"Inf:0 | Act:0"`` when ``text`` has
            no word tokens.
        """
        doc = self.nlp(text)
        word_count = sum(1 for t in doc if not t.is_punct and not t.is_space)
        if word_count == 0:
            # Same format as every other result so downstream parsing is uniform.
            return "Inf:0 | Act:0"

        # --- 1. INFORMATIVE SCORE ---
        # (citation-pattern matches * 3) + (provided entities * 2) + (numbers)
        legal_citation_count = sum(
            len(re.findall(pattern, text, re.IGNORECASE))
            for pattern in self.citation_patterns
        )

        # The provided entities are used in place of spaCy PROPN tokens.
        entity_count = len(extracted_entities)

        num_count = sum(1 for token in doc if token.like_num or token.pos_ == 'NUM')

        inf_density = ((legal_citation_count * 3) + (entity_count * 2) + num_count) / word_count * 100

        # Bonus: entities that mention a legal term as a whole word, so that
        # "act" does not fire on "contract".
        entity_legal_score = 0
        for entity in extracted_entities:
            entity_lower = entity.lower()
            if any(self._matches_whole_word(term, entity_lower) for term in self.legal_terms):
                entity_legal_score += 1

        # --- 2. ACTIONABLE SCORE ---
        # Modals + second-person pronouns + action verbs + sequential markers.
        # Imperatives contribute through the sentence ratio in step 3.
        action_count = 0
        for token in doc:
            if token.tag_ == 'MD':  # modals (must/should)
                action_count += 2
            if token.lower_ in self.second_person_pronouns:
                action_count += 1
            if token.lemma_.lower() in self.action_verbs:
                action_count += 2

        # Each marker counts once if present as a whole word ("then" must not
        # fire on "strengthen").
        text_lower = text.lower()
        action_count += sum(
            1 for marker in self.sequential_markers
            if self._matches_whole_word(marker, text_lower)
        )

        act_density = action_count / word_count * 100

        # --- 3. SENTENCE STRUCTURE ---
        st = self._analyze_sentence_types(doc)
        total_sent = sum(st.values())
        imp_ratio = st['imperative'] / total_sent if total_sent > 0 else 0.0
        dec_ratio = st['declarative'] / total_sent if total_sent > 0 else 0.0

        # --- 4. FINAL SCORING ---
        raw_inf_score = (inf_density * 2.0) + (dec_ratio * 20) + (entity_legal_score * 5)
        raw_act_score = (act_density * 1.5) + (imp_ratio * 30)

        final_inf = self.normalize_score(raw_inf_score)
        final_act = self.normalize_score(raw_act_score)

        return f"Inf:{final_inf} | Act:{final_act}"

# --- PART 3: FILE PROCESSING ---

def parse_clean_list(cell_content):
    """Parse a spreadsheet cell holding a stringified list into a list of str.

    The cell is expected to look like ``['a', 'b']``, optionally followed by
    a statistics tail such as ``Precision: ...``. Missing cells (NaN/None)
    give an empty list.

    Parsing strategy:
      1. Drop the statistics tail. A keyword only counts as the start of the
         tail when it is followed by a colon, so an item such as
         'recall of witness' no longer truncates the list.
      2. Try ``ast.literal_eval`` on the bracketed span. This handles items
         that ``repr`` wrote with double quotes because they contain an
         apostrophe.
      3. Otherwise fall back to collecting every single- or double-quoted
         fragment.
    """
    if pd.isna(cell_content):
        return []
    s = str(cell_content)
    # Remove the stats "Precision: ..."
    s = re.sub(r'\b(?:Precision|Recall|F1-Score)\s*:.*', '', s,
               flags=re.DOTALL | re.IGNORECASE).strip()

    start, end = s.find('['), s.rfind(']')
    if start != -1 and end > start:
        try:
            parsed = ast.literal_eval(s[start:end + 1])
        except (ValueError, SyntaxError):
            parsed = None
        # Only trust the literal when it is a flat list of strings; anything
        # else (tuples, dicts, nested lists) goes through the regex fallback.
        if isinstance(parsed, (list, tuple)) and all(isinstance(item, str) for item in parsed):
            return list(parsed)

    # Extract text inside single or double quotes
    quoted = re.findall(r"'([^']*)'|\"([^\"]*)\"", s)
    return [single or double for single, double in quoted]

def _score_answer(classifier, answer, entities):
    """Return the score string for one answer cell, scoring "||"-separated parts separately."""
    empty_score = "Inf:0 | Act:0"

    if "||" in answer:
        # Parts shorter than 3 characters are treated as empty and skipped.
        parts = [part.strip() for part in answer.split("||")]
        part_scores = [classifier.predict(part, entities) for part in parts if len(part) >= 3]
        return " || ".join(part_scores) if part_scores else empty_score

    answer = answer.strip()
    if len(answer) < 3:
        return empty_score
    return classifier.predict(answer, entities)


def run_pipeline(
    folder_path: str = "/content/drive/MyDrive",
    input_filename: str = "Compare different Entity Extraction.xlsx",
    output_filename: str = "Compare different Entity Extraction_SCORED.xlsx",
) -> None:
    """Score every answer in the input workbook and save a copy with the scores.

    Reads ``input_filename`` from ``folder_path``, scores the 'Answer' column
    with ``RobustClassifier`` using the entities and key phrases found in the
    two extraction columns, stores the result in a new 'Automated_Scores'
    column, and writes the frame to ``output_filename`` in the same folder.

    Problems reading the file, or missing required columns, are reported with
    ``print`` and the function returns without writing anything.

    Args:
        folder_path: directory holding the input file and receiving the output.
        input_filename: name of the Excel workbook to read.
        output_filename: name of the Excel workbook to write.
    """
    answer_col = 'Answer'
    entity_col = '(Entity + Fact) Extraction by LLM'
    phrase_col = 'Key-Phrases Extraction (using KeyBERT)'

    # 1. Define Paths
    input_path = os.path.join(folder_path, input_filename)
    output_path = os.path.join(folder_path, output_filename)

    print(f"Reading file: {input_path}")
    try:
        df = pd.read_excel(input_path)
    except Exception as e:
        print(f"Error reading file: {e}")
        print("Make sure you added the shortcut to 'My Drive'!")
        return

    # Fail with a clear message instead of a KeyError in the middle of the loop.
    missing = [col for col in (answer_col, entity_col, phrase_col) if col not in df.columns]
    if missing:
        print(f"Error: missing expected column(s): {missing}")
        return

    # 2. Merged S.NO cells are read as NaN after the first row; forward-fill them.
    if 'S.NO' in df.columns:
        df['S.NO'] = df['S.NO'].ffill()

    classifier = RobustClassifier()
    scores = []

    print("Processing rows...")

    # 3. Score each row
    for answer_cell, entity_cell, phrase_cell in zip(df[answer_col], df[entity_col], df[phrase_col]):
        answer = "" if pd.isna(answer_cell) else str(answer_cell)

        # Combine entities and key phrases, de-duplicated in first-seen order.
        combined = parse_clean_list(entity_cell) + parse_clean_list(phrase_cell)
        all_entities = list(dict.fromkeys(combined))

        # 4. A cell may hold several answers separated by "||".
        scores.append(_score_answer(classifier, answer, all_entities))

    # 5. Save
    df['Automated_Scores'] = scores
    df.to_excel(output_path, index=False)
    print(f"Done. File saved to: {output_path}")

if __name__ == "__main__":
    # Run the full scoring pipeline when this cell/file is executed as a script.
    run_pipeline()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading file: /content/drive/MyDrive/Compare different Entity Extraction.xlsx
Processing rows...
Done. File saved to: /content/drive/MyDrive/Compare different Entity Extraction_SCORED.xlsx
