<a href="https://colab.research.google.com/github/mjeevawork-art/CLAUSEWISE-LEGAL-WRITE/blob/main/JEEVA's_Clausewise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ============================
# ClauseWise with OCR Support (Colab Ready)
# ============================

!apt-get install -y tesseract-ocr
!pip install pytesseract pillow transformers accelerate gradio PyPDF2 python-docx reportlab

import gradio as gr
import torch, re, os, io
import PyPDF2, docx
from PIL import Image
import pytesseract
from transformers import AutoTokenizer, AutoModelForCausalLM

# ============================
# Analyzer Class
# ============================
class ClauseWiseLegalAnalyzer:
    """Legal-document helper combining an IBM Granite LLM, regexes and OCR.

    Constructing an instance downloads/loads the Granite tokenizer and model.
    Every public method returns a plain string so it can be bound directly to
    a Gradio output component; failures are reported as "⚠ ..." / "❌ ..."
    messages instead of raised exceptions.
    """

    MODEL_NAME = "ibm-granite/granite-3.3-2b-instruct"

    # Keywords for classification. Kept as a class attribute so that
    # ``instance.doc_type_keywords`` is always available.
    doc_type_keywords = {
        "NDA": ["non-disclosure", "confidentiality", "trade secret"],
        "Employment Contract": ["employee", "employer", "salary", "termination"],
        "Lease Agreement": ["lease", "rent", "tenant", "landlord", "property"],
        "Service Agreement": ["services", "contractor", "deliverables", "scope of work"],
        "Purchase Agreement": ["purchase", "buyer", "seller", "goods", "delivery"],
        "Partnership Agreement": ["partner", "partnership", "profit sharing"],
        "License Agreement": ["license", "licensor", "licensee", "intellectual property"]
    }

    # Numeric dates such as 12/31/2024 or 1-2-24.
    _DATE_RE = re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b")
    # "$1,234.56" style amounts, or "1,234.56 USD" / "500 dollars" style amounts.
    # The thousands group is repeated zero or more times so that both plain
    # numbers and multi-group numbers (1,000,000) match in full.
    _MONEY_RE = re.compile(
        r"\$\d+(?:,\d{3})*(?:\.\d+)?"
        r"|\d+(?:,\d{3})*(?:\.\d{2})?\s(?:USD|dollars?)"
    )

    def __init__(self):
        """Load the Granite tokenizer and model (placed via ``device_map="auto"``)."""
        print("📥 Loading IBM Granite model...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_NAME,
            trust_remote_code=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_NAME,
            trust_remote_code=True,
            device_map="auto"
        )
        print("✅ Granite model loaded!")

    @staticmethod
    def _is_blank(text):
        """Return True when *text* is None, empty, or whitespace only."""
        return not text or not text.strip()

    def generate_response(self, prompt, max_tokens=400):
        """Sample a completion for *prompt* and return only the generated text.

        Args:
            prompt: Text fed to the model.
            max_tokens: Upper bound on newly generated tokens.

        Returns:
            The decoded completion with special tokens removed.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
        # A causal LM returns prompt tokens followed by the new tokens; slice
        # the prompt off so callers do not get their own prompt echoed back.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = outputs[0][prompt_length:]
        return self.tokenizer.decode(completion, skip_special_tokens=True).strip()

    # -------------------------
    # File Handling
    # -------------------------
    def extract_text_from_file(self, filepath):
        """Return the text of a .pdf, .docx or .txt file.

        Args:
            filepath: Path to the uploaded file, or None when nothing was uploaded.

        Returns:
            The extracted text, "" for None, or a "❌"/"⚠" message when the
            format is unsupported or reading fails.
        """
        if filepath is None:
            return ""
        ext = os.path.splitext(filepath)[-1].lower()
        readers = {
            ".pdf": self.extract_text_from_pdf,
            ".docx": self.extract_text_from_docx,
            ".txt": self.extract_text_from_txt,
        }
        reader = readers.get(ext)
        if reader is None:
            return "❌ Unsupported format."
        try:
            return reader(filepath)
        except Exception as e:
            # UI boundary: surface the problem as text rather than crashing the app.
            return f"⚠ Error: {str(e)}"

    def extract_text_from_pdf(self, filepath):
        """Return the newline-joined text of every PDF page that yields text."""
        with open(filepath, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Extract each page once (extraction is the expensive step) while
            # the file handle is still open.
            page_texts = [page.extract_text() for page in reader.pages]
        return "\n".join(t for t in page_texts if t).strip()

    def extract_text_from_docx(self, filepath):
        """Return the non-empty paragraphs of a .docx file joined by newlines."""
        doc = docx.Document(filepath)
        return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])

    def extract_text_from_txt(self, filepath):
        """Return the full contents of a UTF-8 text file."""
        with open(filepath, "r", encoding="utf-8") as f:
            return f.read()

    def extract_text_from_image(self, image):
        """Run Tesseract OCR on an image file.

        Args:
            image: Path (or file object) accepted by ``PIL.Image.open``, or
                None when no image was provided.

        Returns:
            The recognised text, "" for None, or a "⚠" message when no text is
            found or OCR fails.
        """
        if image is None:
            return ""
        try:
            # Context manager releases the underlying file handle after OCR.
            with Image.open(image) as img:
                text = pytesseract.image_to_string(img)
            text = text.strip() if text else ""
            return text or "⚠ No text found in image."
        except Exception as e:
            return f"⚠ OCR Error: {str(e)}"

    # -------------------------
    # Features
    # -------------------------
    def simplify_clause(self, text):
        """Ask the model to restate a legal clause in simpler language."""
        if self._is_blank(text):
            return "⚠ Provide text."
        prompt = f"Simplify this legal clause:\n\n{text}\n\nSimplified:"
        return self.generate_response(prompt, max_tokens=200)

    def _find_dates(self, text):
        """Return all numeric dates (d/m/y or d-m-y shaped) found in *text*."""
        return self._DATE_RE.findall(text)

    def _find_money(self, text):
        """Return all monetary amounts found in *text*, in order of appearance."""
        return self._MONEY_RE.findall(text)

    def extract_named_entities(self, text):
        """Report regex-detected dates/amounts plus model-extracted entities.

        Only the first 800 characters are sent to the model; the regexes scan
        the whole text.
        """
        if self._is_blank(text):
            return "⚠ Provide text."
        dates = self._find_dates(text)
        money = self._find_money(text)
        prompt = f"Extract entities (Parties, Terms, Obligations) from:\n{text[:800]}"
        ai_entities = self.generate_response(prompt, max_tokens=300)
        return f"*Dates:* {dates or 'None'}\n*Money:* {money or 'None'}\n\n*AI Extracted:*\n{ai_entities}"

    def extract_clauses(self, text):
        """Ask the model to split the first 1200 characters into numbered clauses."""
        if self._is_blank(text):
            return "⚠ Provide text."
        prompt = f"Break this into numbered clauses:\n{text[:1200]}"
        return self.generate_response(prompt, max_tokens=500)

    def _keyword_classify(self, text):
        """Return the document type with the most keyword hits, or "Unknown".

        Matching is case-insensitive substring search. When no keyword occurs
        at all, "Unknown" is returned instead of an arbitrary first category.
        """
        lowered = text.lower()
        scores = {
            doc: sum(k in lowered for k in kws)
            for doc, kws in self.doc_type_keywords.items()
        }
        best = max(scores, key=scores.get)
        return best if scores[best] > 0 else "Unknown"

    def classify_document(self, text):
        """Classify a document by keyword count and by asking the model.

        Only the first 600 characters are sent to the model; the keyword
        match scans the whole text.
        """
        if self._is_blank(text):
            return "⚠ Provide text."
        keyword_class = self._keyword_classify(text)
        prompt = f"Classify this document type:\n{text[:600]}"
        ai_class = self.generate_response(prompt, max_tokens=200)
        return f"*Keyword Match:* {keyword_class}\n\n*AI Classification:* {ai_class}"

    def comprehensive_analysis(self, text):
        """Ask the model for provisions, risks and recommendations (first 1500 chars)."""
        if self._is_blank(text):
            return "⚠ Provide text."
        prompt = f"Summarize key provisions, risks, and recommendations:\n{text[:1500]}"
        return self.generate_response(prompt, max_tokens=600)

# ============================
# Build Gradio UI
# ============================
# Module-level analyzer instance; create_interface() binds its methods as the
# Gradio button callbacks.
analyzer = ClauseWiseLegalAnalyzer()

def create_interface():
    """Assemble the ClauseWise Gradio app and return the Blocks object (not launched).

    The layout has two extraction tabs (document upload and image OCR) followed
    by five analysis tabs, each with an input box, an output box and a button
    bound to a method of the module-level ``analyzer``. Whenever either
    extraction result changes, its text is copied into every analysis input.
    """
    # One row per analysis tab:
    # (tab title, input label, input lines, output label, output lines, button label, handler)
    feature_specs = [
        ("✨ Simplify Clause", "Clause", 5, "Simplified", 5, "Simplify",
         analyzer.simplify_clause),
        ("🏷 Entity Extraction", "Text", 6, "Entities", 8, "Extract Entities",
         analyzer.extract_named_entities),
        ("📋 Clause Breakdown", "Document", 6, "Clauses", 8, "Extract Clauses",
         analyzer.extract_clauses),
        ("📊 Classification", "Document", 6, "Classification", 8, "Classify",
         analyzer.classify_document),
        ("🔍 Comprehensive Analysis", "Document", 6, "Analysis", 10, "Analyze",
         analyzer.comprehensive_analysis),
    ]

    def fan_out(value):
        # One copy of the text per analysis input box.
        return [value] * len(feature_specs)

    with gr.Blocks() as demo:
        gr.Markdown("## ⚖ ClauseWise – AI Legal Document Analyzer (IBM Granite + OCR)")

        with gr.Tab("📄 Upload Document"):
            file_input = gr.File(label="Upload File (PDF/DOCX/TXT)", type="filepath")
            extract_btn = gr.Button("Extract Text")
            extracted_text = gr.Textbox(label="Extracted Text", lines=10)

        with gr.Tab("🖼 Image OCR"):
            img_input = gr.Image(label="Upload Image (JPG/PNG)", type="filepath")
            ocr_btn = gr.Button("Run OCR")
            ocr_text = gr.Textbox(label="Extracted Text from Image", lines=10)

        # Create the analysis tabs; remember the widgets so events can be
        # attached afterwards.
        wired = []
        for title, in_label, in_lines, out_label, out_lines, btn_label, handler in feature_specs:
            with gr.Tab(title):
                source_box = gr.Textbox(label=in_label, lines=in_lines)
                result_box = gr.Textbox(label=out_label, lines=out_lines)
                trigger = gr.Button(btn_label)
            wired.append((trigger, handler, source_box, result_box))

        # Bindings
        extract_btn.click(fn=analyzer.extract_text_from_file, inputs=file_input, outputs=extracted_text)
        ocr_btn.click(fn=analyzer.extract_text_from_image, inputs=img_input, outputs=ocr_text)
        for trigger, handler, source_box, result_box in wired:
            trigger.click(fn=handler, inputs=source_box, outputs=result_box)

        # Auto-fill extracted/ocr text into other tabs
        feature_inputs = [source_box for _, _, source_box, _ in wired]
        for origin in (extracted_text, ocr_text):
            origin.change(fan_out, inputs=origin, outputs=list(feature_inputs))
    return demo

# Build the UI and start serving it; share=True asks Gradio for a public share link.
demo = create_interface()
demo.launch(share=True)

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting reportlab
  Downloading reportlab-4.4.4-py3-none-any.whl.metadata (1.7 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading re

