<a href="https://colab.research.google.com/github/mimjdefender/canabinoid-game/blob/main/Untitled14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install PyMuPDF pandas scikit-learn nltk pytesseract summa bertopic
!sudo apt install tesseract-ocr

import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
import nltk
import os
import re
import pytesseract
import spacy
from PIL import Image
from summa import summarizer
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load spaCy's small English model for NER
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF efficiently using PyMuPDF and falls back to OCR if necessary."""
    text = ""
    with fitz.open(pdf_path) as pdf_doc:
        for page in pdf_doc:
            # Fallback to OCR for image-based PDFs
            pix = page.get_pixmap()
            img_path = f"page_{page.number}.png"
            pix.save(img_path)
            text += pytesseract.image_to_string(Image.open(img_path))
            os.remove(img_path)
    return text

def preprocess_text(text):
    """
    Preprocesses text by lowercasing, tokenizing, and removing stopwords.
    Adjusted to be less aggressive and preserve more meaningful words.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # Reduce stopwords list - fine-tune this list as needed
    stop_words = stop_words - set(['case', 'fine', 'amount', 'business', 'date'])
    tokens = nltk.word_tokenize(text.lower())
    # Allow some punctuation - adjust this regex as needed
    tokens = [word for word in tokens if re.match(r'\b[a-zA-Z0-9.,!?]+\b', word) and word not in stop_words]
    return " ".join(tokens)

def extract_entities(text):
    """Extracts case-related entities using spaCy NER."""
    doc = nlp(text)
    entities = {
        "Case Number": None,
        "Business Name": None,
        "Fine Amount": None,
        "Date": None
    }
    for ent in doc.ents:
        if ent.label_ == "MONEY":
            entities["Fine Amount"] = ent.text
        elif ent.label_ == "ORG":
            entities["Business Name"] = ent.text
        elif ent.label_ == "DATE":
            entities["Date"] = ent.text
        elif re.search(r'\b(?:Case|Docket|File)\s*#?\s*(\d+)', ent.text, re.IGNORECASE):
            entities["Case Number"] = ent.text
    return entities

def summarize_case(text, topic_model):
    """
    Generates user-friendly and programmatic summaries.
    Increased ratio for programmatic summary to potentially provide more content.
    """
    summary_programmatic = summarizer.summarize(text, ratio=0.2)  # Increased ratio from 0.1
    summary_user = summarizer.summarize(text, ratio=0.3)

    # Extract dominant topic
    topics, _ = topic_model.transform([text])
    dominant_topic = topic_model.get_topic(topics[0])

    # Include topic keywords in the user summary
    if dominant_topic:
        topic_keywords = ", ".join([word for word, _ in dominant_topic[:5]])
        summary_user = f"**Dominant Topic Keywords:** {topic_keywords}\n\n{summary_user}"

    return summary_programmatic, summary_user

def detect_fines(text):
    """Extracts fine amounts with stricter criteria to improve accuracy."""
    # Modified pattern to capture potential false positives in a separate group
    fine_amount_pattern = r"(\b[A-Za-z]{3}\s\d\s[A-Za-z]{2}|\b[A-Z0-9]{1,8})?\s*(?:fine(?:\s+\w+){0,10})\s*\(\s*Dollars\s*\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\s*\)"

    fine_amounts_with_context = re.findall(fine_amount_pattern, text, re.IGNORECASE)

    # Filter out matches with potential false positives
    filtered_fine_amounts = [amount for context, amount in fine_amounts_with_context if not context]

    return filtered_fine_amounts if filtered_fine_amounts else None


# --- Main Execution ---
pdf_folder = "/root/pdf_downloads"  # Change to your actual folder

# 1. Initialize BERTopic model
topic_model = BERTopic()
document_data = []  # Placeholder for the data

# 2. Process the PDFs to get the data. This step now ONLY extracts and preprocesses data.
#    It does NOT perform summarization or topic modeling.
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, filename)
        print(f"Processing {filename}...")

        text = extract_text_from_pdf(pdf_path)
        processed_text = preprocess_text(text)
        entities = extract_entities(text)

        # Extract fine amounts
        fine_amounts = detect_fines(text)
        if fine_amounts:
            entities["Fine Amount"] = ", ".join(fine_amounts)

        document_data.append({
            "Filename": filename,
            "Document Text": processed_text,
            "Case Number": entities["Case Number"],
            "Business Name": entities["Business Name"],
            "Fine Amount": entities["Fine Amount"],
            "Date": entities["Date"]
        })

# 3. Now fit the BERTopic model with actual data.
all_texts = [doc["Document Text"] for doc in document_data]
topic_model.fit(all_texts)

# 4. Now perform summarization, including topic modeling for each document.
for doc in document_data:
    # Summarization
    summary_programmatic, summary_user = summarize_case(doc["Document Text"], topic_model)  # Pass document text here
    doc["Summary Programmatic"] = summary_programmatic  # Add summaries to the document data
    doc["Summary User"] = summary_user

    topics, _ = topic_model.transform([doc["Document Text"]])
    doc["Dominant Topic"] = topic_model.get_topic(topics[0])

# Store data in DataFrame and save as CSV
df = pd.DataFrame(document_data)
df.to_csv("cra_case_summaries.csv", index=False)
# Display the 'Fine Amount' column
print(df[['Filename', 'Fine Amount']])  # Select and display Filename and Fine Amount columns
# --- Cosine Similarity Analysis ---
vectorizer = TfidfVectorizer()

# Filter out empty summaries
df_filtered = df[df['Summary Programmatic'] != ""]
# If df_filtered is empty, there are no summaries to compare, you'll need to debug why
if df_filtered.empty:
    print("No summaries found for similarity analysis. Check summarization results.")
else:
    tfidf_matrix = vectorizer.fit_transform(df_filtered['Summary Programmatic'])

    cosine_similarities = cosine_similarity(tfidf_matrix)

    # Find similar documents
    similarity_threshold = 0.7
    similar_docs = {}

    for i, filename in enumerate(df_filtered["Filename"]):  # Iterate through filtered DataFrame
        similar_indices = [
            j for j, similarity in enumerate(cosine_similarities[i])
            if similarity > similarity_threshold and j != i
        ]
        similar_docs[filename] = df_filtered.iloc[similar_indices]["Filename"].tolist()  # Get filenames from filtered DataFrame

    # Display Similar Documents
    print("\nSimilar Cases Found:")
    for doc, similar_cases in similar_docs.items():
        if similar_cases:
            print(f"{doc} is similar to: {', '.join(similar_cases)}")
        else:
            print(f"{doc} has no strong similarities.")



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing Premier-Botanics-LLC-AUMB000106-Consent-Order-and-Stipulation-and-Formal-Complaint.pdf...
Processing Smoke-UP-Farms-LLC-dba-Tree-of-Life-Cannabis-AUGC001018-Consent-Order-and-Stipulation-and-Formal-Com.pdf...
Processing STAR-BUDZ-PROVISIONING-CENTER-LLC-dba-Star-Budz-AUR000611-Consent-Order-and-Stipulation-and-Formal-C.pdf...
Processing Compassionate-Advisors--Pinconning-LLC-dba-Pincanna-AUGC000270-Consent-Order-and-Stipulation-and-For.pdf...
Processing Revolution-Strains-Inc-dba-Nirvana-Center-Processing-PR000295-Consent-Order-and-Stipulation-and-Form.pdf...
Processing Revolution-Strains-Inc-dba-Nirvana-Center-Processing-AUP000286-Consent-Order-and-Stipulation-and-For.pdf...
Processing DJR-Michigan-Properties-LLC-dba-High-Level-Health-AUGC000331-Consent-Order-and-Stipulation-and-Forma.pdf...
Processing DJR-Michigan-Properties-LLC-dba-High-Level-Health-GRC000182-Consent-Order-and-Stipulation-and-Formal.pdf...
Processing 305-Farms-LLC-AUGC000793-Consent-Order-and-Stipulation-