## 1. Setup and Libraries

In [1]:
!pip install transformers scikit-learn



In [2]:
import os
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import T5Tokenizer, T5ForConditionalGeneration

2024-12-07 03:29:00.710515: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Load Metadata and Preprocess

In [3]:
metadata_file = "metadata_new.json"

def load_metadata(file_path):
    """Load metadata from a JSON file."""
    with open(file_path, "r") as f:
        return json.load(f)

metadata = load_metadata(metadata_file)
print(f"Loaded {len(metadata)} metadata entries.")

Loaded 3777 metadata entries.


In [5]:
# Extract texts for TF-IDF
texts = [item['text'] for item in metadata]
print(f"Sample text: {texts[0][:200]}")

Sample text: DAWSON, District Judge.
Petitioner, by his guardian, ad litem, sets forth that he is unlawfully restrained of his liberty by Lieutenant Commander J. S. Newell, naval officer in charge at this station,


## 3. Generate TF-IDF Embeddings

In [6]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')

In [7]:
tfidf_matrix = vectorizer.fit_transform(texts)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (3777, 9787)


## 4. Query with TF-IDF

In [9]:
def tfidf_query(query, top_k=5):
    """
    Perform a query using TF-IDF and return top-k results.
    
    Args:
        query (str): The query text.
        top_k (int): Number of top results to return.

    Returns:
        list: Top-k results with metadata and scores.
    """
    # Transform the query to match the TF-IDF matrix
    query_vector = vectorizer.transform([query])

    # Calculate cosine similarity
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Sort scores in descending order and get top-k indices
    top_indices = scores.argsort()[-top_k:][::-1]
    
    # Gather top-k results
    results = []
    for idx in top_indices:
        results.append({
            "file": metadata[idx]["file"],
            "text_snippet": metadata[idx]["text"][:200],  # First 200 characters
            "score": scores[idx]
        })
    
    return results


## 5. Summarization with T5

In [10]:
t5_model_name = "t5-small"  # Use "t5-base" or "t5-large" for larger models
tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
model = T5ForConditionalGeneration.from_pretrained(t5_model_name)



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [11]:
def generate_summary(query, results):
    """
    Generate a summary for the query based on top results using T5.

    Args:
        query (str): The user query.
        results (list): Top results from the query function.

    Returns:
        str: Generated summary.
    """
    # Combine text from top results
    context = " ".join([res["text_snippet"] for res in results])

    if not context.strip():
        return "No relevant document content found for summarization."

    # Prepare input for T5 summarization
    input_text = f"question: {query} context: {context}"
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


## 6. Interactive Query System

In [12]:
def interactive_query():
    """
    Allow the user to enter their own query and return results and a summary.
    """
    while True:
        print("\n--- TF-IDF Query System ---")
        user_query = input("Enter your query (or type 'exit' to quit): ").strip()
        
        if user_query.lower() == 'exit':
            print("Exiting the query system. Goodbye!")
            break
        
        # Perform the query
        results = tfidf_query(user_query)
        
        # Display the results
        print("\nTop Results:")
        for res in results:
            print(f"File: {res['file']}, Score: {res['score']:.4f}")
            print(f"Text Snippet: {res['text_snippet']}\n")

        # Generate a summary for the query
        print("\nGenerating summary for the query...")
        summary = generate_summary(user_query, results)
        print("\nGenerated Summary:")
        print(summary)

In [13]:
interactive_query()


--- TF-IDF Query System ---


Enter your query (or type 'exit' to quit):  healthcare



Top Results:
File: 0001-01.json, Score: 0.0000
Text Snippet: DAWSON, District Judge.
Petitioner, by his guardian, ad litem, sets forth that he is unlawfully restrained of his liberty by Lieutenant Commander J. S. Newell, naval officer in charge at this station,

File: 0005-01.json, Score: 0.0000
Text Snippet:  all of them.
Every system of laws must necessarily contain defects, and cases often occur for which the law based on precedents and fixed rules affords no redress. It is in such cases only that equit

File: 0005-01.json, Score: 0.0000
Text Snippet: f cases where the law affords an ample remedy. Equity must necessarily have a place in every rational system of jurisprudence. It is impossible that any code or system of laws, however minute, should 

File: 0005-01.json, Score: 0.0000
Text Snippet: dy at law. To entitle the plaintiff to relief in equity and invoke remedial relief by injunction, it must be made to appear upon the face of the petition that he cannot redress his suppose

Enter your query (or type 'exit' to quit):  exit


Exiting the query system. Goodbye!
