In [None]:
!pip install PyPDF2
!pip install pdfplumber
!pip install pymupdf
!pip install pandas
!rm -r /content/extracted_pdfs


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/232.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pdfplumber
  Downloading pdfplumber-0.10.3-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.0/49.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20221105 (from pdfplumber)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import pandas as pd
import fitz  # PyMuPDF

def extract_info_from_pdf(pdf_path):
    pdf_document = fitz.open(pdf_path)
    text = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document[page_num]
        text += page.get_text()
    pdf_document.close()
    return text

def extract_data_from_text(text):
    sections = re.split(r'(?:IN THE SUPREME COURT OF INDIA CRIMINAL ORIGINAL JURISDICTION|Supreme Court of India|IN THE HIGH COURT OF JUDICATURE AT BOMBAY|Bombay High Court)', text)[1:]
    case_info = {'case_number': '', 'Year': '', 'Petitioner': '', 'Respondent': '',
                 'Judge Name': '', 'Lawyer Name': '', 'case_type': ''}

    for section in sections:
        case_info_match = re.search(r'(\d+) OF (\d+)', section)
        if case_info_match:
            case_info['case_number'] = case_info_match.group(1)
            case_info['Year'] = case_info_match.group(2)

        for line in section.split('\n'):
            if 'Bench:' in line:
                judge_match = re.search(r'Bench:\s*([^,]+(?:,\s*[^,]+)*)', line)
                if judge_match:
                    case_info['Judge Name'] = judge_match.group(1).strip()

            if 'vs' in line and 'on' in line and any(char.isdigit() for char in line):
                parts = line.split('vs')
                case_info['Petitioner'] = parts[0].strip()
                case_info['Respondent'] = parts[1].split('on')[0].strip()

            if any(keyword in line for keyword in ['ORDINARY', 'SPECIAL', 'ORIGINAL', 'CIVIL', 'JURISDICTION']):
                case_info['case_type'] = line.strip()

    return case_info

# Specify the folder paths
folder_paths = ['/content/drive/MyDrive/SUPREME COURT', '/content/drive/MyDrive/CourtCase']

csv_file_path = 'output.csv'

# Initialize an empty list to store the data
data_list = []

# Process each PDF file from both folders
for folder_path in folder_paths:
    for pdf_file in os.listdir(folder_path):
        if pdf_file.lower().endswith('.pdf'):
            pdf_path = os.path.join(folder_path, pdf_file)

            # Skip empty files
            if os.path.getsize(pdf_path) == 0:
                print(f"Skipping empty file: {pdf_file}")
                continue

            # Extract text from the PDF
            text = extract_info_from_pdf(pdf_path)

            # Extract data from the text
            extracted_data = extract_data_from_text(text)

            # Add PDF file name and folder name to the extracted data
            extracted_data['PDF File'] = pdf_file
            extracted_data['Folder'] = os.path.basename(folder_path)

            # Append the data to the list
            data_list.append(extracted_data)

# Create a DataFrame from the list of extracted data
df = pd.DataFrame(data_list)

# Save the DataFrame to CSV
df.to_csv(csv_file_path, index=False)

print(f'PDF file names and processed data are written to {csv_file_path}.')


Skipping empty file: M_S_Capital_First_Ltd_vs_Sandeep_Sawant_on_3_January_2023.PDF
PDF file names and processed data are written to output.csv.


In [None]:
import pandas as pd
from transformers import pipeline

# Load CSV data into a DataFrame
df = pd.read_csv("/content/output.csv")

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    try:
        # Split the question into parts
        parts = question.split(":")

        # Check if there are enough parts
        if len(parts) < 2:
            raise ValueError("Invalid question format. Please use the format 'Key Column:XXXX, Target Column:?'.")

        # Extract key and target information
        key_column_parts = parts[0].split(",")
        if len(key_column_parts) < 2:
            raise ValueError("Invalid question format. Please provide a value for the key column.")

        key_column = key_column_parts[0].strip()
        key_value = key_column_parts[1].strip()
        target_column = parts[1].strip()

        # Process the question and retrieve the answer (replace this with your logic)
        answer = f"Processing question: Key Column={key_column}, Key Value={key_value}, Target Column={target_column}"

        return answer

    except ValueError as e:
        return f"Error: {e}"

# Example usage
while True:
    question = input("Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): ")

    if question.lower() == 'quit':
        break

    answer = answer_question(question)
    print(answer)



Error: Invalid question format. Please provide a value for the key column.
Error: Invalid question format. Please provide a value for the key column.
Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): quit


In [None]:
import pandas as pd
from transformers import pipeline

# Load CSV data into a DataFrame
df = pd.read_csv("/content/output.csv")

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    """
    Answers a question about the case data, handling various error scenarios and formatting the response.

    Args:
        question (str): The question to answer, in the format "Case Number:XXXX, Column Name:?"

    Returns:
        str: The formatted answer to the question, or an appropriate error message if applicable.
    """

    # Extract case number and target column from question
    parts = question.split(":")
    case_number_str = parts[1].split(",")[0].strip()
    case_number = int(case_number_str)
    target_column = parts[1].split(",")[1].strip()

    # Filter data based on case number
    relevant_documents = df[df["case_number"] == case_number]

    # Answer question using the filtered DataFrame
    if not relevant_documents.empty:
        if target_column in relevant_documents.columns:
            if pd.api.types.is_string_dtype(relevant_documents[target_column]):
                combined_text = relevant_documents[target_column].str.cat(sep='\n')
            else:
                combined_text = relevant_documents[target_column].to_string(index=False)
            answer = qa_pipeline({'context': combined_text, 'question': question})['answer']

            # Format the answer
            answer = f"The {target_column.lower()} of Case Number {case_number} is {answer}."
        else:
            answer = f"Error: Column '{target_column}' not found in the data."
    else:
        answer = f"Error: No documents found for case number {case_number}."

    return answer

# Interactive loop for user input
while True:
    question = input("Enter your question in the format 'Case Number:XXXX, Column Name:?' (or type 'quit' to exit): ")
    if question.lower() == "quit":
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Case Number:XXXX, Column Name:?' (or type 'quit' to exit): Case Number:739, Column Name:case_type
Error: Column 'Column Name' not found in the data.
Enter your question in the format 'Case Number:XXXX, Column Name:?' (or type 'quit' to exit): Case Number:739, case_type:?
The case_type of Case Number 739 is 
CIVIL APPEAL NO. 740 OF 2023.


ValueError: invalid literal for int() with base 10: 'ORDINARY ORIGINAL CIVIL JURISDICTION'

In [None]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
df = pd.read_csv('/content/output.csv')

def answer_question(question):
    try:
        # 1. Inspect CSV Data
        # - Check if the values "3750" and "Year" exist in the respective columns.
        # - Ensure no leading/trailing spaces or unexpected characters in column names or values.
        # - Verify the consistent data types for "case_number" and "Year" (likely integers).

        # 2. Debug Code Execution
        # - Print df.head() after loading the CSV to examine the data frame structure.
        print("Data Frame Head:")
        print(df.head())

        # Split the question into parts
        parts = question.split(":")

        # Check if there are enough parts
        if len(parts) < 2:
            raise ValueError("Invalid question format. Please use the format 'Key Column:XXXX, Target Column:?'.")

        # Extract key and target information
        key_column_parts = parts[0].split(",")
        if len(key_column_parts) < 2:
            raise ValueError("Invalid question format. Please provide a value for the key column.")

        key_column = key_column_parts[0].strip()
        key_value = key_column_parts[1].strip()
        target_column = parts[1].strip()

        # Convert key_value to numeric if possible (handles NaN gracefully)
        key_value = pd.to_numeric(key_value, errors='coerce')

        # Process the question and retrieve the answer (replace this with your logic)
        relevant_documents = df[df[key_column] == key_value]
        answer = f"Processing question: Key Column={key_column}, Key Value={key_value}, Target Column={target_column}"

        return answer

    except ValueError as e:
        return f"Error: {e}"

# Example usage
while True:
    question = input("Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): ")

    if question.lower() == 'quit':
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): case_number:3750, Year:?
Data Frame Head:
   case_number    Year                           Petitioner  \
0       3750.0  2022.0            Asian Hotels (North) Ltd.   
1       4703.0  2022.0        U.N. Krishnamurthy (Since ...   
2          NaN     NaN                                Ajmal   
3      11474.0  2018.0               Amarendra Kumar Pandey   
4        739.0  2017.0  Shahaja @ Shahajan Ismail Mohd. ...   

                 Respondent                                Judge Name  \
0          Alok Kumar Lodha                M.R. Shah, B.V. Nagarathna   
1        A.M. Krishnamurthy  Hon'Ble Ms. Banerjee, V. Ramasubramanian   
2       The State Of Kerala                 Hemant Gupta, Vikram Nath   
3                       Uni           A.M. Khanwilkar, J.B. Pardiwala   
4  The State Of Maharashtra           A.M. Khanwilkar, J.B. Pardiwala   

   Lawyer Name                              C

KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
df = pd.read_csv('/content/output.csv')

def answer_question(question):
    try:
        # 1. Inspect CSV Data
        # - Check if the values "3750" and "Year" exist in the respective columns.
        # - Ensure no leading/trailing spaces or unexpected characters in column names or values.
        # - Verify the consistent data types for "case_number" and "Year" (likely integers).

        # 2. Debug Code Execution
        # - Print df.head() after loading the CSV to examine the data frame structure.
        print("Data Frame Head:")
        print(df.head())

        # Corrected question parsing
        parts = question.split(",", 1)

        # Check if there are enough parts
        if len(parts) < 2:
            raise ValueError("Invalid question format. Please use the format 'Key Column:XXXX, Target Column:?'.")

        # Extract key and target information
        key_column_value = parts[0].strip()  # Contains both key column and value
        target_column = parts[1].strip()

        key_column_parts = key_column_value.split(":")
        key_column = key_column_parts[0].strip()
        key_value = key_column_parts[1].strip()

        # Process the question and retrieve the answer (replace this with your logic)
        relevant_documents = df[df[key_column] == key_value]
        answer = f"Processing question: Key Column={key_column}, Key Value={key_value}, Target Column={target_column}"

        return answer

    except ValueError as e:
        return f"Error: {e}"

# Example usage
while True:
    question = input("Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): ")

    if question.lower() == 'quit':
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): case_number : 3750, Year : ?
Data Frame Head:
   case_number    Year                           Petitioner  \
0       3750.0  2022.0            Asian Hotels (North) Ltd.   
1       4703.0  2022.0        U.N. Krishnamurthy (Since ...   
2          NaN     NaN                                Ajmal   
3      11474.0  2018.0               Amarendra Kumar Pandey   
4        739.0  2017.0  Shahaja @ Shahajan Ismail Mohd. ...   

                 Respondent                                Judge Name  \
0          Alok Kumar Lodha                M.R. Shah, B.V. Nagarathna   
1        A.M. Krishnamurthy  Hon'Ble Ms. Banerjee, V. Ramasubramanian   
2       The State Of Kerala                 Hemant Gupta, Vikram Nath   
3                       Uni           A.M. Khanwilkar, J.B. Pardiwala   
4  The State Of Maharashtra           A.M. Khanwilkar, J.B. Pardiwala   

   Lawyer Name                           

KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
df = pd.read_csv('/content/output.csv')

# Create a dictionary to map numbers to column names
column_map = {
    1: "case_number",
    2: "Year",
    3: "Petitioner",
    4: "Respondent",
    5: "Judge Name",
    6: "Lawyer Name",
    7: "Case Type",
    # Add mappings for other columns as needed
}

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    try:
        # Extract key and target information using the mapping
        key_column_number, key_value, target_column_number = map(int, question.split(","))
        key_column = column_map.get(key_column_number)
        target_column = column_map.get(target_column_number)

        # Check if the column numbers are valid
        if not key_column or not target_column:
            raise ValueError("Invalid column numbers. Please use valid numbers for columns.")

        # Process the question
        relevant_documents = df[df[key_column] == str(key_value)]  # Convert key_value to string
        if not relevant_documents.empty and target_column in relevant_documents.columns:
            combined_text = relevant_documents[target_column].astype(str).str.cat(sep='\n')
            answer = qa_pipeline({'context': combined_text, 'question': target_column})['answer']
            return f"The {target_column.lower()} of {key_column}:{key_value} is {answer}."
        else:
            return f"Error: No documents found for {key_column}:{key_value} or column '{target_column}' not found in the data."

    except ValueError as e:
        return f"Error: {e}"

# Example usage
while True:
    question = input("Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): ")

    if question.lower() == 'quit':
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): 1,3750,2
Error: No documents found for case_number:3750 or column 'Year' not found in the data.


KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
from transformers import pipeline

# Load the CSV file
df = pd.read_csv('/content/output.csv')

# Clean column names (remove whitespaces and make lowercase for consistency)
df.columns = df.columns.str.strip().str.lower()

# Create a dictionary to map numbers to column names
column_map = {
    1: "case_number",
    2: "year",  # Ensure this matches the lowercase column name
    3: "petitioner",
    4: "respondent",
    5: "judge name",
    6: "lawyer name",
    7: "case type",
    # Add mappings for other columns as needed
}

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    try:
        # Extract key and target information using the mapping
        key_column_number, key_value, target_column_number = map(int, question.split(","))
        key_column = column_map.get(key_column_number)
        target_column = column_map.get(target_column_number)

        # Check if the column numbers are valid
        if not key_column or not target_column:
            raise ValueError("Invalid column numbers. Please use valid numbers for columns.")

        # Process the question
        print(f"Key Column: {key_column}, Key Value: {key_value}, Target Column: {target_column}")

        # Make the key column name case-insensitive (already done)
        key_column_lower = key_column.lower()
        relevant_documents = df[df[key_column_lower] == str(key_value)]

        # Print data for debugging
        print("DataFrame head:")
        print(df.head())
        print("DataFrame shape:")
        print(df.shape)
        print("Relevant documents:")
        print(relevant_documents)

        if not relevant_documents.empty and target_column in relevant_documents.columns:
            combined_text = relevant_documents[target_column].astype(str).str.cat(sep='\n')
            print("Combined Text:")
            print(combined_text)

            answer = qa_pipeline({'context': combined_text, 'question': target_column})['answer']
            return f"The {target_column.lower()} of {key_column}:{key_value} is {answer}."
        else:
            return f"Error: No documents found for {key_column}:{key_value} or column '{target_column}' not found in the data."

    except ValueError as e:
        return f"Error: {e}"

# Example usage (unchanged)
# ...


# Example usage
while True:
    question = input("Enter your question in the format 'Key Column:XXXX, Target Column:?' (or type 'quit' to exit): ")

    if question.lower() == 'quit':
        break

    answer = answer_question(question)
    print(answer)


Exception ignored in: <function _xla_gc_callback at 0x7cacbf4ef1c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
!pip install --upgrade tensorflow
!pip install --upgrade transformers




In [None]:
import pandas as pd
from transformers import pipeline

# Load CSV data into a DataFrame
df = pd.read_csv("/content/output.csv")

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    """
    Answers a question about the case data, handling various error scenarios and formatting the response.

    Args:
        question (str): The question to answer, in the format "Case Type:XXXX, Column Name:?"

    Returns:
        str: The formatted answer to the question, or an appropriate error message if applicable.
    """

    # Extract case type and target column from question
    parts = question.split(":")
    case_type = parts[1].split(",")[0].strip()
    target_column = parts[1].split(",")[1].strip()

    # Filter data based on case type
    relevant_documents = df[df["case_type"] == case_type]

    # Answer question using the filtered DataFrame
    if not relevant_documents.empty:
        if target_column in relevant_documents.columns:
            if pd.api.types.is_string_dtype(relevant_documents[target_column]):
                combined_text = relevant_documents[target_column].str.cat(sep='\n')
            else:
                combined_text = relevant_documents[target_column].to_string(index=False)
            answer = qa_pipeline({'context': combined_text, 'question': question})['answer']

            # Format the answer
            answer = f"The {target_column.lower()} of Case Type {case_type} is {answer}."
        else:
            answer = f"Error: Column '{target_column}' not found in the data."
    else:
        answer = f"Error: No documents found for case type {case_type}."

    return answer

# Interactive loop for user input
while True:
    question = input("Enter your question in the format 'Case Type:XXXX, Column Name:?' (or type 'quit' to exit): ")
    if question.lower() == "quit":
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Case Type:XXXX, Column Name:?' (or type 'quit' to exit): Case Type:ORDINARY ORIGINAL CIVIL JURISDICTION, case_number:?
The case_number of Case Type ORDINARY ORIGINAL CIVIL JURISDICTION is 211.0
 .


KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
from transformers import pipeline

# Load CSV data into a DataFrame
df = pd.read_csv("/content/output.csv")

# Create a question-answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question(question):
    """
    Answers a question about the case data, handling various error scenarios and formatting the response.

    Args:
        question (str): The question to answer, in the format "Case Type:XXXX, Column Name:?"

    Returns:
        str: The formatted answer to the question, or an appropriate error message if applicable.
    """

    # Extract case type and target column from question
    parts = question.split(":")
    case_type = parts[1].split(",")[0].strip()
    target_column = parts[1].split(",")[1].strip()

    # Filter data based on case type
    relevant_documents = df[df["case_type"] == case_type]

    # Answer question using the filtered DataFrame
    if not relevant_documents.empty:
        if target_column in relevant_documents.columns:
            answers = []
            for index, row in relevant_documents.iterrows():
                case_number = row["case_number"]
                combined_text = str(row[target_column])  # Convert float to string if necessary
                answer = qa_pipeline({'context': combined_text, 'question': f"Case Number:{case_number}, {target_column}:?"})['answer']
                answers.append(f"The {target_column.lower()} of Case Number {case_number} is {answer}.")

            answer = "\n".join(answers)  # Combine individual answers
        else:
            answer = f"Error: Column '{target_column}' not found in the data."
    else:
        answer = f"Error: No documents found for case type {case_type}."

    return answer

# Interactive loop for user input
while True:
    question = input("Enter your question in the format 'Case Type:XXXX, Column Name:?' (or type 'quit' to exit): ")
    if question.lower() == "quit":
        break

    answer = answer_question(question)
    print(answer)


Enter your question in the format 'Case Type:XXXX, Column Name:?' (or type 'quit' to exit): Case Type:ORDINARY ORIGINAL CIVIL JURISDICTION, case_number:?
The case_number of Case Number 3875.0 is 3875.0.
The case_number of Case Number 38788.0 is 38788.0.
The case_number of Case Number 3933.0 is 3933.0.
The case_number of Case Number 29988.0 is 29988.0.
The case_number of Case Number 33102.0 is 33102.0.
The case_number of Case Number 30185.0 is 30185.0.
The case_number of Case Number 4575.0 is 4575.0.
The case_number of Case Number 37804.0 is 37804.0.
The case_number of Case Number 264.0 is 264.0.
The case_number of Case Number 222.0 is 222.0.
The case_number of Case Number 18362.0 is 18362.0.
The case_number of Case Number 18362.0 is 18362.0.
The case_number of Case Number 18362.0 is 18362.0.
The case_number of Case Number 18362.0 is 18362.0.
The case_number of Case Number 479.0 is 479.0.
The case_number of Case Number 409.0 is 409.0.
The case_number of Case Number 409.0 is 409.0.
The c

KeyboardInterrupt: Interrupted by user