<a href="https://colab.research.google.com/github/ms7039/Legal-Advice-Chatbot/blob/main/Mini_Project_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code the Kaggle username / API key in a notebook that is
# committed to version control. Any key that was previously committed should be
# revoked from the Kaggle account page and regenerated.
#
# Credentials are taken, in order of preference, from:
#   1. an existing ~/.kaggle/kaggle.json file,
#   2. the KAGGLE_USERNAME / KAGGLE_KEY environment variables,
#   3. an interactive prompt (the key is read without echoing it).
_kaggle_json = os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json')
if not os.path.exists(_kaggle_json):
    if not os.environ.get('KAGGLE_USERNAME'):
        os.environ['KAGGLE_USERNAME'] = input("Kaggle username: ").strip()
    if not os.environ.get('KAGGLE_KEY'):
        os.environ['KAGGLE_KEY'] = getpass("Kaggle API key: ").strip()

# Then initialize the KaggleApi
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [None]:
import re
import random
import nltk
import spacy
import numpy as np
import json
import os
import time
import zipfile
import io
from datetime import datetime, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

# Fetch the NLTK data packages (tokenizer models and WordNet corpora)
for _nltk_resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

class LegalAdvisorChatbot:
    def __init__(self, cache_dir="./legal_cache"):
        """
        Initialize the Legal Advisor Chatbot with Kaggle API integration

        Kaggle credentials are read from the KAGGLE_USERNAME and KAGGLE_KEY
        environment variables rather than being embedded in the source, so
        the API key is never committed together with the code.

        Args:
            cache_dir (str): Directory to store cached data and downloads
        """
        # Load NLP models
        self.nlp = spacy.load('en_core_web_sm')

        # Kaggle API Configuration (empty string when a variable is unset)
        self.kaggle_username = os.environ.get("KAGGLE_USERNAME", "")
        self.kaggle_key = os.environ.get("KAGGLE_KEY", "")

        # Legal datasets to use from Kaggle
        self.legal_datasets = ["akshatgupta7/llm-fine-tuning-dataset-of-indian-legal-texts"]

        # Cache configuration
        self.cache_dir = cache_dir
        self.cache_expiry = 30  # Cache expiry in days
        self.ensure_cache_dir()

        # Local fallback knowledge base
        self.local_knowledge_path = os.path.join(cache_dir, "local_knowledge_base.json")
        self.load_or_create_local_knowledge()
        if not self.local_knowledge:
            # A knowledge file containing no entries would leave the TF-IDF
            # vectorizer below with nothing to fit on; start from the defaults.
            self.create_default_knowledge_base()

        # Initialize Kaggle API
        self.setup_kaggle_api()

        # Cache frequently asked questions for better performance
        self.cached_questions = list(self.local_knowledge.keys())

        # Initialize the vectorizer before processing datasets, because
        # process_datasets() re-fits this same vectorizer instance.
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform(self.cached_questions)

        # Download and process datasets if needed
        self.dataset_path = os.path.join(cache_dir, "datasets")
        self.ensure_datasets()

        # Disclaimer message appended to every response
        self.disclaimer = (
            "IMPORTANT: This is an AI legal advisor and does not constitute professional legal advice. "
            "Always consult with a qualified legal professional for specific legal guidance."
        )

    def setup_kaggle_api(self):
        """
        Set up Kaggle API authentication

        Writes ``~/.kaggle/kaggle.json`` from ``self.kaggle_username`` and
        ``self.kaggle_key`` when that file does not exist yet and both values
        are non-empty, then creates and authenticates ``self.kaggle_api``.
        An existing credentials file is never overwritten.
        """
        kaggle_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_dir, exist_ok=True)

        kaggle_cred_path = os.path.join(kaggle_dir, 'kaggle.json')
        have_credentials = bool(self.kaggle_username and self.kaggle_key)
        if have_credentials and not os.path.exists(kaggle_cred_path):
            credentials = {
                "username": self.kaggle_username,
                "key": self.kaggle_key
            }
            # Create the file with owner-only permissions from the start, so
            # the API key is never readable by other users, not even during
            # the window between writing the file and a later chmod.
            fd = os.open(kaggle_cred_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, 'w') as f:
                json.dump(credentials, f)
            os.chmod(kaggle_cred_path, 0o600)  # Set appropriate permissions

        # Initialize the API
        self.kaggle_api = KaggleApi()
        self.kaggle_api.authenticate()

    def ensure_cache_dir(self):
        """Create cache directory if it doesn't exist"""
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        if not os.path.exists(os.path.join(self.cache_dir, "datasets")):
            os.makedirs(os.path.join(self.cache_dir, "datasets"))

    def ensure_datasets(self):
        """Download and extract datasets if not already available"""
        # Check if datasets are already downloaded
        dataset_index_path = os.path.join(self.cache_dir, "dataset_index.json")
        if os.path.exists(dataset_index_path):
            # Check if index is expired
            file_time = os.path.getmtime(dataset_index_path)
            file_datetime = datetime.fromtimestamp(file_time)
            expiry_date = datetime.now() - timedelta(days=self.cache_expiry)
            if file_datetime > expiry_date:
                print("Using cached datasets")
                return

        print("Downloading legal datasets from Kaggle...")
        dataset_index = {}

        for dataset in self.legal_datasets:
            try:
                # Download the dataset
                dataset_path = os.path.join(self.dataset_path, dataset.split('/')[1])
                if not os.path.exists(dataset_path):
                    os.makedirs(dataset_path)

                print(f"Downloading {dataset}...")
                self.kaggle_api.dataset_download_files(
                    dataset,
                    path=dataset_path,
                    unzip=True
                )

                # Index the dataset contents
                dataset_files = []
                for root, _, files in os.walk(dataset_path):
                    for file in files:
                        if file.endswith('.csv') or file.endswith('.json'):
                            rel_path = os.path.relpath(os.path.join(root, file), dataset_path)
                            dataset_files.append(rel_path)

                dataset_index[dataset] = {
                    "path": dataset_path,
                    "files": dataset_files
                }

                print(f"Downloaded {dataset} successfully")
            except Exception as e:
                print(f"Error downloading {dataset}: {e}")

        # Save the dataset index
        with open(dataset_index_path, 'w') as f:
            json.dump(dataset_index, f, indent=2)

        # Process datasets to extract knowledge
        self.process_datasets(dataset_index)

    def process_datasets(self, dataset_index):
        """Process downloaded datasets and extract legal knowledge"""
        print("Processing datasets to extract legal knowledge...")
        extracted_knowledge = {}

        for dataset_name, dataset_info in dataset_index.items():
            dataset_path = dataset_info["path"]

            for file_path in dataset_info["files"]:
                full_path = os.path.join(dataset_path, file_path)
                try:
                    if file_path.endswith('.csv'):
                        # Process CSV files
                        df = pd.read_csv(full_path)

                        # Look for columns that might contain questions and answers
                        if 'question' in df.columns and 'answer' in df.columns:
                            for _, row in df.iterrows():
                                extracted_knowledge[row['question']] = row['answer']

                        # If legal articles dataset
                        elif 'title' in df.columns and 'content' in df.columns:
                            for _, row in df.iterrows():
                                title = row['title']
                                # Use title as a question
                                question = f"What is {title}?" if not title.endswith('?') else title
                                extracted_knowledge[question] = row['content'][:1000]  # Limit length

                    elif file_path.endswith('.json'):
                        # Process JSON files
                        with open(full_path, 'r', encoding='utf-8') as f:
                            try:
                                data = json.load(f)

                                # Process different JSON structures
                                if isinstance(data, dict):
                                    for key, value in data.items():
                                        if isinstance(value, str) and len(value) > 50:
                                            question = f"What is {key}?" if not key.endswith('?') else key
                                            extracted_knowledge[question] = value[:1000]  # Limit length

                                elif isinstance(data, list):
                                    for item in data:
                                        if isinstance(item, dict):
                                            # Try to find question-answer pairs
                                            q = item.get('question', item.get('title', ''))
                                            a = item.get('answer', item.get('content', item.get('description', '')))

                                            if q and a and len(a) > 50:
                                                question = f"What is {q}?" if not q.endswith('?') else q
                                                extracted_knowledge[question] = a[:1000]  # Limit length

                            except json.JSONDecodeError:
                                print(f"Error decoding JSON file: {full_path}")

                except Exception as e:
                    print(f"Error processing file {full_path}: {e}")

        # Merge with existing knowledge
        self.local_knowledge.update(extracted_knowledge)

        # Save updated knowledge base
        with open(self.local_knowledge_path, 'w', encoding='utf-8') as f:
            json.dump(self.local_knowledge, f, indent=2)

        # Update cached questions and TF-IDF matrix
        self.cached_questions = list(self.local_knowledge.keys())
        self.tfidf_matrix = self.vectorizer.fit_transform(self.cached_questions)

        print(f"Extracted {len(extracted_knowledge)} new legal knowledge items")

    def load_or_create_local_knowledge(self):
        """Load local knowledge base or create if it doesn't exist"""
        if os.path.exists(self.local_knowledge_path):
            try:
                with open(self.local_knowledge_path, 'r', encoding='utf-8') as f:
                    self.local_knowledge = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print("Error reading local knowledge base. Creating new one.")
                self.create_default_knowledge_base()
        else:
            self.create_default_knowledge_base()

    def create_default_knowledge_base(self):
        """Create a default knowledge base with basic legal information"""
        self.local_knowledge = {
            "What is copyright?": "Copyright is a legal protection for original creative works, giving creators exclusive rights to use and distribute their work for a limited time.",
            "How do I file for a trademark?": "To file for a trademark, you must submit an application to the USPTO, including proof of use and a detailed description of your mark.",
            "What is fair use in copyright law?": "Fair use allows limited use of copyrighted material without permission for purposes like criticism, commentary, education, and research.",
            "What constitutes patent infringement?": "Patent infringement occurs when someone makes, uses, sells, or imports a patented item or process without permission from the patent holder.",
            "How long does copyright protection last?": "In the US, copyright generally lasts for the author's lifetime plus 70 years for works created after January 1, 1978.",
            "What is the process for filing a civil lawsuit?": "The process typically involves: 1) Filing a complaint, 2) Serving the defendant, 3) Defendant's response, 4) Discovery phase, 5) Pre-trial motions, 6) Trial, and 7) Potential appeals.",
            "What are the requirements for a valid contract?": "A valid contract requires: 1) Offer, 2) Acceptance, 3) Consideration (something of value exchanged), 4) Legal capacity of parties, 5) Legal purpose, and 6) Mutual agreement.",
            "What is considered employment discrimination?": "Employment discrimination involves treating job applicants or employees unfavorably because of protected characteristics like race, color, religion, sex, national origin, age, disability, or genetic information.",
            "What is the difference between a felony and misdemeanor?": "Felonies are more serious crimes punishable by imprisonment of more than one year or death. Misdemeanors are less serious offenses typically punishable by less than a year in jail.",
            "How does bankruptcy protection work?": "Bankruptcy protection allows individuals or businesses to eliminate or repay debts under court protection. Common types include Chapter 7 (liquidation), Chapter 11 (reorganization), and Chapter 13 (adjustment of debts)."
        }

        # Save the default knowledge base
        with open(self.local_knowledge_path, 'w', encoding='utf-8') as f:
            json.dump(self.local_knowledge, f, indent=2)

    def preprocess_text(self, text):
        """
        Preprocess input text using spaCy
        Args:
            text (str): Input text to preprocess
        Returns:
            str: Preprocessed text
        """
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation and special characters
        text = re.sub(r'[^\w\s]', '', text)

        # Lemmatization and stopword removal
        doc = self.nlp(text)
        processed_tokens = [token.lemma_ for token in doc
                           if not token.is_stop and token.is_alpha]

        return ' '.join(processed_tokens)

    def find_best_match(self, user_query):
        """
        Pick the cached question most similar to the query by TF-IDF cosine similarity
        Args:
            user_query (str): User's input query
        Returns:
            tuple: (best matching cached question, its similarity score)
        """
        # Vectorize the preprocessed query with the already-fitted vectorizer
        query_vector = self.vectorizer.transform([self.preprocess_text(user_query)])

        # One similarity score per cached question
        scores = cosine_similarity(query_vector, self.tfidf_matrix)[0]

        top = np.argmax(scores)
        return (self.cached_questions[top], scores[top])

    def search_kaggle_datasets(self, query):
        """
        Search Kaggle datasets for additional information about the query
        Args:
            query (str): Search query
        Returns:
            list: Results from Kaggle datasets
        """
        # Get hash of query for cache filename
        query_hash = str(hash(query) % 10000000)
        cache_file = os.path.join(self.cache_dir, f"kaggle_search_{query_hash}.json")

        # Check if we have a valid cached response
        if os.path.exists(cache_file):
            file_time = os.path.getmtime(cache_file)
            file_datetime = datetime.fromtimestamp(file_time)
            expiry_date = datetime.now() - timedelta(days=self.cache_expiry)

            if file_datetime > expiry_date:
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        print("Using cached Kaggle search results")
                        return json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    print("Error reading cache file")

        print(f"Searching Kaggle datasets for: {query}")
        results = []

        # Search across all downloaded datasets
        dataset_index_path = os.path.join(self.cache_dir, "dataset_index.json")
        if os.path.exists(dataset_index_path):
            with open(dataset_index_path, 'r') as f:
                dataset_index = json.load(f)

            # Extract key terms from query
            doc = self.nlp(query)
            search_terms = [token.text.lower() for token in doc if not token.is_stop and token.is_alpha]

            for dataset_name, dataset_info in dataset_index.items():
                dataset_path = dataset_info["path"]

                for file_path in dataset_info["files"]:
                    full_path = os.path.join(dataset_path, file_path)

                    try:
                        if file_path.endswith('.csv'):
                            # Search in CSV files
                            df = pd.read_csv(full_path)

                            # Convert all columns to string for searching
                            text_df = df.select_dtypes(include=['object']).astype(str)

                            # Search each column for matching terms
                            for col in text_df.columns:
                                for term in search_terms:
                                    matches = text_df[text_df[col].str.contains(term, case=False, na=False)]

                                    if not matches.empty:
                                        for _, row in matches.iterrows():
                                            result = {
                                                "dataset": dataset_name,
                                                "file": file_path,
                                                "match": row.to_dict()
                                            }
                                            results.append(result)

                                            # Limit to prevent overwhelming results
                                            if len(results) >= 5:
                                                break

                        elif file_path.endswith('.json'):
                            # Search in JSON files
                            with open(full_path, 'r', encoding='utf-8') as f:
                                try:
                                    data = json.load(f)
                                    json_str = json.dumps(data, ensure_ascii=False).lower()

                                    # Check if any search term exists in the JSON
                                    for term in search_terms:
                                        if term.lower() in json_str:
                                            # Add match
                                            result = {
                                                "dataset": dataset_name,
                                                "file": file_path,
                                                "match": {"content": "JSON file contains relevant information"}
                                            }
                                            results.append(result)
                                            break

                                except json.JSONDecodeError:
                                    print(f"Error decoding JSON file: {full_path}")

                    except Exception as e:
                        print(f"Error searching file {full_path}: {e}")

                    # Limit results
                    if len(results) >= 5:
                        break

                if len(results) >= 5:
                    break

        # Cache the results
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        return results

    def format_kaggle_results(self, results):
        """Format Kaggle search results into readable text"""
        if not results:
            return "No additional information found in datasets."

        formatted = "Additional information from legal datasets:\n\n"

        for i, result in enumerate(results[:3], 1):  # Limit to top 3 results
            dataset = result.get("dataset", "").split("/")[-1]
            match = result.get("match", {})

            formatted += f"{i}. From {dataset} dataset:\n"

            # Format based on match content
            if isinstance(match, dict):
                # Extract the most relevant fields
                relevant_fields = {}
                for key, value in match.items():
                    if isinstance(value, str) and len(value) > 5:
                        if key.lower() in ['title', 'question', 'content', 'answer', 'summary', 'description']:
                            relevant_fields[key] = value

                # Format the relevant fields
                for key, value in relevant_fields.items():
                    if len(value) > 500:
                        value = value[:500] + "..."
                    formatted += f"   {key.capitalize()}: {value}\n"

            formatted += "\n"

        return formatted

    def update_local_knowledge(self, query, answer):
        """
        Update local knowledge base with new information
        Args:
            query (str): The query
            answer (str): The answer
        """
        # Add to local knowledge if not already present
        if query not in self.local_knowledge:
            self.local_knowledge[query] = answer

            # Update cached questions and TF-IDF matrix
            self.cached_questions = list(self.local_knowledge.keys())
            self.tfidf_matrix = self.vectorizer.fit_transform(self.cached_questions)

            # Save updated knowledge base
            with open(self.local_knowledge_path, 'w', encoding='utf-8') as f:
                json.dump(self.local_knowledge, f, indent=2)

    def get_response(self, user_query):
        """
        Generate a response to the user's legal query
        Args:
            user_query (str): User's input query
        Returns:
            str: Appropriate legal advice or information
        """
        # First check local knowledge base
        best_match, similarity_score = self.find_best_match(user_query)

        # If we have a good match in local knowledge
        if similarity_score > 0.3:
            local_answer = self.local_knowledge[best_match]

            # If exact match, return immediately
            if similarity_score > 0.8:
                return f"{local_answer}\n\n{self.disclaimer}"

            # If good but not exact match, search Kaggle too
            kaggle_results = self.search_kaggle_datasets(user_query)

            if kaggle_results:
                formatted_kaggle = self.format_kaggle_results(kaggle_results)

                if best_match != user_query:
                    base_response = f"Based on your query, I found this similar question:\n\n'{best_match}'\n\n{local_answer}\n\n"
                else:
                    base_response = f"{local_answer}\n\n"

                return f"{base_response}---\n{formatted_kaggle}\n{self.disclaimer}"
            else:
                if best_match != user_query:
                    return f"Based on your query, I found this similar question:\n\n'{best_match}'\n\n{local_answer}\n\n{self.disclaimer}"
                else:
                    return f"{local_answer}\n\n{self.disclaimer}"
        else:
            # No good match in local knowledge, search Kaggle datasets
            kaggle_results = self.search_kaggle_datasets(user_query)

            if kaggle_results:
                formatted_kaggle = self.format_kaggle_results(kaggle_results)

                # Extract a potential answer from the first result
                first_result = kaggle_results[0].get("match", {})
                potential_answer = ""

                for key in ["answer", "content", "description", "summary"]:
                    if key in first_result and isinstance(first_result[key], str):
                        potential_answer = first_result[key]
                        if len(potential_answer) > 1000:
                            potential_answer = potential_answer[:1000] + "..."
                        break

                if potential_answer:
                    # Add to local knowledge base for future queries
                    self.update_local_knowledge(user_query, potential_answer)
                    return f"{potential_answer}\n\n---\n{formatted_kaggle}\n{self.disclaimer}"
                else:
                    return f"I found some information that might help with your query:\n\n{formatted_kaggle}\n{self.disclaimer}"
            else:
                # No information found
                fallback_responses = [
                    "I'm sorry, but I couldn't find a precise match for your legal query in my knowledge base.",
                    "Your query requires specialized legal knowledge that isn't in my database.",
                    "I recommend consulting a legal professional for this specific issue, as I don't have enough information to provide guidance."
                ]

                return f"{random.choice(fallback_responses)}\n\n{self.disclaimer}"

    def interactive_chat(self):
        """
        Run an interactive chat session with the legal advisor
        """
        print("Legal Advisor Chatbot: Hello! I can help answer general legal questions.")
        print("I use Kaggle datasets and a local knowledge base.")
        print("Type 'exit' to end the conversation.")

        while True:
            user_input = input("\nYour question: ")

            if user_input.lower() == 'exit':
                print("Thank you for using the Legal Advisor Chatbot. Stay informed!")
                break

            response = self.get_response(user_input)
            print("\nLegal Advisor:", response)

    def refresh_datasets(self):
        """Force refresh of all datasets from Kaggle"""
        # Remove the dataset index to force re-download
        dataset_index_path = os.path.join(self.cache_dir, "dataset_index.json")
        if os.path.exists(dataset_index_path):
            os.remove(dataset_index_path)

        # Re-download and process datasets
        self.ensure_datasets()
        print("Datasets refreshed successfully")

# Main execution
if __name__ == "__main__":
    # Initialize the chatbot with Kaggle API integration.
    # Construction loads the spaCy model, sets up the cache directory and the
    # local knowledge base, authenticates with Kaggle and downloads/indexes
    # the datasets unless a fresh cached index exists.
    legal_chatbot = LegalAdvisorChatbot()

    # Uncomment to force refresh of datasets
    # legal_chatbot.refresh_datasets()

    # Blocking prompt loop; returns once the user types 'exit'.
    legal_chatbot.interactive_chat()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Using cached datasets
Legal Advisor Chatbot: Hello! I can help answer general legal questions.
I use Kaggle datasets and a local knowledge base.
Type 'exit' to end the conversation.

Your question: what is divorce
Searching Kaggle datasets for: what is divorce

Legal Advisor: Based on your query, I found this similar question:

'What happens if a woman remarries after her divorce?'

The magistrate cancels the previous order as from the date of her remarriage.

---
Additional information from legal datasets:

1. From llm-fine-tuning-dataset-of-indian-legal-texts dataset:
   Content: JSON file contains relevant information

2. From llm-fine-tuning-dataset-of-indian-legal-texts dataset:
   Content: JSON file contains relevant information


IMPORTANT: This is an AI legal advisor and does not constitute professional legal advice. Always consult with a qualified legal professional for specific legal guidance.

Your question: what is government
Searching Kaggle datasets for: what is governmen