<a href="https://colab.research.google.com/github/omarShiraz/chatbotLawyer/blob/main/Domain_Specific_Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Questgen.ai fork directly from GitHub (verbose pip output).
# Presumably this is what provides the `Questgen` package imported further down — TODO confirm.
!pip install --upgrade --verbose git+https://github.com/omarShiraz/Questgen.ai.git

In [None]:
!pip install fitz
!pip install PyMuPDF
!pip install transformers
!pip install --upgrade numpy
!pip install spaCy==2.3.3
!pip install --quiet git+https://github.com/boudinfl/pke.git
!python -m nltk.downloader universal_tagset
!python -m spacy download en_core_web_sm

## **Restart runtime before continuing**

In [None]:
# Download the sense2vec "s2v_reddit_2015_md" vector archive from the sense2vec releases page.
# NOTE(review): presumably required by Questgen at runtime — confirm against the Questgen docs.
!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
# Unpack the archive into the current working directory.
!tar -xvf  s2v_reddit_2015_md.tar.gz
# List the `s2v_old` directory as a quick check that extraction produced it.
!ls s2v_old

In [None]:
import nltk
from pprint import pprint
from Questgen import main
# Create a single Questgen question generator; the dataset-creation cell below
# calls `qg.predict_shortq(...)` on it once per text chunk.
qg = main.QGen()

In [None]:
import fitz  # PyMuPDF
import pandas as pd
import zipfile
import os

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    doc = fitz.open(pdf_file)
    try:
        # Join once instead of repeated `+=` so large documents do not build
        # a new intermediate string per page.
        return "".join(page.get_text() for page in doc)
    finally:
        # Always release the file handle, even if a page fails to parse;
        # this function is called once per PDF in a loop.
        doc.close()

# Function to chunk text into pieces of a specified size
def chunk_text(text, chunk_size=4000):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

    The final slice holds whatever remains and may be shorter than
    ``chunk_size``. An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

# Function to preprocess text (you can customize this)
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is dropped as a side effect of splitting
    on whitespace and re-joining the resulting words.
    """
    words = text.split()
    return ' '.join(words)

# Path to the zip file containing multiple PDFs
zip_file_path = '/content/LawData1.zip'

# Size of each piece handed to Questgen. chunk_text slices by *characters*, not
# tokens; the notebook's original note says Questgen accepts at most 512 tokens
# per input, and a 512-character chunk should stay under that limit.
CHUNK_SIZE = 512

# Column layout of the exported CSV. 'output' is kept so the file has the same
# columns as before, but nothing below fills it.
# NOTE(review): 'output' is always empty — if it was meant to hold the generated
# answer, populate it from the Questgen item (verify the key name first).
CSV_COLUMNS = ['output', 'instruction', 'input']

# Extract PDFs from the zip file into the current working directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall()

# List all PDF files in the current working directory. Sorted because
# os.listdir() returns entries in arbitrary order, which would make the row
# order of the dataset differ from run to run.
pdf_files = sorted(file for file in os.listdir() if file.lower().endswith('.pdf'))

# Rows are gathered in a plain list and converted to a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and growing a frame row by
# row is quadratic.
records = []

# Loop through each PDF file
for pdf_file_path in pdf_files:
    # Extract and normalise the text, then split it into Questgen-sized pieces
    cleaned_text = preprocess_text(extract_text_from_pdf(pdf_file_path))
    text_chunks = chunk_text(cleaned_text, chunk_size=CHUNK_SIZE)

    # Loop through chunks and make predictions
    for chunk in text_chunks:
        try:
            outputs = qg.predict_shortq({"input_text": chunk})
        except RuntimeError as e:
            # Best effort: a chunk the model cannot handle is reported and skipped
            print(f"Error processing chunk: {e}")
            continue

        # Skip results that do not carry a 'questions' list
        if 'questions' not in outputs:
            print("Unexpected structure in the 'outputs' dictionary. Check the structure and update the code.")
            continue

        # Iterate through the extracted questions and contexts
        for item in outputs['questions']:
            question = item.get('Question', '')
            context = item.get('context', '')

            # Keep only rows where both the question and its context are non-empty
            if question and context:
                records.append({
                    "input": question,
                    # The context is what ends up in the instruction field
                    "instruction": context,
                })

# Build the DataFrame in one step; columns missing from the records ('output')
# are created empty.
results_df = pd.DataFrame(records, columns=CSV_COLUMNS)

# Save the DataFrame to a CSV file
results_df.to_csv('LawDataset4.csv', index=False)

# Print the generated DataFrame
print(results_df)


In [None]:
# datasets: used below to load the dataset (from the Hugging Face Hub or a local CSV) and push it back.
# transformers: provides AutoTokenizer, which converts raw text into tokens.
# sentence_transformers: used to download the embedding model.
# faiss-gpu: also installed here. NOTE(review): not used anywhere in the visible notebook — confirm it is needed.
!pip install -q datasets transformers sentence_transformers faiss-gpu

# **Set the Hugging Face Token**

In [None]:
import os
from getpass import getpass

# Never commit an access token to the notebook: anyone who can read the file
# can use it. The token that used to be hard-coded here has been published and
# should be revoked in the Hugging Face account settings.
# Reuse HF_TOKEN if the environment already provides it; otherwise ask for it
# interactively (the input is not echoed and is not saved in the notebook).
os.environ["HF_TOKEN"] = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# **Load the Dataset**

In [None]:
from datasets import load_dataset
# Load the "zoom12/SriLankaLaw" dataset by its repository id.
# NOTE(review): the "Load CSV File" cell that follows rebinds `dataset`, so running
# both cells in order discards this result — run whichever source you actually want.
dataset = load_dataset("zoom12/SriLankaLaw")
# Bare expression so the notebook displays the loaded object.
dataset

# **Load CSV File**

In [None]:
from datasets import load_dataset
# Load the generated CSV with the "csv" loader. This rebinds `dataset`, replacing
# whatever the previous cell loaded.
dataset = load_dataset("csv", data_files="/content/LawDataset4.csv") # Change the path if the CSV was saved under a different name
# Bare expression so the notebook displays the loaded object.
dataset

# **Map Chat Templates**

In [None]:
def chat_template(example):
    """Wrap ``example["instruction"]`` in the instruction/response prompt template.

    The field is rewritten in place to
    ``"### Instruction:\\n<instruction>\\n\\n### Response:\\n"`` and the same
    mapping is returned, so the function can be passed to ``dataset.map``.

    The wrapping is idempotent: an instruction that already starts with the
    instruction header and ends with the response header is left unchanged.
    This matters because re-running the map cell, or mapping a dataset that was
    already pushed with the template applied, would otherwise nest the template
    inside itself.

    Args:
        example: Mapping with an ``"instruction"`` entry.

    Returns:
        The same ``example`` object with ``"instruction"`` templated.
    """
    header = "### Instruction:\n"
    footer = "\n\n### Response:\n"

    # str() mirrors what f-string interpolation did for non-string values
    # (e.g. a missing CSV cell loaded as None).
    instruction = str(example["instruction"])
    already_wrapped = instruction.startswith(header) and instruction.endswith(footer)
    if not already_wrapped:
        instruction = f"{header}{instruction}{footer}"

    example["instruction"] = instruction
    return example

# Apply chat_template to every example and rebind `dataset` to the result.
# NOTE(review): this cell overwrites its own input, so re-running it maps the
# already-mapped dataset again.
dataset = dataset.map(chat_template)

# **Push the Dataset to Hugging Face Hub**

In [None]:
import os
from getpass import getpass

# Never commit an access token to the notebook: anyone who can read the file
# can push to the account. The token that used to be hard-coded here has been
# published and should be revoked in the Hugging Face account settings.
# Pushing needs a token with write access, so HF_TOKEN is replaced here: take
# it from HF_WRITE_TOKEN if the environment provides one, otherwise prompt
# (the input is not echoed and is not saved in the notebook).
os.environ["HF_TOKEN"] = os.environ.get("HF_WRITE_TOKEN") or getpass("Hugging Face write token: ")

In [None]:
# Upload `dataset` to the "zoom12/SriLankaLaw" repository on the Hugging Face Hub.
# NOTE(review): this is the same repository id the "Load the Dataset" cell reads from,
# so the push presumably replaces the published data — confirm before running.
dataset.push_to_hub("zoom12/SriLankaLaw")