In [1]:
!pip install -q gradio transformers torch pypdf2 sentencepiece protobuf

In [2]:
# Imports for PDF parsing, model inference, and the Gradio UI.
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import PyPDF2
import torch
import gradio as gr
import os
import tempfile
import io

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Both models are requested in half precision to reduce their memory footprint.
def load_models():
    """Load the Mistral instruct model and the NLLB translation model.

    Both models are requested in float16 with ``device_map="auto"``.

    Returns:
        tuple: ``(tokenizer, model, tokenizer_ts, model_ts)`` - the Mistral
        tokenizer and causal LM, followed by the translation tokenizer and
        seq2seq model.
    """
    half_precision_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    print("Loading Mistral model...")
    mistral_id = "mistralai/Mistral-7B-Instruct-v0.2"
    mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_id, use_fast=True)
    mistral_model = AutoModelForCausalLM.from_pretrained(mistral_id, **half_precision_kwargs)

    print("Loading translation model...")
    nllb_id = "facebook/nllb-200-distilled-600M"
    nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_id)
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_id, **half_precision_kwargs)

    return mistral_tokenizer, mistral_model, nllb_tokenizer, nllb_model

# Load both models once at notebook scope. The names bound here are the globals
# read by generate_questions / validate_question (tokenizer, model) and by
# translate_questions (tokenizer_ts, model_ts).
tokenizer, model, tokenizer_ts, model_ts = load_models()


Using device: cuda
Loading Mistral model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading translation model...


In [6]:

def read_pdf_segments(file_path):
    """Extract the text of a PDF, one segment per non-empty page.

    Args:
        file_path: Whatever ``PyPDF2.PdfReader`` accepts as its source
            (the callers in this notebook pass the Gradio upload value).

    Returns:
        list[str]: The extracted text of every page that yielded non-blank
        text, in page order. If the PDF cannot be opened, or no page yields
        text and at least one page failed, the list holds a single
        ``"Error reading PDF: ..."`` message, which callers treat as a
        failure marker.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(file_path)
        pages = list(pdf_reader.pages)
    except Exception as e:
        print(f"Error in read_pdf_segments: {str(e)}")  # Debug print
        return [f"Error reading PDF: {str(e)}"]

    segments = []
    first_page_error = None
    for page_number, page in enumerate(pages, start=1):
        # Extract page by page so one unreadable page does not throw away
        # the text already collected from the others.
        try:
            segment = page.extract_text() or ""
        except Exception as e:
            print(f"Error in read_pdf_segments (page {page_number}): {str(e)}")  # Debug print
            if first_page_error is None:
                first_page_error = e
            continue
        if segment.strip():  # Only keep pages that contain visible text
            segments.append(segment)

    if not segments and first_page_error is not None:
        # Nothing usable was recovered: report the failure in the same
        # format as an unreadable file.
        return [f"Error reading PDF: {str(first_page_error)}"]
    return segments

# Question generation with the Mistral chat model.
def _parse_generated_question(reply):
    """Reduce a raw model reply to one question, or None when the model declined."""
    reply = reply.strip()
    if not reply:
        return None

    # The prompt asks for the single word NO when no question is possible.
    # Match it as a word ("NO", "NO.", "NO, because ...", or a bare "No.")
    # rather than as a substring, so questions containing e.g. "ECONOMY" survive.
    punctuation = '.,;:!"\'`'
    first_word = reply.split(maxsplit=1)[0].strip(punctuation)
    if first_word == "NO" or reply.strip(punctuation + " ").lower() == "no":
        return None

    # Keep only the first question; drop any explanation that follows it.
    question_mark_index = reply.find('?')
    if question_mark_index != -1:
        reply = reply[:question_mark_index + 1]
    return reply


def generate_questions(theme, segments, max_segments=3, max_questions=3) -> list:
    """Generate English questions about ``theme`` from the leading text segments.

    Args:
        theme: Theme the questions should be about.
        segments: Text segments (one per PDF page in this notebook).
        max_segments: Only the first ``max_segments`` segments are used.
        max_questions: Stop once this many questions have been collected.

    Returns:
        list: The generated questions, or a single explanatory message when
        no question could be produced.
    """
    # Few-shot preamble shared by every segment; only the final user turn changes.
    preamble = [
        {
            "role": "user", "content":'''I will provide you with some text and a theme, your job is to identify the language of the provided text and theme, then generate a question based upon the theme in English. 
                                        Make sure the question you generate is answerable only by the provided text.
                                        Please make no assumptions, and answer in English only.
                                        In the case where no such question is possible, answer with the word NO only.''',
        },
        {
            "role": "assistant", "content":"Sure, please provide me with the text and the theme."
        },
        {
            "role": "user", "content":"The text is demarkated by triple backticks:```होरी एक गरीब किसान था, जो मुश्किल से गुजारा कर पा रहा था। अपनी कठिनाइयों के बावजूद, वह एक गाय खरीदने का सपना देखता था, जो गाँव में समृद्धि का प्रतीक थी।```. The theme is demarkated by double backticks:``होरी की गाय खरीदने की इच्छा``.",
        },
        {
            "role":"assistant","content": "Why did Hori want a cow?"
        },
    ]

    questions = []
    for segment in segments[:max_segments]:
        messages = preamble + [
            {
                "role":"user","content": f'''The text is demarkated by triple backticks:```{segment}```. 
                                        The theme is demarkated by double backticks:``{theme}``.''',
            }
        ]

        try:
            with torch.inference_mode():
                encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
                model_inputs = encodeds.to(device)

                generated_ids = model.generate(
                    model_inputs,
                    max_new_tokens=1000,
                    do_sample=True,
                    temperature=0.7
                )
                # generate() returns prompt + continuation; decode only the
                # continuation and drop special tokens such as the end marker.
                new_token_ids = generated_ids[:, model_inputs.shape[-1]:]
                reply = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
        except RuntimeError as e:
            print(f"Error during generation: {e}")
            continue

        question = _parse_generated_question(reply)
        if question is not None:
            questions.append(question)
        if len(questions) >= max_questions:
            break

    return questions if questions else ["No questions could be generated. Try with a different theme or text."]
def validate_question(question, segments):
    """Ask the chat model whether ``question`` is relevant to any text segment.

    Segments are checked in order and the first reply that contains none of
    the rejection phrases is returned. This is a keyword heuristic on the
    model's free-text reply, not a structured verdict.

    Args:
        question: The question to check.
        segments: Text segments (one per PDF page in this notebook).

    Returns:
        str: The model's explanation for the first segment it judged
        relevant, or a fixed "not relevant" message when none qualified.
    """
    # Phrases that mark a reply as a rejection (matched case-insensitively).
    rejection_markers = (
        'incoherent', 'irrelevant', 'not relevant', 'not coherent',
        'does not make sense', "doesn't make sense", 'doesnt make sense',
        'not related', 'is not directly relevant',
    )

    # Instruction turns shared by every segment; only the final user turn changes.
    preamble = [
        {
            "role": "user",
            "content": '''I will provide you with a question, and some text. 
                Your job is to tell me if the question that I provided is relevant to the given text or is incoherent. 
                Also if you find it relevant can u give the part of the text where you find its resemblance''',
        },
        {
            "role": "assistant",
            "content": "Sure, please provide me with the text and the question."
        },
    ]

    for segment in segments:
        messages = preamble + [
            {
                "role": "user",
                "content": f'''The text is demarkated by triple backticks:```{segment}```.
                The question is demarkated by double backticks:``{question}``.''',
            }
        ]

        try:
            with torch.inference_mode():
                encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
                model_inputs = encodeds.to(device)

                generated_ids = model.generate(
                    model_inputs,
                    max_new_tokens=1000,
                    do_sample=True,
                    temperature=0.7
                )
                # generate() returns prompt + continuation; decode only the
                # continuation and drop special tokens such as the end marker.
                new_token_ids = generated_ids[:, model_inputs.shape[-1]:]
                reply = tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]
        except RuntimeError as e:
            print(f"Error during validation: {e}")
            continue

        verdict = reply.strip()
        # Normalize typographic apostrophes so "doesn’t" matches "doesn't".
        lowered = verdict.lower().replace("\u2019", "'")
        if verdict and not any(marker in lowered for marker in rejection_markers):
            return verdict

    return "Question is not relevant to the provided text."

def translate_questions(questions, target_lang):
    """Translate English questions into ``target_lang`` with the translation model.

    Args:
        questions: Iterable of English question strings.
        target_lang: Language-code token passed to the tokenizer's
            ``convert_tokens_to_ids`` and forced as the first generated token.

    Returns:
        list: One entry per input question, in order - the translation, or a
        ``"Translation error: ..."`` message if that question failed.
    """
    tokenizer_ts.src_lang = "eng_Latn"

    def _translate_one(text):
        # Tokenize, move the tensors to the active device, generate, decode.
        with torch.inference_mode():
            encoded = tokenizer_ts(text, return_tensors="pt")
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            target_token_id = tokenizer_ts.convert_tokens_to_ids(target_lang)
            output_ids = model_ts.generate(
                **encoded,
                forced_bos_token_id=target_token_id,
                max_new_tokens=50
            )
            return tokenizer_ts.batch_decode(output_ids, skip_special_tokens=True)[0]

    results = []
    for text in questions:
        try:
            outcome = _translate_one(text)
        except Exception as e:
            # Best effort: report the failure in place and keep going.
            outcome = f"Translation error: {str(e)}"
        results.append(outcome)

    return results

# Gradio callback for the "Question Generator" tab.
def process_book_generator(file_obj, theme):
    """Generate questions about ``theme`` from an uploaded PDF.

    The target language is chosen from keywords in the uploaded file's name:
    "godan" or "autobiography" translate the questions to ``hin_Deva``,
    "narayan-kwach" to ``san_Deva``; any other name leaves them in English.

    Args:
        file_obj: The uploaded file - a path string, a path-like object, or an
            object exposing the path as ``.name``. ``None`` means no upload.
        theme: Theme the questions should be about.

    Returns:
        str: The questions separated by blank lines, or a user-facing message
        describing why none could be produced.
    """
    if file_obj is None:
        return "Please upload a PDF file."

    if not theme or not str(theme).strip():
        return "Please enter a theme."

    try:
        # Accept plain paths as well as upload objects that carry the path in `.name`.
        if isinstance(file_obj, (str, os.PathLike)):
            file_name = os.fspath(file_obj)
        else:
            file_name = str(getattr(file_obj, 'name', None) or "uploaded.pdf")

        segments = read_pdf_segments(file_obj)

        # read_pdf_segments signals failure with a single "Error reading PDF: ..."
        # entry; match that exact prefix so real text starting with "Error" is kept.
        if not segments or (len(segments) == 1 and segments[0].startswith("Error reading PDF:")):
            return "No text could be extracted from the PDF."

        questions = generate_questions(theme, segments)
        if not questions:
            return "No questions could be generated."
        if len(questions) == 1 and questions[0].startswith("No questions could be generated"):
            # This is generate_questions' fallback message, not a question:
            # show it as-is instead of sending it through translation.
            return questions[0]

        # Match keywords against the file name only, not its directory.
        filename = os.path.basename(file_name).lower()
        if "godan" in filename or "autobiography" in filename:
            questions = translate_questions(questions, "hin_Deva")
        elif "narayan-kwach" in filename:
            questions = translate_questions(questions, "san_Deva")

        return "\n\n".join(questions)

    except Exception as e:
        print(f"Error in process_book_generator: {str(e)}")
        return f"Error processing file: {str(e)}"

def process_book_validator(file_obj, question):
    """Check whether ``question`` is relevant to the text of an uploaded PDF.

    Args:
        file_obj: The uploaded file, passed through to ``read_pdf_segments``.
            ``None`` means no upload.
        question: The question to validate.

    Returns:
        str: The result of ``validate_question``, or a user-facing message
        when the input is missing or the PDF yields no text.
    """
    if file_obj is None:
        return "Please upload a PDF file."

    # Treat a whitespace-only question the same as an empty one.
    if not question or not str(question).strip():
        return "Please enter a question to validate."

    try:
        segments = read_pdf_segments(file_obj)

        # read_pdf_segments signals failure with a single "Error reading PDF: ..."
        # entry; match that exact prefix so real text starting with "Error" is kept.
        if not segments or (len(segments) == 1 and segments[0].startswith("Error reading PDF:")):
            return "No text could be extracted from the PDF."

        return validate_question(question, segments)

    except Exception as e:
        print(f"Error in process_book_validator: {str(e)}")
        return f"Error processing file: {str(e)}"

def create_interfaces():
    """Build the two-tab Gradio app: a question generator and a question validator.

    Returns:
        The ``gr.TabbedInterface`` that combines both tabs.
    """

    def _pdf_tab(handler, prompt_label, result_label, heading, blurb):
        # Both tabs share one layout: a PDF upload plus a text prompt in, a text box out.
        return gr.Interface(
            fn=handler,
            inputs=[
                gr.File(label="Upload PDF", file_types=[".pdf"]),
                gr.Textbox(label=prompt_label),
            ],
            outputs=gr.Textbox(label=result_label, lines=5),
            title=heading,
            description=blurb,
        )

    generator_tab = _pdf_tab(
        process_book_generator,
        "Enter theme",
        "Generated Questions",
        "Book Question Generator",
        "Upload a PDF and specify a theme to generate relevant questions. The system will automatically detect the language based on the filename and translate questions if necessary.",
    )
    validator_tab = _pdf_tab(
        process_book_validator,
        "Enter question to validate",
        "Validation Result",
        "Question Validator",
        "Upload a PDF and enter a question to check if it's relevant to the text content.",
    )

    tabs = [generator_tab, validator_tab]
    tab_titles = ["Question Generator", "Question Validator"]
    return gr.TabbedInterface(tabs, tab_titles)

# Entry point: build the tabbed UI and serve it.
if __name__ == "__main__":
    # Ask PyTorch to release its cached CUDA memory before serving, when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    # `iface` stays bound at notebook scope after launch.
    iface = create_interfaces()
    # share=True additionally requests a public Gradio share link (see the
    # "public URL" line in the cell output) - anyone with the link can use the app.
    iface.launch(share=True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://1b10f83c47016a49f2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
