## Notes: 
- Some parts of this notebook are not displayed properly on GitHub; please download it and view it locally to use it correctly.
- This notebook has been tested only in Google Colab. It should also work in other Jupyter environments, but you might run into issues due to differences in dependencies or setup. 

## install & import statements

In [None]:
!pip install vllm

In [18]:
from vllm import LLM, SamplingParams
import json, re, ast
from pprint import pprint
from typing import Dict, List, Any

## model initialization

In [None]:
## Checkpoint to load; "nis12ram/HindiNER-4B-v0.1" can be used instead.
model_name = "nis12ram/HindiNER-4B-v0.0"

## Build the vLLM engine once; the `inference` helper reads this module-level `llm`.
llm = LLM(
    model=model_name,
    dtype="half",
    max_model_len=4096,
    gpu_memory_utilization=0.9,
)

## prompt

In [4]:
## Instruction used as the user turn. It is filled in with str.format, so the
## doubled braces ({{ and }}) in the JSON skeleton come out as literal { and },
## while {text} is the placeholder for the passage to analyse.
ner_user_msg = '''You are a Hindi language expert who specializes in extracting entities from text. Given a piece of text, extract all crucial entities along with their respective context-aware entity types. Ensure that entity type is in Hindi. The output should be in JSON format.

## Output format:
```json
{{
  "entities": [
    {{
      "type": "_",
      "value": ["_", "_"]
    }},
    {{
      "type": "_",
      "value": ["_"]
    }}
  ]
}}
```

## Text:
""" {text} """'''


## Chat template wrapped around the user message via the {user_msg} placeholder.
## The System section is left empty and the template ends with the Assistant
## tag followed by a newline.
prompt_format = '''<extra_id_0>System

<extra_id_1>User
{user_msg}
<extra_id_1>Assistant
'''

## utils

In [5]:
def extract_json(text) -> Any:
    """Extract and parse the first ```json fenced block found in ``text``.

    The fenced payload is first parsed as strict JSON. If that fails, it is
    parsed as a Python literal (single-quoted strings, ``True``/``None``,
    tuples, ...) and round-tripped through ``json`` so the result only
    contains JSON-compatible types.

    Args:
        text: Raw model output expected to contain a ```json ... ``` block.

    Returns:
        The parsed data, or ``None`` (after printing a diagnostic) when
        ``text`` is not a string, contains no fenced block, or the block
        cannot be parsed by either strategy.
    """
    def _report_failure() -> None:
        ## single place for the diagnostic so every failure path prints the same message
        print(f"NOT ABLE TO EXTRACT JSON DATA FROM TEXT: {text}")
        return None

    if not isinstance(text, str):
        return _report_failure()

    # Regex to extract content between ```json and ```
    match = re.search(r'```json\s*([\s\S]*?)\s*```', text)
    if match is None:
        return _report_failure()

    json_str: str = match.group(1).strip()

    ## for proper json structure
    try:
        return json.loads(json_str)
    except (ValueError, RecursionError):
        ## json.JSONDecodeError is a ValueError; fall through to the lenient parser
        pass

    ## for malformed json
    try:
        python_literal: Any = ast.literal_eval(json_str)  ## any python literal (str, dict, list, tuple, ...)
        ## dump + load normalises the literal to plain JSON types (e.g. tuple -> list)
        return json.loads(json.dumps(python_literal, ensure_ascii=False))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        ## literal_eval rejected the payload, or it holds values json cannot encode
        return _report_failure()

In [6]:
def extract_source_text_from_prompt(prompt: str) -> str|None:
  # Regular expression to find the text inside triple double quotes under ## Text:
  match = re.search(r'## Text:\s*"""(.*?)"""', prompt, re.DOTALL)
  extracted_source_text: str|None =  match.group(1).strip() if match else None
  if not extracted_source_text:
    print(f"NOT ABLE TO EXTRACT SOURCE TEXT FROM PROMPT: {prompt}")
  return extracted_source_text

In [7]:
def inference(prompts: List[str], sampling_params: SamplingParams) -> List[Dict[str, Any]]:
  """Run the module-level ``llm`` on ``prompts`` and collect raw and parsed results.

  For every item returned by ``llm.generate`` one dict is produced with the keys
  ``prompt``, ``source_text`` (recovered via ``extract_source_text_from_prompt``),
  ``raw_output_texts`` (the ``text`` of each entry in ``outputs``) and
  ``processed_output_texts`` (``extract_json`` applied to each raw text).
  """
  def _to_record(request_output) -> Dict[str, Any]:
    ## one result dict per generated request
    prompt_text = request_output.prompt
    recovered_text = extract_source_text_from_prompt(prompt_text)
    completions = [candidate.text for candidate in request_output.outputs]
    parsed = list(map(extract_json, completions))
    return {
        "prompt": prompt_text,
        "source_text": recovered_text,
        "raw_output_texts": completions,
        "processed_output_texts": parsed,
    }

  generated = llm.generate(prompts, sampling_params, use_tqdm=True)
  return [_to_record(request_output) for request_output in generated]

## main

In [8]:
## Sample passages for the demo: text1 is Hindi, text2 is English, text3 mixes Hindi and English.
text1 = """एअर इंडिया ने X पर बताया है कि गुजरात के अहमदाबाद एयरपोर्ट के पास गुरुवार को हुए विमान हादसे में फ्लाइट में सवार 242 लोगों में से 241 लोगों की मौत हो गई और सिर्फ एक शख्स जीवित बचा है। एअर इंडिया ने बताया, 'यात्रियों में 169 भारतीय नागरिक, 53 ब्रिटिश और 7 पुर्तगाली और 1 कनाडाई नागरिक था।'"""
text2 = """Fireflies launched a new feature called Talk to Fireflies. This turns its AI notetaker into an active meeting participant. It allows you to ask real-time questions, either about the meeting itself or the wider web, and get instant answers without leaving their video call. The integration works both ways: where Perplexity users can ask questions about past meetings."""
text3 = """इशान टेक्नोलॉजीज को भारत सरकार के 'इंडियाएआई मिशन' के तहत आधिकारिक रूप से सूचीबद्ध किया गया है, as one of seven AI compute infrastructure providers nationwide. Under this initiative, Ishan Technologies will offer access to over 1,000 high-performance Graphic Processing Units (GPUs), enabling affordable on-demand compute capacity for startups, academic researchers, and public sector developers."""

In [9]:
## Fill the NER instruction with each sample text, then wrap it in the chat template.
prompts: List[str] = list(
    map(
        lambda sample: prompt_format.format(user_msg=ner_user_msg.format(text=sample)),
        (text1, text2, text3),
    )
)

In [10]:
## NOTE:
## 1. nis12ram/HindiNER-4B-v0.0 and nis12ram/HindiNER-4B-v0.1 need a stopping strategy to work
##    efficiently; <extra_id_1> acts as the stop token.
## 2. Greedy sampling (temperature=0.0) performs better than other settings.
sampling_params = SamplingParams(temperature=0.0, max_tokens=2000, stop=["<extra_id_1>"])

In [None]:
## Generate for all prompts; each element of `data` is one result dict built by `inference`.
data = inference(prompts=prompts, sampling_params=sampling_params)

In [17]:
## Print every result record between separator lines.
## No input() pause between records: it blocked "Run All" on each item and
## fails when the notebook is executed without an interactive stdin.
for inst in data:
  print("-"*50)
  print(f'** prompt:    {inst["prompt"]}')
  print(f'** source_text:     {inst["source_text"]}')
  print(f'** raw_output_texts:    {inst["raw_output_texts"]}')
  print(f'** processed_output_texts:    {inst["processed_output_texts"]}')
  print("-"*50)

--------------------------------------------------
** prompt:    <extra_id_0>System

<extra_id_1>User
You are a Hindi language expert who specializes in extracting entities from text. Given a piece of text, extract all crucial entities along with their respective context-aware entity types. Ensure that entity type is in Hindi. The output should be in JSON format.

## Output format:
```json
{
  "entities": [
    {
      "type": "_",
      "value": ["_", "_"]
    },
    {
      "type": "_",
      "value": ["_"]
    }
  ]
}
```

## Text:
""" एअर इंडिया ने X पर बताया है कि गुजरात के अहमदाबाद एयरपोर्ट के पास गुरुवार को हुए विमान हादसे में फ्लाइट में सवार 242 लोगों में से 241 लोगों की मौत हो गई और सिर्फ एक शख्स जीवित बचा है। एअर इंडिया ने बताया, 'यात्रियों में 169 भारतीय नागरिक, 53 ब्रिटिश और 7 पुर्तगाली और 1 कनाडाई नागरिक था।' """
<extra_id_1>Assistant

** source_text:     एअर इंडिया ने X पर बताया है कि गुजरात के अहमदाबाद एयरपोर्ट के पास गुरुवार को हुए विमान हादसे में फ्लाइट में सवार 242 लो