In [None]:
!python --version

Python 3.11.13


In [None]:
# Import required libraries
from gradio_client import Client
import html
import pandas as pd
import numpy as np
import time
import re

In [None]:
MODEL_ENDPOINT = "https://7ee6-165-204-156-250.ngrok-free.app/"

# Initialize client
client = Client(MODEL_ENDPOINT)

Loaded as API: https://7ee6-165-204-156-250.ngrok-free.app/ ✔


In [None]:
#Function to call API
def get_model_response(prompt: str, max_tokens: int = 7500, temperature: float = 0.0) -> tuple | list | None:
    """
    Send one prompt to the Gradio endpoint and return the raw predict() result.

    Args:
        prompt: Text forwarded as the ``prompt`` argument of ``/submit_message``.
        max_tokens: Forwarded as ``max_tokens``; defaults to 7500, the value that
            used to be hard-coded here.
        temperature: Forwarded as ``temperature``; defaults to 0.0, the value that
            used to be hard-coded here.

    Returns:
        Whatever ``client.predict`` returns, unmodified, or None when the call
        raised. The result is not a plain string.
        NOTE(review): the exact container type (tuple vs list) is not visible
        from this notebook; the annotation covers both.
    """
    try:
        return client.predict(
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            chat_history=[],  # every call is sent with an empty history
            api_name="/submit_message",
        )
    except Exception as e:
        # Best-effort by design: report the failure and let the caller decide
        # what to do with a missing response.
        print(f"Error in get_model_response: {e}")
        return None

In [None]:
# Function to extract the assistant's response from the full JSON result

def extract_assistant_content(result):
    """
    Return the HTML-unescaped content of the first assistant message in a
    predict() result.

    ``result[0]`` is treated as the list of chat messages; each message is
    queried with ``.get("role")`` / ``.get("content")``. An empty string is
    returned when ``result`` is empty, when no assistant message exists, or
    when the structure cannot be walked (the error is printed in that case).
    """
    if not result or len(result) == 0:
        return ""

    try:
        # Lazily scan the message list and stop at the first assistant entry.
        assistant_entries = (
            entry for entry in result[0] if entry.get("role") == "assistant"
        )
        first_assistant = next(assistant_entries, None)
        if first_assistant is not None:
            # The endpoint returns HTML-escaped text (e.g. &lt;think&gt;).
            return html.unescape(first_assistant.get("content", ""))
    except (IndexError, TypeError, AttributeError) as e:
        print(f"Error extracting assistant content: {e}")
        return ""

    return ""

In [None]:
"""
  Function to get two strings:
    1 - What is the model thinking?
    2 - What is the model responding?
"""


def split_thinking_and_response(content):
    """
    Split assistant content into the visible answer and the <think> reasoning.

    Args:
        content: Assistant text, possibly containing a ``<think>...</think>``
            section. May be empty or None.

    Returns:
        A ``(visible, thinking)`` tuple of stripped strings, in that order:
          * complete ``<think>...</think>``: ``thinking`` is the body of the
            first such section and ``visible`` is the content with every
            complete section removed;
          * ``<think>`` with no closing tag (e.g. truncated output): ``thinking``
            is everything after the tag, ``visible`` everything before it;
          * no ``<think>`` tag at all: ``visible`` is the whole content and
            ``thinking`` is ``""``;
          * empty or None content: ``("", "")``.
    """
    if not content:
        return "", ""

    open_tag = '<think>'

    # Complete <think>...</think> section(s).
    think_match = re.search(r'<think>(.*?)</think>', content, re.DOTALL)
    if think_match:
        thinking = think_match.group(1).strip()
        # Visible response is everything outside the <think> sections.
        visible = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
        return visible, thinking

    # Opening tag without a closing tag.
    think_start = content.find(open_tag)
    if think_start != -1:
        thinking = content[think_start + len(open_tag):].strip()
        visible = content[:think_start].strip()
        return visible, thinking

    # No thinking section at all. Previously the function fell through here and
    # returned None, which broke tuple-unpacking in every caller.
    return content.strip(), ""

In [None]:
# Helper function to check for truncated responses


def analyze_response_structure(content):
    """
    Print a diagnostic report on how ``content`` uses <think> tags.

    Reports the content length, whether each tag is present, the position of
    the first occurrence of each tag, the distance between them, and the first
    and last 200 characters. Returns nothing; all output goes to stdout.
    """
    print("=== RESPONSE ANALYSIS ===")
    print(f"Content length: {len(content)}")

    has_open = '<think>' in content
    has_close = '</think>' in content
    print(f"Contains <think>: {has_open}")
    print(f"Contains </think>: {has_close}")

    if has_open:
        open_pos = content.find('<think>')
        print(f"<think> position: {open_pos}")

        if not has_close:
            print("No closing </think> tag found - content may be truncated")
        else:
            close_pos = content.find('</think>')
            print(f"</think> position: {close_pos}")
            # 7 is the length of the literal "<think>" tag.
            print(f"Thinking content length: {close_pos - open_pos - 7}")

    head, tail = content[:200], content[-200:]
    print(f"First 200 chars: {head}")
    print(f"Last 200 chars: {tail}")
    print("=" * 50)

In [None]:
# HF Login

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Dataset Access

!wget -O data.csv "https://huggingface.co/datasets/OdiaGenAIdata/Reasoning_OD/resolve/main/Odia%20Reasoning%20Data.csv"


--2025-07-15 18:44:18--  https://huggingface.co/datasets/OdiaGenAIdata/Reasoning_OD/resolve/main/Odia%20Reasoning%20Data.csv
Resolving huggingface.co (huggingface.co)... 18.164.174.17, 18.164.174.23, 18.164.174.118, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.17|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: /api/resolve-cache/datasets/OdiaGenAIdata/Reasoning_OD/3b72e70500791ad3fcdbe2a05fe5546c0b5a904e/Odia%20Reasoning%20Data.csv?%2Fdatasets%2FOdiaGenAIdata%2FReasoning_OD%2Fresolve%2Fmain%2FOdia+Reasoning+Data.csv=&etag=%221c6237d095d8e95d55ab4f47389ce921bfbe045f%22 [following]
--2025-07-15 18:44:19--  https://huggingface.co/api/resolve-cache/datasets/OdiaGenAIdata/Reasoning_OD/3b72e70500791ad3fcdbe2a05fe5546c0b5a904e/Odia%20Reasoning%20Data.csv?%2Fdatasets%2FOdiaGenAIdata%2FReasoning_OD%2Fresolve%2Fmain%2FOdia+Reasoning+Data.csv=&etag=%221c6237d095d8e95d55ab4f47389ce921bfbe045f%22
Reusing existing connection to huggingface.

In [None]:
#Reading and storing the data

import pandas as pd


df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,_id,Question,Answer,Explanation,Type Of Question,Difficulty Score
0,ODR_0000,ଶୂନ୍ୟସ୍ଥାନ ପୂରଣ କର।\n(i) 3x + 2x = (3 + ______...,5x,(i) 3x + 2x\n= (3 + 2)x\n= 5x,Quantitative Reasoning,
1,ODR_0001,5x + 7x = (________ + 7)x = ________,12x,5x + 7x\n= (5 + 7)x\n= 12x,Quantitative Reasoning,
2,ODR_0002,-8 ର ଯୋଗାତ୍ମକ ବିଲୋମୀ ହେଉଛି ( ) ।,8,ସମାଧାନ:\n8,Quantitative Reasoning,
3,ODR_0003,ଗୋଟିଏ ସାଧାରଣ ଜ୍ଞାନ ପ୍ରତିଯୋଗିତାରେ ଗୋଟିଏ ପ୍ରଶ୍ନର...,ମନିଷା ମୋଟ 4 ନମ୍ବର ପାଇଲା ।,ମନିଷା ଚାରୋଟି ପାଳିରେ ପାଇଥିବା ନମ୍ବରଗୁଡ଼ିକ ହେଲା :...,Quantitative Reasoning,
4,ODR_0004,ଏକ ସମୟରେ ଗୋଟିଏ ଉଡ଼ାଜାହାଜ ସମୁଦ୍ରପତ୍ତନଠାରୁ 5000 ...,ସେହି ସମୟରେ ଉକ୍ତ ଜାହାଜ ଦୁଇଟି ମଧ୍ୟରେ ଦୂରତା 6500 ମି.,ଏକ ଉଡ଼ାଜାହାଜ ସମୁଦ୍ରପତ୍ତନଠାରୁ 5000 ମି. ଉପରେ ଉଡୁ...,Quantitative Reasoning,


In [None]:
# Testing the pipeline on a single response


# Take one question from the dataset and echo it.
sample_prompt = df["Question"].iloc[30]
print(f"Question: {sample_prompt}")
print("\n" + "="*50 + "\n")

# Run the full pipeline (call -> extract -> split) on that single question.
response = get_model_response(sample_prompt)
if not response:
    print("Failed to get model response")
else:
    content = extract_assistant_content(response)
    if not content:
        print("No assistant content found in response")
    else:
        # Optional: structural diagnostics before splitting.
        analyze_response_structure(content)

        visible_response, thinking_response = split_thinking_and_response(content)

        print("\n💬 Model's Visible Response:")
        print(visible_response or "[EMPTY - All content is in thinking]")

        print("\n🧠 Machine's Thinking:")
        if not thinking_response:
            print("[EMPTY - No thinking content found]")
        else:
            print(thinking_response)
            # An opening tag with no closing tag means the generation was cut off.
            unclosed_think = '<think>' in content and '</think>' not in content
            if unclosed_think and not thinking_response.endswith('</think>'):
                print("\n⚠️  Note: Thinking content appears to be truncated (no closing tag)")

        print(f"\n📊 Summary:")
        print(f"Visible response length: {len(visible_response)}")
        print(f"Thinking response length: {len(thinking_response)}")

Question: ଭ୍ରମ ଥିଲେ ସଂଶୋଧନ କର ।
ଉଦ୍‌ଜାନର ଆଣବିକ ସଂକେତ N2 |


=== RESPONSE ANALYSIS ===
Content length: 3050
Contains <think>: True
Contains </think>: False
<think> position: 0
No closing </think> tag found - content may be truncated
First 200 chars: <think>
ପ୍ରଥମେ, ପ୍ରଶ୍ନଟି କଣ ଅଟେ ତାହା ବୁଝିବା ଆବଶ୍ୟକ। ପ୍ରଶ୍ନଟି କୁହାଯାଉଛି ଯେ ଭ୍ରମ ଥିଲେ ସଂଶୋଧନ କରିବା ଆବଶ୍ୟକ, ଏବଂ ଉଦ୍‌ଜାନର ଆଣବିକ ସଂକେତ N2 ଅଟେ। ଏହା ଗଣିତର ଏକ ସମସ୍ୟା ଅଟେ, ଯେଉଁଥିରେ ଭ୍ରମ ଥିଲେ ସଂଶୋଧନ କରିବା ପାଇଁ 
Last 200 chars: ତ ପ୍ରତିକ୍ରିୟା କରିବା ପାଇଁ କିମ୍ବା ଅନ୍ୟ ପଦାର୍ଥର ସହିତ ପ୍ରତିକ୍ରିୟା କରିବା ପାଇଁ କିମ୍ବା ଅନ୍ୟ ପଦାର୍ଥର ସହିତ ପ୍ରତିକ୍ରିୟା କରିବା ପାଇଁ କିମ୍ବା ଅନ୍ୟ ପଦାର୍ଥର ସହିତ ପ୍ରତିକ୍ରିୟା କରିବା ପାଇଁ କିମ୍ବା ଅନ୍ୟ ପଦାର୍ଥର ସହିତ ପ୍ରତିକ

💬 Model's Visible Response:
[EMPTY - All content is in thinking]

🧠 Machine's Thinking:
ପ୍ରଥମେ, ପ୍ରଶ୍ନଟି କଣ ଅଟେ ତାହା ବୁଝିବା ଆବଶ୍ୟକ। ପ୍ରଶ୍ନଟି କୁହାଯାଉଛି ଯେ ଭ୍ରମ ଥିଲେ ସଂଶୋଧନ କରିବା ଆବଶ୍ୟକ, ଏବଂ ଉଦ୍‌ଜାନର ଆଣବିକ ସଂକେତ N2 ଅଟେ। ଏହା ଗଣିତର ଏକ ସମସ୍ୟା ଅଟେ, ଯେଉଁଥିରେ ଭ୍ରମ ଥିଲେ ସଂଶୋଧନ କରିବା ପାଇଁ ଆଣବିକ ସଂକେତର ଉପଯୋଗ କରାଯାଇପାରେ।

ପରବର୍ତ୍ତୀ ପଦକ୍ଷେପରେ,

In [None]:
df['Type Of Question'].nunique()
df['Type Of Question'].value_counts()

Unnamed: 0_level_0,count
Type Of Question,Unnamed: 1_level_1
Logical Reasoning,111
Quantitative Reasoning,106
Linguistic Ability,77
Verbal Reasoning,50
Scientific Reasoning,24
General Knowledge,16
Time & Calendar Reasoning,16


In [None]:
# Making a small batch to evaluate first

df_sampled = df.groupby("Type Of Question").sample(n=5, random_state=42).reset_index(drop=True)

In [None]:
df_sampled.shape

(35, 6)

In [None]:
df_sampled.head()

Unnamed: 0,_id,Question,Answer,Explanation,Type Of Question,Difficulty Score
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge,
1,ODR_0081,10% ଦାମ୍ ହ୍ରାସ ଘଟି ଚାହିଦାର ସମ୍ପ୍ରସାରଣ 8% ଘଟିଲେ...,ଯେହେତୁ ଦାମ୍‌ର ଆନୁପାତିକ ପରିବର୍ତ୍ତନଠାରୁ ଚାହିଦାର ...,,General Knowledge,
2,ODR_0085,ଭ୍ରମ ସଂଶୋଧନ କର । \nଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ...,ଭାରତ ସମ୍ବିଧାନର ୪୨ତମ ସଂଶୋଧନ ଆଇନ ପ୍ରସ୍ତାବନାରେ ‘ଧ...,,General Knowledge,
3,ODR_0398,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ମଙ୍...,ଚନ୍ଦ୍ର,"ଅନ୍ୟ ତିନୋଟି ଗ୍ରହ, ଚନ୍ଦ୍ର ପୃଥିବୀର ଉପଗ୍ରହ",General Knowledge,
4,ODR_0397,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ଶିକ...,ଛାତ୍ର,"ଅନ୍ୟ ତିନୋଟି ପେଶା, ଛାତ୍ର ଏକ ଭୂମିକା",General Knowledge,


In [None]:
df_sampled['Type Of Question'].value_counts()

Unnamed: 0_level_0,count
Type Of Question,Unnamed: 1_level_1
General Knowledge,5
Linguistic Ability,5
Logical Reasoning,5
Quantitative Reasoning,5
Scientific Reasoning,5
Time & Calendar Reasoning,5
Verbal Reasoning,5


In [None]:
df_sampled.isnull().sum()


Unnamed: 0,0
_id,0
Question,0
Answer,0
Explanation,30
Type Of Question,0
Difficulty Score,35


In [None]:
df_sampled = df_sampled.drop("Difficulty Score", axis=1)


In [None]:
df_sampled.head(5)

Unnamed: 0,_id,Question,Answer,Explanation,Type Of Question
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge
1,ODR_0081,10% ଦାମ୍ ହ୍ରାସ ଘଟି ଚାହିଦାର ସମ୍ପ୍ରସାରଣ 8% ଘଟିଲେ...,ଯେହେତୁ ଦାମ୍‌ର ଆନୁପାତିକ ପରିବର୍ତ୍ତନଠାରୁ ଚାହିଦାର ...,,General Knowledge
2,ODR_0085,ଭ୍ରମ ସଂଶୋଧନ କର । \nଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ...,ଭାରତ ସମ୍ବିଧାନର ୪୨ତମ ସଂଶୋଧନ ଆଇନ ପ୍ରସ୍ତାବନାରେ ‘ଧ...,,General Knowledge
3,ODR_0398,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ମଙ୍...,ଚନ୍ଦ୍ର,"ଅନ୍ୟ ତିନୋଟି ଗ୍ରହ, ଚନ୍ଦ୍ର ପୃଥିବୀର ଉପଗ୍ରହ",General Knowledge
4,ODR_0397,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ଶିକ...,ଛାତ୍ର,"ଅନ୍ୟ ତିନୋଟି ପେଶା, ଛାତ୍ର ଏକ ଭୂମିକା",General Knowledge


In [None]:
# Renaming columns to better differentiate human data from model data

df_sampled.columns = df_sampled.columns.str.strip()
df_sampled = df_sampled.rename(columns={
    "Answer": "Human_Answer",
    "Explanation": "Human_Explanation"
})


In [None]:
df_sampled.head(5)

Unnamed: 0,_id,Question,Human_Answer,Human_Explanation,Type Of Question
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge
1,ODR_0081,10% ଦାମ୍ ହ୍ରାସ ଘଟି ଚାହିଦାର ସମ୍ପ୍ରସାରଣ 8% ଘଟିଲେ...,ଯେହେତୁ ଦାମ୍‌ର ଆନୁପାତିକ ପରିବର୍ତ୍ତନଠାରୁ ଚାହିଦାର ...,,General Knowledge
2,ODR_0085,ଭ୍ରମ ସଂଶୋଧନ କର । \nଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ...,ଭାରତ ସମ୍ବିଧାନର ୪୨ତମ ସଂଶୋଧନ ଆଇନ ପ୍ରସ୍ତାବନାରେ ‘ଧ...,,General Knowledge
3,ODR_0398,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ମଙ୍...,ଚନ୍ଦ୍ର,"ଅନ୍ୟ ତିନୋଟି ଗ୍ରହ, ଚନ୍ଦ୍ର ପୃଥିବୀର ଉପଗ୍ରହ",General Knowledge
4,ODR_0397,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ଶିକ...,ଛାତ୍ର,"ଅନ୍ୟ ତିନୋଟି ପେଶା, ଛାତ୍ର ଏକ ଭୂମିକା",General Knowledge


In [None]:
# Adding new columns to store the raw response, the answer and the thinking tokens

import numpy as np
# Create the model-output columns with object dtype. Filling them with np.nan
# made them float64, and the batch cell writes strings into them, which pandas
# flags as "dtype incompatible with float64" and will reject in a future release.
# The cells still count as missing for isnull().
for model_column in ("Model_Response", "Model_Answer", "Model_Explanation"):
    df_sampled[model_column] = pd.Series(None, index=df_sampled.index, dtype="object")


In [None]:
df_sampled.head(5)

Unnamed: 0,_id,Question,Human_Answer,Human_Explanation,Type Of Question,Model_Response,Model_Answer,Model_Explanation
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge,,,
1,ODR_0081,10% ଦାମ୍ ହ୍ରାସ ଘଟି ଚାହିଦାର ସମ୍ପ୍ରସାରଣ 8% ଘଟିଲେ...,ଯେହେତୁ ଦାମ୍‌ର ଆନୁପାତିକ ପରିବର୍ତ୍ତନଠାରୁ ଚାହିଦାର ...,,General Knowledge,,,
2,ODR_0085,ଭ୍ରମ ସଂଶୋଧନ କର । \nଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ...,ଭାରତ ସମ୍ବିଧାନର ୪୨ତମ ସଂଶୋଧନ ଆଇନ ପ୍ରସ୍ତାବନାରେ ‘ଧ...,,General Knowledge,,,
3,ODR_0398,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ମଙ୍...,ଚନ୍ଦ୍ର,"ଅନ୍ୟ ତିନୋଟି ଗ୍ରହ, ଚନ୍ଦ୍ର ପୃଥିବୀର ଉପଗ୍ରହ",General Knowledge,,,
4,ODR_0397,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ଶିକ...,ଛାତ୍ର,"ଅନ୍ୟ ତିନୋଟି ପେଶା, ଛାତ୍ର ଏକ ଭୂମିକା",General Knowledge,,,


In [None]:
# Batch Processing

import time
import json

# For every sampled question: query the model, split its output into the visible
# answer and the thinking text, and store all three pieces on the row.
for index, row in df_sampled.iterrows():
    print(f"\n🔄 Collecting Response for Index: {index}")

    try:
        # Step 1: Get response from model
        response = get_model_response(row["Question"])
        if response is None:
            # get_model_response reports API failures by returning None. Without
            # this check the row was stored with an empty answer and logged as
            # "Collected", so failed calls were indistinguishable from real ones.
            raise RuntimeError("get_model_response returned None")

        # Step 2: Extract clean content
        content = extract_assistant_content(response)

        # Step 3: Split into final visible response and thinking
        visible_response, thinking_response = split_thinking_and_response(content)

        # Step 4: Handle missing explanation
        if not thinking_response.strip():
            thinking_response = "[No explanation provided]"

        # Step 5: Store raw response as JSON string (preserves structure)
        try:
            df_sampled.at[index, "Model_Response"] = json.dumps(response, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            # If JSON serialization fails, fall back to the repr of the object
            df_sampled.at[index, "Model_Response"] = repr(response)

        # Step 6: Store processed answers
        df_sampled.at[index, "Model_Answer"] = str(visible_response).strip()
        df_sampled.at[index, "Model_Explanation"] = str(thinking_response).strip()

        print("✅ Collected.")

    except Exception as e:
        print(f"❌ Error at index {index}: {e}")
        # Store error info so the failed row is visible in the results
        df_sampled.at[index, "Model_Response"] = f"ERROR: {e}"
        df_sampled.at[index, "Model_Answer"] = "ERROR"
        df_sampled.at[index, "Model_Explanation"] = f"Error occurred: {e}"

    # Pause between requests
    time.sleep(1)

print("\n🎉 Batch processing completed!")
print(f"📊 Results preview:")
print(df_sampled[["Question", "Model_Answer", "Model_Explanation"]].head())



🔄 Collecting Response for Index: 0


  [
    {
      "role": "user",
      "metadata": null,
      "content": "ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର୍ଦୁବିନ୍ଦୁ ମଧ୍ୟରେ ଚାହିଦାର ସ୍ଥିତିସ୍ଥାପକତା  ଠାରୁ ଅଧିକ । ଆବଶ୍ୟକ ସ୍ଥଳେ ସଂଶୋଧନ କର",
      "options": null
    },
    {
      "role": "assistant",
      "metadata": null,
      "content": "&lt;think&gt;\nପ୍ରଥମେ, ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର୍ଦୁବିନ୍ଦୁ ମଧ୍ୟରେ ଚାହିଦାର ସ୍ଥିତିସ୍ଥାପକତା  ଠାରୁ ଅଧିକ । ଆବଶ୍ୟକ ସ୍ଥଳେ ସଂଶୋଧନ କର। ଏହି ପ୍ରସ୍ତୁତିରେ କେହି ଅଛି ଯିଏ ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର୍ଦୁ",
      "options": null
    }
  ],
  {
    "visible": true,
    "value": "/tmp/gradio/cbe9aa3a340a7cebf40cbaae4a78cfaacfc35938a174a2db57891c6e9cf786ff/previous_conversation.xlsx",
    "__type__": "update"
  },
  "",
  {
    "visible": false,
    "__type__": "update"
  }
]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_sampled.at[index, "Model_Response"] = json.dumps(response, ensure_ascii=False, indent=2)
  df_sampled.at[

✅ Collected.

🔄 Collecting Response for Index: 1
✅ Collected.

🔄 Collecting Response for Index: 2
✅ Collected.

🔄 Collecting Response for Index: 3
✅ Collected.

🔄 Collecting Response for Index: 4
✅ Collected.

🔄 Collecting Response for Index: 5
✅ Collected.

🔄 Collecting Response for Index: 6
✅ Collected.

🔄 Collecting Response for Index: 7
✅ Collected.

🔄 Collecting Response for Index: 8
✅ Collected.

🔄 Collecting Response for Index: 9
✅ Collected.

🔄 Collecting Response for Index: 10
✅ Collected.

🔄 Collecting Response for Index: 11
✅ Collected.

🔄 Collecting Response for Index: 12
✅ Collected.

🔄 Collecting Response for Index: 13
✅ Collected.

🔄 Collecting Response for Index: 14
✅ Collected.

🔄 Collecting Response for Index: 15
✅ Collected.

🔄 Collecting Response for Index: 16
✅ Collected.

🔄 Collecting Response for Index: 17
✅ Collected.

🔄 Collecting Response for Index: 18
✅ Collected.

🔄 Collecting Response for Index: 19
✅ Collected.

🔄 Collecting Response for Index: 20
✅ Collect

In [None]:
df_sampled.isnull().sum()


Unnamed: 0,0
_id,0
Question,0
Human_Answer,0
Human_Explanation,30
Type Of Question,0
Model_Response,0
Model_Answer,0
Model_Explanation,0


In [None]:
df_sampled.head(1)

Unnamed: 0,_id,Question,Human_Answer,Human_Explanation,Type Of Question,Model_Response,Model_Answer,Model_Explanation
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ..."


In [None]:
# Function to tokenize the text (essential for BLEU)

import re

def simple_tokenize_odia(text):
    """
    Tokenize Odia text into whole words plus the sentence marks ।, ! and ?.

    A word is a run of ``\\w`` characters, any code point in the Odia Unicode
    block (U+0B00-U+0B7F), and the zero-width joiner / non-joiner. The Odia
    range is spelled out because ``\\w`` alone does not match Odia vowel signs
    or the virama, so a plain ``\\w+`` cuts every word at each of those marks.
    All other punctuation and whitespace is dropped.

    Args:
        text (str): Text to tokenize.

    Returns:
        list[str]: Tokens in order of appearance; ``[]`` for an empty string.
    """
    # Keep meaningful sentence marks (like ।) as their own tokens.
    tokens = re.findall(r'[\w\u0B00-\u0B7F\u200C\u200D]+|[।!?]', text)
    return tokens

In [None]:
odia_text = df_sampled['Question'].iloc[0]
tokens = simple_tokenize_odia(odia_text)
print(tokens)

['ଚ', 'ହ', 'ଦ', 'ଲ', 'ଖ', 'ସରଳର', 'ଖ', 'ହ', 'ଲ', 'ଏହ', 'ର', 'ମଧ', 'ୟ', 'ବ', 'ନ', 'ଦ', 'ଓ', 'ଊର', 'ଦ', 'ବ', 'ନ', 'ଦ', 'ମଧ', 'ୟର', 'ଚ', 'ହ', 'ଦ', 'ର', 'ସ', 'ଥ', 'ତ', 'ସ', 'ଥ', 'ପକତ', 'ଠ', 'ର', 'ଅଧ', 'କ', '।', 'ଆବଶ', 'ୟକ', 'ସ', 'ଥଳ', 'ସ', 'ଶ', 'ଧନ', 'କର']


In [None]:
!pip install nltk rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4c74a1916688da8c71fa90fbfb684f168bc47558f669ee961888de84d2b300c6
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Function To Check BLEU Score

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize

def compute_bleu(reference, prediction):
    """
    Sentence-level BLEU of ``prediction`` against a single ``reference``.

    Both strings are tokenized with simple_tokenize_odia and scored with
    NLTK's sentence_bleu using SmoothingFunction().method4.
    """
    reference_token_lists = [simple_tokenize_odia(reference)]
    hypothesis_tokens = simple_tokenize_odia(prediction)

    return sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )


In [None]:
# Function To Check ROUGE Score


from rouge_score import rouge_scorer

class _OdiaRougeTokenizer:
    """Adapter exposing simple_tokenize_odia through the tokenize() method RougeScorer calls."""

    def tokenize(self, text):
        return simple_tokenize_odia(text)


def compute_rouge(reference, prediction):
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L between two Odia strings.

    The scorer is given an explicit tokenizer because rouge_score's default
    tokenizer lower-cases the text and keeps only ASCII letters and digits.
    Odia text therefore tokenizes to nothing and every score comes out as 0.

    Args:
        reference (str): Ground-truth text.
        prediction (str): Model text.

    Returns:
        dict: Maps 'rouge1', 'rouge2' and 'rougeL' to score objects exposing
        ``precision``, ``recall`` and ``fmeasure``.
    """
    scorer = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=False,
        tokenizer=_OdiaRougeTokenizer(),
    )
    return scorer.score(reference, prediction)


In [None]:
# Function To Check Accuracy


def compute_accuracy(reference, prediction):
    """
    Average of the ROUGE-1, ROUGE-2 and ROUGE-L F1 scores.

    Despite the name, this is an n-gram overlap score, not exact-match
    accuracy. The ROUGE scores come from compute_rouge, so the scorer
    configuration lives in one place and whatever tokenization compute_rouge
    applies also governs this value.

    Args:
        reference (str): Ground truth Odia text.
        prediction (str): Predicted Odia text.

    Returns:
        float: Score between 0 and 1.
    """
    scores = compute_rouge(reference, prediction)

    # Mean F1 across the three ROUGE variants
    f1_values = [scores[name].fmeasure for name in ('rouge1', 'rouge2', 'rougeL')]
    return sum(f1_values) / len(f1_values)


In [None]:
#Testing the function

ref = "ମୁଁ ଆଜି ବଜାରକୁ ଯାଇଛି।"
pred = "ମୁଁ ବଜାରକୁ ଗଲି।"

print("🔵 BLEU Score:", compute_bleu(ref, pred))
print("✅ Accuracy:", round(compute_accuracy(ref, pred), 4))

rouge_scores = compute_rouge(ref, pred)
print("\n🔴 ROUGE Scores:")
for k, v in rouge_scores.items():
    print(f"{k}: Precision={v.precision:.2f}, Recall={v.recall:.2f}, F1={v.fmeasure:.2f}")


🔵 BLEU Score: 0.09662322044337945
✅ Accuracy: 0.0

🔴 ROUGE Scores:
rouge1: Precision=0.00, Recall=0.00, F1=0.00
rouge2: Precision=0.00, Recall=0.00, F1=0.00
rougeL: Precision=0.00, Recall=0.00, F1=0.00


In [None]:
df_sampled.head(1)

Unnamed: 0,_id,Question,Human_Answer,Human_Explanation,Type Of Question,Model_Response,Model_Answer,Model_Explanation
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ..."


In [None]:
#Adding columns to store BLEU , ROUGE and Accuracy of each row

df_sampled['BLEU'] = None
df_sampled['ROUGE'] = None
df_sampled['Accuracy'] = None


In [None]:
"""Run the evaluation on the batch data
      the [ref] is the real text
      the [pred] is the model's response
"""

# Score every row: the human answer is the reference, the model's visible
# answer is the prediction. Each metric is rounded to 4 decimals and stored.
for idx, row in df_sampled.iterrows():
    ref, pred = row['Human_Answer'], row['Model_Answer']

    bleu = compute_bleu(ref, pred)
    rouge = compute_rouge(ref, pred)
    acc = compute_accuracy(ref, pred)

    # The ROUGE column holds the ROUGE-L F1 only.
    for column, value in (
        ('BLEU', bleu),
        ('ROUGE', rouge['rougeL'].fmeasure),
        ('Accuracy', acc),
    ):
        df_sampled.at[idx, column] = round(value, 4)

In [None]:
df_sampled.head(4)

Unnamed: 0,_id,Question,Human_Answer,Human_Explanation,Type Of Question,Model_Response,Model_Answer,Model_Explanation,BLEU,ROUGE,Accuracy
0,ODR_0080,ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ୍ଦୁ ଓ ଊର...,ଏହା ଠିକ୍ ଅଟେ |,,General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ଚାହିଦା ଲେଖା ସରଳରେଖା ହେଲେ ଏହାର ମଧ୍ୟ ବିନ...",0,0,0.0
1,ODR_0081,10% ଦାମ୍ ହ୍ରାସ ଘଟି ଚାହିଦାର ସମ୍ପ୍ରସାରଣ 8% ଘଟିଲେ...,ଯେହେତୁ ଦାମ୍‌ର ଆନୁପାତିକ ପରିବର୍ତ୍ତନଠାରୁ ଚାହିଦାର ...,,General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ପ୍ରଶ୍ନଟିକୁ ଭୁଲିବା ଆବଶ୍ୟକ। ପ୍ରଶ୍ନଟି କହି...",0,0,0.0
2,ODR_0085,ଭ୍ରମ ସଂଶୋଧନ କର । \nଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ...,ଭାରତ ସମ୍ବିଧାନର ୪୨ତମ ସଂଶୋଧନ ଆଇନ ପ୍ରସ୍ତାବନାରେ ‘ଧ...,,General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ମୁଁ ଭାରତ ସମ୍ବିଧାନର ୪୪ତମ ସଂଶୋଧନ ବିଷୟରେ ...",0,0,0.0
3,ODR_0398,ନିମ୍ନଲିଖିତ ମଧ୍ୟରୁ ଅଲଗା ଗୋଟିକୁ ବାଛନ୍ତୁ:\nକ) ମଙ୍...,ଚନ୍ଦ୍ର,"ଅନ୍ୟ ତିନୋଟି ଗ୍ରହ, ଚନ୍ଦ୍ର ପୃଥିବୀର ଉପଗ୍ରହ",General Knowledge,"[\n [\n {\n ""role"": ""user"",\n ""m...",,"ପ୍ରଥମେ, ପ୍ରଶ୍ନଟି ବୁଝିବା ଆବଶ୍ୟକ। ପ୍ରଶ୍ନଟି କୁହାଯ...",0,0,0.0


In [None]:
# The metric columns were added to df_sampled, not to the raw df, so aggregate
# over df_sampled; df has no BLEU/ROUGE/Accuracy columns on a fresh kernel.
# The columns were created with None (object dtype), hence the explicit cast.
metric_scores = df_sampled[['BLEU', 'ROUGE', 'Accuracy']].astype(float)

# Mean
mean_bleu = metric_scores['BLEU'].mean()
mean_rouge = metric_scores['ROUGE'].mean()
mean_acc = metric_scores['Accuracy'].mean()

# Mode (first modal value, or None when the column has no values)
bleu_modes = metric_scores['BLEU'].mode()
rouge_modes = metric_scores['ROUGE'].mode()
acc_modes = metric_scores['Accuracy'].mode()
mode_bleu = bleu_modes.iloc[0] if not bleu_modes.empty else None
mode_rouge = rouge_modes.iloc[0] if not rouge_modes.empty else None
mode_acc = acc_modes.iloc[0] if not acc_modes.empty else None

# Print Results
print("🔵 Mean Scores:")
print(f"BLEU Mean: {mean_bleu:.4f}")
print(f"ROUGE Mean: {mean_rouge:.4f}")
print(f"Accuracy Mean: {mean_acc:.4f}")

print("\n🔴 Mode Scores:")
print(f"BLEU Mode: {mode_bleu}")
print(f"ROUGE Mode: {mode_rouge}")
print(f"Accuracy Mode: {mode_acc}")

🔵 Mean Scores:
BLEU Mean: 0.0286
ROUGE Mean: 0.0000
Accuracy Mean: 0.0000

🔴 Mode Scores:
BLEU Mode: 0.0
ROUGE Mode: 0.0
Accuracy Mode: 0.0


**Next Steps**

Investigate why the BLEU, ROUGE, and Accuracy scores are all so low.