# **Computer Science and Engineering Department**
#**Artificial Intelligence (UCS 521)**
# Project: **Automatic Answer Sheet Evaluation System**

---



Authors:

    SAMARTH MAHAJAN - 102303717
    MADHAV KAPILA - 102303721
    SNEHA GOSWAMI - 102303723

---


# Google Drive Connecting

In [91]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/

[Errno 2] No such file or directory: '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/'
/content


# Resolving PATHS

In [93]:
PATHS = {
    # Core datasets
    'questions': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/QuestionAnswersDataSet.csv',
    'keywords': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Keywords.csv',
    'weights': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Evaluation_Weightage.csv',

    # Output files
    'output': {
        'results': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/results.csv',
        'metadata': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/result_metadata.csv'
    },

    # PDF input
    'data': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/Test/test_perfect.pdf'
}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System


# pdf_preprocessor.py

Used to extract full text out of the pdf starting from a given page

In [94]:
# pdf_Preprocessor.py

import fitz
class PDFProcessor:
    def __init__(self, start_page=1):
        self.start_page = start_page - 1  # 0-based index
        logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_text(self, pdf_path):
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(self.start_page, len(doc)):
            full_text += doc[page_num].get_text()
        return full_text


# answer_parser.py

Used to separate Question headings and answer content from preprocessed text using **pdf_plumber** library and **regex**(regular expression)

In [95]:
# answer_parser.py

import pdfplumber
import re

class AnswerParser:
    def __init__(self):
        # Comprehensive question pattern
        self.question_pattern = re.compile(
            r'(?:^|\n)(?:Q|Question|Problem|\d+)[\s.)-]*\s*(\d+)[\s:)-]*',
            re.IGNORECASE
        )

    def parse(self, pdf_path):
        """Parse PDF into {q_num: answer_text} using layout analysis"""
        answers = {}
        current_q = None
        answer_buffer = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text with layout preservation
                text = page.extract_text(layout=True, x_tolerance=5, y_tolerance=2)

                # Process text in order of appearance
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Check for new question
                    match = self.question_pattern.search(line)
                    if match:
                        # Save previous answer
                        if current_q is not None:
                            answers[current_q] = ' '.join(answer_buffer).strip()
                            answer_buffer = []

                        current_q = int(match.group(1))
                        answer_start = match.end()
                        answer_buffer.append(line[answer_start:].strip())
                    else:
                        # Continue current answer
                        if current_q is not None:
                            answer_buffer.append(line)

        # Save last answer
        if current_q is not None:
            answers[current_q] = ' '.join(answer_buffer).strip()

        return self._merge_split_answers(answers)

    def _merge_split_answers(self, answers):
        """Merge answers split across pages/pages"""
        merged = {}
        sorted_qs = sorted(answers.keys())

        for q_num in sorted_qs:
            answer = answers[q_num]

            # Check if previous answer ends with continuation marker
            if q_num-1 in merged and merged[q_num-1].endswith((' ', '-')):
                merged[q_num-1] += ' ' + answer
            else:
                merged[q_num] = answer

        return merged

# scoring_engine.py

The heart of the project where all code related to the model lies. Here we firstly load data according to our need and then score our answers based on three major categories weighted according to type of question


1.   *Semantic Similarity* : Analysis performed using **SentenceTransformer** library a new age semantic analysis libraries replacing old RNN/LSTM models
2.   *Keyword Matching* : Using **string manipulation** we find total no. of required keywords used in the answer by the student
3.   *Grammar Check* : Using **TextBlob** NLP library we check if the word we used in the answer is correctly matching to the grammatically correct word that should be used there as TextBlob gives correct word suggestions for every word it parses.





In [96]:
# scoring_engine.py

import os
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class ScoringEngine:
    def __init__(self):
        #Initialize with paths from config
        self.weights = self._load_data('weights')
        self.keywords = self._load_data('keywords')
        self.teacher_answers = self._load_answers('questions')
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_data(self, path_key):
        path = PATHS[path_key]
        if not path or not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path, encoding='latin1')

    def _load_answers(self, path_key):
        #Specialized answer loader with column renaming
        df = self._load_data(path_key)
        return df.rename(columns={'Question Number': 'Question_Number'})[['Question_Number', 'Answer']]#We are only interested in these two columns
        #Ensure the column names are consistent with the rest of the code

    def _get_weights(self, q_num):
        matches = self.weights[self.weights['question'] == q_num]
        if matches.empty:
            raise KeyError(f"No weights found for Q{q_num}")  # Changed to KeyError
        return matches.iloc[0]

    def _get_model_answer(self, q_num):
        matches = self.teacher_answers[self.teacher_answers['Question_Number'] == q_num]
        if matches.empty:
            raise KeyError(f"No model answer found for Q{q_num}")  # Changed to KeyError
        return matches['Answer'].values[0]

    # def _get_keywords(self, q_num):
    #     #Retrieve and clean keywords
    #     keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
    #     return [
    #         str(kw).strip().lower()
    #         for col in keywords.values
    #         for kw in col
    #         if pd.notna(kw) and str(kw).strip()
    #     ]

    def _get_keywords(self, q_num):
        keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
        return [
            ' '.join(str(kw).strip().lower().split())
            for col in keywords.values
            for kw in col
            if pd.notna(kw) and str(kw).strip()
        ]


    def calculate_grammar_score(self, text):
        #Calculate grammar score with error normalization
        if not text.strip():
            return 0.0

        blob = TextBlob(text)
        total_words = len(blob.words)
        if total_words == 0:
            return 0.0

        error_count = sum(
            1 for word in blob.words
            if word.spellcheck()[0][0].lower() != word.lower()
        )
        return 1 - (error_count / total_words)

    # def calculate_keyword_score(self, student_answer, q_num):
    #     #Calculate keyword match score
    #     target_keywords = self._get_keywords(q_num)
    #     if not target_keywords:
    #         return 0.0

    #     student_words = student_answer.lower().split()
    #     matches = sum(1 for kw in target_keywords if any(kw in word for word in student_words))
    #     return matches / len(target_keywords)

    def calculate_keyword_score(self, student_answer, q_num):
        target_keywords = self._get_keywords(q_num)
        if not target_keywords:
            return 0.0

        # Split student answer into words
        student_words = student_answer.lower().split()

        # Convert keywords to word sequences
        keyword_phrases = [kw.split() for kw in target_keywords]

        matches = 0
        for phrase in keyword_phrases:
            # Check for phrase using sliding window
            phrase_length = len(phrase)
            found = False
            for i in range(len(student_words) - phrase_length + 1):
                if student_words[i:i+phrase_length] == phrase:
                    found = True
                    break
            if found:
                matches += 1

        return matches / len(target_keywords)

    def calculate_semantic_score(self, student_answer, q_num):
        #Calculate semantic similarity score
        model_answer = self._get_model_answer(q_num)
        embeddings = self.model.encode([model_answer, student_answer])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def calculate_total_score(self, student_answer, q_num):
        weights = self._get_weights(q_num)

        # Already correct - weights are normalized to 0-1 scale

        g = self.calculate_grammar_score(student_answer) * (weights['grammarwt']/100)
        k = self.calculate_keyword_score(student_answer, q_num) * (weights['Keywordswt']/100)
        s = self.calculate_semantic_score(student_answer, q_num) * (weights['similarityWt']/100)


        print(f'Question no. {q_num}')
        print(f'Grammar Score is {g} out of {weights["grammarwt"]/100.0}')
        print(f'Keyword Score is {k} out of {weights["Keywordswt"]/100.0}')
        print(f'Semantic Score is {s} out of {weights["similarityWt"]/100.0}')

        return (g + k + s)

# result_generator.py

It gets the marks scored by student and saves those marks into a csv named results.csv for future referencing

In [None]:
# result_generator.py

import pandas as pd


class ResultGenerator:
    def __init__(self):
        self.results_df = pd.DataFrame(columns=['RollNo', 'Name', 'Total', 'Percentage'])
        self.metadata_df = pd.DataFrame()

    def add_student(self, rollno, name, scores):
        # Add default 0 scores for missing questions
        for q_num in range(1, 201):
            if q_num not in scores:
                scores[q_num] = 0

        # Calculate total and percentage CORRECTLY
        max_possible = 200  # 200 questions
        total = sum(scores.values())
        perc = (total / max_possible) * 100  # Fixed here

        self.results_df.loc[len(self.results_df)] = [rollno, name, total, perc]

        # Update metadata
        student_data = {'RollNo': rollno, 'Name': name}
        student_data.update({'Total': total, 'Percentage': perc})

        self.metadata_df = pd.concat([self.metadata_df, pd.DataFrame([student_data])], ignore_index=True)

        return [rollno, name, total, perc]

    def save_results(self):
        self.results_df.to_csv(PATHS['output']['results'], mode='a', header=False, index=False)
        self.metadata_df.to_csv(PATHS['output']['metadata'], mode='a', header=False, index=False)


# main.py

The central code of project which accepts all inputs, call all functions and handles all output

In [98]:
#Main.py

def evaluate_student(pdf_path, rollno, name, start_page=1):
    # Initialize components
    processor = PDFProcessor(start_page)
    parser = AnswerParser()
    scorer = ScoringEngine()
    reporter = ResultGenerator()

    # Process PDF
    raw_text = processor.extract_text(pdf_path)
    # clean_text = parser.preprocess_pdf_text(raw_text)  # Static method call
    answers = parser.parse(pdf_path)  # Instance method

    # Score answers
    scores = {}
    for q_num, answer_text in answers.items():  # Direct answer access
        try:
            scores[q_num] = scorer.calculate_total_score(answer_text, q_num)
        except KeyError as e:
            print(f"Skipping Q{q_num}: {str(e)}")

    # Generate reports
    ans = reporter.add_student(rollno, name, scores)
    reporter.save_results()
    print(f"Processed {ans[0]} {ans[1]}. Total: {ans[2]}.  Percentage: {ans[3]}%")



In [99]:
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


# Driver Code - cli.py

The driver code of our project to use when running project using Command Line Interface

In [100]:
# Driver code - cli.py

def colab_evaluate_student():
    # Input parameters
    rollno = input("Enter Roll Number: ")
    name = input("Enter Student Name: ")
    start_page = int(input("Start page [1]: ") or 1)

    # try:
        # Verify PDF path
    pdf_path = PATHS['data']
    if not os.path.isfile(pdf_path):
      raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Process evaluation
    evaluate_student(pdf_path, rollno, name, start_page)
    print("\n✅ Evaluation Complete!")

    # except Exception as e:
        # print(f"\n❌ Error: {str(e)}")


colab_evaluate_student()

Enter Roll Number: 1
Enter Student Name: A
Start page [1]: 1




Processed 1 A. Total: 9667.643528009507.  Percentage: 4833.821764004753%

✅ Evaluation Complete!


# Test 1
Using non-erronous pdf from all aspects

In [None]:
!pip install pymupdf pdfplumber pandas numpy easyocr pdf2image nltk scikit-learn sentence-transformers flask reportlab textblob
!pip install --upgrade pymupdf
import fitz
import pdfplumber
import re
import os
import logging
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import os
import argparse
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/

PATHS = {
    # Core datasets
    'questions': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/QuestionAnswersDataSet.csv',
    'keywords': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Keywords.csv',
    'weights': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Evaluation_Weightage.csv',

    # Output files
    'output': {
        'results': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/results.csv',
        'metadata': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/result_metadata.csv'
    },

    # PDF input
    'data': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/Test/test_perfect.pdf'
}
import fitz
class PDFProcessor:
    def __init__(self, start_page=1):
        self.start_page = start_page - 1  # 0-based index
        logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_text(self, pdf_path):
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(self.start_page, len(doc)):
            full_text += doc[page_num].get_text()
        return full_text


import pdfplumber
import re

class AnswerParser:
    def __init__(self):
        # Comprehensive question pattern
        self.question_pattern = re.compile(
            r'(?:^|\n)(?:Q|Question|Problem|\d+)[\s.)-]*\s*(\d+)[\s:)-]*',
            re.IGNORECASE
        )

    def parse(self, pdf_path):
        """Parse PDF into {q_num: answer_text} using layout analysis"""
        answers = {}
        current_q = None
        answer_buffer = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text with layout preservation
                text = page.extract_text(layout=True, x_tolerance=5, y_tolerance=2)

                # Process text in order of appearance
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Check for new question
                    match = self.question_pattern.search(line)
                    if match:
                        # Save previous answer
                        if current_q is not None:
                            answers[current_q] = ' '.join(answer_buffer).strip()
                            answer_buffer = []

                        current_q = int(match.group(1))
                        answer_start = match.end()
                        answer_buffer.append(line[answer_start:].strip())
                    else:
                        # Continue current answer
                        if current_q is not None:
                            answer_buffer.append(line)

        # Save last answer
        if current_q is not None:
            answers[current_q] = ' '.join(answer_buffer).strip()

        return self._merge_split_answers(answers)

    def _merge_split_answers(self, answers):
        """Merge answers split across pages/pages"""
        merged = {}
        sorted_qs = sorted(answers.keys())

        for q_num in sorted_qs:
            answer = answers[q_num]

            # Check if previous answer ends with continuation marker
            if q_num-1 in merged and merged[q_num-1].endswith((' ', '-')):
                merged[q_num-1] += ' ' + answer
            else:
                merged[q_num] = answer

        return merged

import os
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class ScoringEngine:
    def __init__(self):
        #Initialize with paths from config
        self.weights = self._load_data('weights')
        self.keywords = self._load_data('keywords')
        self.teacher_answers = self._load_answers('questions')
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_data(self, path_key):
        path = PATHS[path_key]
        if not path or not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path, encoding='latin1')

    def _load_answers(self, path_key):
        #Specialized answer loader with column renaming
        df = self._load_data(path_key)
        return df.rename(columns={'Question Number': 'Question_Number'})[['Question_Number', 'Answer']]#We are only interested in these two columns
        #Ensure the column names are consistent with the rest of the code

    def _get_weights(self, q_num):
        matches = self.weights[self.weights['question'] == q_num]
        if matches.empty:
            raise KeyError(f"No weights found for Q{q_num}")  # Changed to KeyError
        return matches.iloc[0]

    def _get_model_answer(self, q_num):
        matches = self.teacher_answers[self.teacher_answers['Question_Number'] == q_num]
        if matches.empty:
            raise KeyError(f"No model answer found for Q{q_num}")  # Changed to KeyError
        return matches['Answer'].values[0]

    # def _get_keywords(self, q_num):
    #     #Retrieve and clean keywords
    #     keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
    #     return [
    #         str(kw).strip().lower()
    #         for col in keywords.values
    #         for kw in col
    #         if pd.notna(kw) and str(kw).strip()
    #     ]

    def _get_keywords(self, q_num):
        keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
        return [
            ' '.join(str(kw).strip().lower().split())
            for col in keywords.values
            for kw in col
            if pd.notna(kw) and str(kw).strip()
        ]


    def calculate_grammar_score(self, text):
        #Calculate grammar score with error normalization
        if not text.strip():
            return 0.0

        blob = TextBlob(text)
        total_words = len(blob.words)
        if total_words == 0:
            return 0.0

        error_count = sum(
            1 for word in blob.words
            if word.spellcheck()[0][0].lower() != word.lower()
        )
        return 1 - (error_count / total_words)

    # def calculate_keyword_score(self, student_answer, q_num):
    #     #Calculate keyword match score
    #     target_keywords = self._get_keywords(q_num)
    #     if not target_keywords:
    #         return 0.0

    #     student_words = student_answer.lower().split()
    #     matches = sum(1 for kw in target_keywords if any(kw in word for word in student_words))
    #     return matches / len(target_keywords)

    def calculate_keyword_score(self, student_answer, q_num):
        target_keywords = self._get_keywords(q_num)
        if not target_keywords:
            return 0.0

        # Split student answer into words
        student_words = student_answer.lower().split()

        # Convert keywords to word sequences
        keyword_phrases = [kw.split() for kw in target_keywords]

        matches = 0
        for phrase in keyword_phrases:
            # Check for phrase using sliding window
            phrase_length = len(phrase)
            found = False
            for i in range(len(student_words) - phrase_length + 1):
                if student_words[i:i+phrase_length] == phrase:
                    found = True
                    break
            if found:
                matches += 1

        return matches / len(target_keywords)

    def calculate_semantic_score(self, student_answer, q_num):
        #Calculate semantic similarity score
        model_answer = self._get_model_answer(q_num)
        embeddings = self.model.encode([model_answer, student_answer])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def calculate_total_score(self, student_answer, q_num):
        weights = self._get_weights(q_num)

        # Already correct - weights are normalized to 0-1 scale

        g = self.calculate_grammar_score(student_answer) * (weights['grammarwt']/100)
        k = self.calculate_keyword_score(student_answer, q_num) * (weights['Keywordswt']/100)
        s = self.calculate_semantic_score(student_answer, q_num) * (weights['similarityWt']/100)


        print(f'Question no. {q_num}')
        print(f'Grammar Score is {g} out of {weights["grammarwt"]/100.0}')
        print(f'Keyword Score is {k} out of {weights["Keywordswt"]/100.0}')
        print(f'Semantic Score is {s} out of {weights["similarityWt"]/100.0}')

        return (g + k + s)

import pandas as pd


class ResultGenerator:
    def __init__(self):
        self.results_df = pd.DataFrame(columns=['RollNo', 'Name', 'Total', 'Percentage'])
        self.metadata_df = pd.DataFrame()

    def add_student(self, rollno, name, scores):
        # Add default 0 scores for missing questions
        for q_num in range(1, 201):
            if q_num not in scores:
                scores[q_num] = 0

        # Calculate total and percentage CORRECTLY
        max_possible = 200  # 200 questions
        total = sum(scores.values())
        perc = (total / max_possible) * 100  # Fixed here

        self.results_df.loc[len(self.results_df)] = [rollno, name, total, perc]

        # Update metadata
        student_data = {'RollNo': rollno, 'Name': name}
        student_data.update({'Total': total, 'Percentage': perc})

        self.metadata_df = pd.concat([self.metadata_df, pd.DataFrame([student_data])], ignore_index=True)

        return [rollno, name, total, perc]

    def save_results(self):
        self.results_df.to_csv(PATHS['output']['results'], mode='a', header=False, index=False)
        self.metadata_df.to_csv(PATHS['output']['metadata'], mode='a', header=False, index=False)


#Main.py

logging.getLogger("pdfminer").setLevel(logging.ERROR)

def evaluate_student(pdf_path, rollno, name, start_page=1):
    # Initialize components
    processor = PDFProcessor(start_page)
    parser = AnswerParser()
    scorer = ScoringEngine()
    reporter = ResultGenerator()

    # Process PDF
    raw_text = processor.extract_text(pdf_path)
    # clean_text = parser.preprocess_pdf_text(raw_text)  # Static method call
    answers = parser.parse(pdf_path)  # Instance method

    # Score answers with detailed logging
    scores = {}
    for q_num, answer_text in answers.items():
        try:
            # answer_text = answer_data['answer']
            score = scorer.calculate_total_score(answer_text, q_num)
            print(f"Q{q_num}: {score:.2f}/1.0")  # Print individual scores
            scores[q_num] = score
        except KeyError as e:
            print(f"\n⚠️ Skipping Q{q_num}: {str(e)}")
        except Exception as e:
            print(f"\n❌ Error processing Q{q_num}: {str(e)}")

    # Generate reports
    reporter.add_student(rollno, name, scores)
    reporter.save_results()
    total = sum(scores.values())
    max_score = len(answers)
    print(f"\nProcessed {name} ({rollno}). Total: {total}/{max_score} ({total/max_score*100:.2f}%)")



def colab_evaluate_student():
    # Input parameters
    rollno = input("Enter Roll Number: ")
    name = input("Enter Student Name: ")
    start_page = int(input("Start page [1]: ") or 1)

    # try:
        # Verify PDF path
    pdf_path = PATHS['data']
    if not os.path.isfile(pdf_path):
      raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Process evaluation
    evaluate_student(pdf_path, rollno, name, start_page)
    print("\n✅ Evaluation Complete!")

    # except Exception as e:
        # print(f"\n❌ Error: {str(e)}")


colab_evaluate_student()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System
Enter Roll Number: 5
Enter Student Name: E
Start page [1]: 1
Question no. 1
Grammar Score is 0.07727272727272727 out of 0.1
Keyword Score is 0.4 out of 0.4
Semantic Score is 0.5000000596046448 out of 0.5
Q1: 0.98/1.0
Question no. 2
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.5 out of 0.5
Semantic Score is 0.3 out of 0.3
Q2: 1.00/1.0
Question no. 3
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.3333333333333333 out of 0.5
Semantic Score is 0.29999996423721315 out of 0.3
Q3: 0.83/1.0
Question no. 4
Grammar Score is 0.16363636363636364 out of 0.2
Keyword Score is 0.25 out of 0.5
Semantic

# Test 2

Using pdf that is same as the previous one with no errors generated to text but having large extra_spaces

In [None]:
!pip install pymupdf pdfplumber pandas numpy easyocr pdf2image nltk scikit-learn sentence-transformers flask reportlab textblob
!pip install --upgrade pymupdf
import fitz
import pdfplumber
import re
import os
import logging
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import os
import argparse
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/

PATHS = {
    # Core datasets
    'questions': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/QuestionAnswersDataSet.csv',
    'keywords': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Keywords.csv',
    'weights': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Evaluation_Weightage.csv',

    # Output files
    'output': {
        'results': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/results.csv',
        'metadata': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/result_metadata.csv'
    },

    # PDF input
    'data': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/Test/test_perfect_refined.pdf'
}
import fitz
class PDFProcessor:
    def __init__(self, start_page=1):
        self.start_page = start_page - 1  # 0-based index
        logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_text(self, pdf_path):
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(self.start_page, len(doc)):
            full_text += doc[page_num].get_text()
        return full_text


import pdfplumber
import re

class AnswerParser:
    def __init__(self):
        # Comprehensive question pattern
        self.question_pattern = re.compile(
            r'(?:^|\n)(?:Q|Question|Problem|\d+)[\s.)-]*\s*(\d+)[\s:)-]*',
            re.IGNORECASE
        )

    def parse(self, pdf_path):
        """Parse PDF into {q_num: answer_text} using layout analysis"""
        answers = {}
        current_q = None
        answer_buffer = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text with layout preservation
                text = page.extract_text(layout=True, x_tolerance=5, y_tolerance=2)

                # Process text in order of appearance
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Check for new question
                    match = self.question_pattern.search(line)
                    if match:
                        # Save previous answer
                        if current_q is not None:
                            answers[current_q] = ' '.join(answer_buffer).strip()
                            answer_buffer = []

                        current_q = int(match.group(1))
                        answer_start = match.end()
                        answer_buffer.append(line[answer_start:].strip())
                    else:
                        # Continue current answer
                        if current_q is not None:
                            answer_buffer.append(line)

        # Save last answer
        if current_q is not None:
            answers[current_q] = ' '.join(answer_buffer).strip()

        return self._merge_split_answers(answers)

    def _merge_split_answers(self, answers):
        """Merge answers split across pages/pages"""
        merged = {}
        sorted_qs = sorted(answers.keys())

        for q_num in sorted_qs:
            answer = answers[q_num]

            # Check if previous answer ends with continuation marker
            if q_num-1 in merged and merged[q_num-1].endswith((' ', '-')):
                merged[q_num-1] += ' ' + answer
            else:
                merged[q_num] = answer

        return merged

import os
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class ScoringEngine:
    def __init__(self):
        #Initialize with paths from config
        self.weights = self._load_data('weights')
        self.keywords = self._load_data('keywords')
        self.teacher_answers = self._load_answers('questions')
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_data(self, path_key):
        path = PATHS[path_key]
        if not path or not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path, encoding='latin1')

    def _load_answers(self, path_key):
        #Specialized answer loader with column renaming
        df = self._load_data(path_key)
        return df.rename(columns={'Question Number': 'Question_Number'})[['Question_Number', 'Answer']]#We are only interested in these two columns
        #Ensure the column names are consistent with the rest of the code

    def _get_weights(self, q_num):
        matches = self.weights[self.weights['question'] == q_num]
        if matches.empty:
            raise KeyError(f"No weights found for Q{q_num}")  # Changed to KeyError
        return matches.iloc[0]

    def _get_model_answer(self, q_num):
        matches = self.teacher_answers[self.teacher_answers['Question_Number'] == q_num]
        if matches.empty:
            raise KeyError(f"No model answer found for Q{q_num}")  # Changed to KeyError
        return matches['Answer'].values[0]

    # def _get_keywords(self, q_num):
    #     #Retrieve and clean keywords
    #     keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
    #     return [
    #         str(kw).strip().lower()
    #         for col in keywords.values
    #         for kw in col
    #         if pd.notna(kw) and str(kw).strip()
    #     ]

    def _get_keywords(self, q_num):
        keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
        return [
            ' '.join(str(kw).strip().lower().split())
            for col in keywords.values
            for kw in col
            if pd.notna(kw) and str(kw).strip()
        ]


    def calculate_grammar_score(self, text):
        #Calculate grammar score with error normalization
        if not text.strip():
            return 0.0

        blob = TextBlob(text)
        total_words = len(blob.words)
        if total_words == 0:
            return 0.0

        error_count = sum(
            1 for word in blob.words
            if word.spellcheck()[0][0].lower() != word.lower()
        )
        return 1 - (error_count / total_words)

    # def calculate_keyword_score(self, student_answer, q_num):
    #     #Calculate keyword match score
    #     target_keywords = self._get_keywords(q_num)
    #     if not target_keywords:
    #         return 0.0

    #     student_words = student_answer.lower().split()
    #     matches = sum(1 for kw in target_keywords if any(kw in word for word in student_words))
    #     return matches / len(target_keywords)

    def calculate_keyword_score(self, student_answer, q_num):
        target_keywords = self._get_keywords(q_num)
        if not target_keywords:
            return 0.0

        # Split student answer into words
        student_words = student_answer.lower().split()

        # Convert keywords to word sequences
        keyword_phrases = [kw.split() for kw in target_keywords]

        matches = 0
        for phrase in keyword_phrases:
            # Check for phrase using sliding window
            phrase_length = len(phrase)
            found = False
            for i in range(len(student_words) - phrase_length + 1):
                if student_words[i:i+phrase_length] == phrase:
                    found = True
                    break
            if found:
                matches += 1

        return matches / len(target_keywords)

    def calculate_semantic_score(self, student_answer, q_num):
        #Calculate semantic similarity score
        model_answer = self._get_model_answer(q_num)
        embeddings = self.model.encode([model_answer, student_answer])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def calculate_total_score(self, student_answer, q_num):
        weights = self._get_weights(q_num)

        # Already correct - weights are normalized to 0-1 scale

        g = self.calculate_grammar_score(student_answer) * (weights['grammarwt']/100)
        k = self.calculate_keyword_score(student_answer, q_num) * (weights['Keywordswt']/100)
        s = self.calculate_semantic_score(student_answer, q_num) * (weights['similarityWt']/100)


        print(f'Question no. {q_num}')
        print(f'Grammar Score is {g} out of {weights["grammarwt"]/100.0}')
        print(f'Keyword Score is {k} out of {weights["Keywordswt"]/100.0}')
        print(f'Semantic Score is {s} out of {weights["similarityWt"]/100.0}')

        return (g + k + s)

import pandas as pd


class ResultGenerator:
    def __init__(self):
        self.results_df = pd.DataFrame(columns=['RollNo', 'Name', 'Total', 'Percentage'])
        self.metadata_df = pd.DataFrame()

    def add_student(self, rollno, name, scores):
        # Add default 0 scores for missing questions
        for q_num in range(1, 201):
            if q_num not in scores:
                scores[q_num] = 0

        # Calculate total and percentage CORRECTLY
        max_possible = 200  # 200 questions 
        total = sum(scores.values())
        perc = (total / max_possible) * 100  # Fixed here

        self.results_df.loc[len(self.results_df)] = [rollno, name, total, perc]

        # Update metadata
        student_data = {'RollNo': rollno, 'Name': name}
        student_data.update({'Total': total, 'Percentage': perc})

        self.metadata_df = pd.concat([self.metadata_df, pd.DataFrame([student_data])], ignore_index=True)

        return [rollno, name, total, perc]

    def save_results(self):
        self.results_df.to_csv(PATHS['output']['results'], mode='a', header=False, index=False)
        self.metadata_df.to_csv(PATHS['output']['metadata'], mode='a', header=False, index=False)


#Main.py

logging.getLogger("pdfminer").setLevel(logging.ERROR)

def evaluate_student(pdf_path, rollno, name, start_page=1):
    # Initialize components
    processor = PDFProcessor(start_page)
    parser = AnswerParser()
    scorer = ScoringEngine()
    reporter = ResultGenerator()

    # Process PDF
    raw_text = processor.extract_text(pdf_path)
    # clean_text = parser.preprocess_pdf_text(raw_text)  # Static method call
    answers = parser.parse(pdf_path)  # Instance method

    # Score answers with detailed logging
    scores = {}
    for q_num, answer_text in answers.items():
        try:
            # answer_text = answer_data['answer']
            score = scorer.calculate_total_score(answer_text, q_num)
            print(f"Q{q_num}: {score:.2f}/1.0")  # Print individual scores
            scores[q_num] = score
        except KeyError as e:
            print(f"\n⚠️ Skipping Q{q_num}: {str(e)}")
        except Exception as e:
            print(f"\n❌ Error processing Q{q_num}: {str(e)}")

    # Generate reports
    reporter.add_student(rollno, name, scores)
    reporter.save_results()
    total = sum(scores.values())
    max_score = len(answers)
    print(f"\nProcessed {name} ({rollno}). Total: {total}/{max_score} ({total/max_score*100:.2f}%)")



def colab_evaluate_student():
    # Input parameters
    rollno = input("Enter Roll Number: ")
    name = input("Enter Student Name: ")
    start_page = int(input("Start page [1]: ") or 1)

    # try:
        # Verify PDF path
    pdf_path = PATHS['data']
    if not os.path.isfile(pdf_path):
      raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Process evaluation
    evaluate_student(pdf_path, rollno, name, start_page)
    print("\n✅ Evaluation Complete!")

    # except Exception as e:
        # print(f"\n❌ Error: {str(e)}")


colab_evaluate_student()

Enter Roll Number: 6
Enter Student Name: F
Start page [1]: 1
Question no. 1
Grammar Score is 0.07727272727272727 out of 0.1
Keyword Score is 0.4 out of 0.4
Semantic Score is 0.5000000596046448 out of 0.5
Q1: 0.98/1.0
Question no. 2
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.5 out of 0.5
Semantic Score is 0.3 out of 0.3
Q2: 1.00/1.0
Question no. 3
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.3333333333333333 out of 0.5
Semantic Score is 0.29999996423721315 out of 0.3
Q3: 0.83/1.0
Question no. 4
Grammar Score is 0.16363636363636364 out of 0.2
Keyword Score is 0.25 out of 0.5
Semantic Score is 0.29999996423721315 out of 0.3
Q4: 0.71/1.0
Question no. 5
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.08333333333333333 out of 0.5
Semantic Score is 0.3 out of 0.3
Q5: 0.58/1.0
Question no. 6
Grammar Score is 0.16666666666666669 out of 0.2
Keyword Score is 0.4166666666666667 out of 0.5
Semantic Score is 0.29999998211860657 out of 0.3
Q6: 0.88/1.0
Question no. 7
Grammar Score is

# Test 3

More hurdles added, text is misalligned Question headings are non uniform and Answering order is changed 1 can be answered after answering Q5

In [None]:
!pip install pymupdf pdfplumber pandas numpy easyocr pdf2image nltk scikit-learn sentence-transformers flask reportlab textblob
!pip install --upgrade pymupdf
import fitz
import pdfplumber
import re
import os
import logging
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import os
import argparse
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/

PATHS = {
    # Core datasets
    'questions': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/QuestionAnswersDataSet.csv',
    'keywords': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Keywords.csv',
    'weights': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Evaluation_Weightage.csv',

    # Output files
    'output': {
        'results': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/results.csv',
        'metadata': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/result_metadata.csv'
    },

    # PDF input
    'data': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/Test/test_anomalous (2).pdf'
}
import fitz
class PDFProcessor:
    def __init__(self, start_page=1):
        self.start_page = start_page - 1  # 0-based index
        logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_text(self, pdf_path):
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(self.start_page, len(doc)):
            full_text += doc[page_num].get_text()
        return full_text


import pdfplumber
import re

class AnswerParser:
    def __init__(self):
        # Comprehensive question pattern
        self.question_pattern = re.compile(
            r'(?:^|\n)(?:Q|Question|Problem|\d+)[\s.)-]*\s*(\d+)[\s:)-]*',
            re.IGNORECASE
        )

    def parse(self, pdf_path):
        """Parse PDF into {q_num: answer_text} using layout analysis"""
        answers = {}
        current_q = None
        answer_buffer = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text with layout preservation
                text = page.extract_text(layout=True, x_tolerance=5, y_tolerance=2)

                # Process text in order of appearance
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Check for new question
                    match = self.question_pattern.search(line)
                    if match:
                        # Save previous answer
                        if current_q is not None:
                            answers[current_q] = ' '.join(answer_buffer).strip()
                            answer_buffer = []

                        current_q = int(match.group(1))
                        answer_start = match.end()
                        answer_buffer.append(line[answer_start:].strip())
                    else:
                        # Continue current answer
                        if current_q is not None:
                            answer_buffer.append(line)

        # Save last answer
        if current_q is not None:
            answers[current_q] = ' '.join(answer_buffer).strip()

        return self._merge_split_answers(answers)

    def _merge_split_answers(self, answers):
        """Merge answers split across pages/pages"""
        merged = {}
        sorted_qs = sorted(answers.keys())

        for q_num in sorted_qs:
            answer = answers[q_num]

            # Check if previous answer ends with continuation marker
            if q_num-1 in merged and merged[q_num-1].endswith((' ', '-')):
                merged[q_num-1] += ' ' + answer
            else:
                merged[q_num] = answer

        return merged

import os
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class ScoringEngine:
    def __init__(self):
        #Initialize with paths from config
        self.weights = self._load_data('weights')
        self.keywords = self._load_data('keywords')
        self.teacher_answers = self._load_answers('questions')
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_data(self, path_key):
        path = PATHS[path_key]
        if not path or not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path, encoding='latin1')

    def _load_answers(self, path_key):
        #Specialized answer loader with column renaming
        df = self._load_data(path_key)
        return df.rename(columns={'Question Number': 'Question_Number'})[['Question_Number', 'Answer']]#We are only interested in these two columns
        #Ensure the column names are consistent with the rest of the code

    def _get_weights(self, q_num):
        matches = self.weights[self.weights['question'] == q_num]
        if matches.empty:
            raise KeyError(f"No weights found for Q{q_num}")  # Changed to KeyError
        return matches.iloc[0]

    def _get_model_answer(self, q_num):
        matches = self.teacher_answers[self.teacher_answers['Question_Number'] == q_num]
        if matches.empty:
            raise KeyError(f"No model answer found for Q{q_num}")  # Changed to KeyError
        return matches['Answer'].values[0]

    # def _get_keywords(self, q_num):
    #     #Retrieve and clean keywords
    #     keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
    #     return [
    #         str(kw).strip().lower()
    #         for col in keywords.values
    #         for kw in col
    #         if pd.notna(kw) and str(kw).strip()
    #     ]

    def _get_keywords(self, q_num):
        keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
        return [
            ' '.join(str(kw).strip().lower().split())
            for col in keywords.values
            for kw in col
            if pd.notna(kw) and str(kw).strip()
        ]


    def calculate_grammar_score(self, text):
        #Calculate grammar score with error normalization
        if not text.strip():
            return 0.0

        blob = TextBlob(text)
        total_words = len(blob.words)
        if total_words == 0:
            return 0.0

        error_count = sum(
            1 for word in blob.words
            if word.spellcheck()[0][0].lower() != word.lower()
        )
        return 1 - (error_count / total_words)

    # def calculate_keyword_score(self, student_answer, q_num):
    #     #Calculate keyword match score
    #     target_keywords = self._get_keywords(q_num)
    #     if not target_keywords:
    #         return 0.0

    #     student_words = student_answer.lower().split()
    #     matches = sum(1 for kw in target_keywords if any(kw in word for word in student_words))
    #     return matches / len(target_keywords)

    def calculate_keyword_score(self, student_answer, q_num):
        target_keywords = self._get_keywords(q_num)
        if not target_keywords:
            return 0.0

        # Split student answer into words
        student_words = student_answer.lower().split()

        # Convert keywords to word sequences
        keyword_phrases = [kw.split() for kw in target_keywords]

        matches = 0
        for phrase in keyword_phrases:
            # Check for phrase using sliding window
            phrase_length = len(phrase)
            found = False
            for i in range(len(student_words) - phrase_length + 1):
                if student_words[i:i+phrase_length] == phrase:
                    found = True
                    break
            if found:
                matches += 1

        return matches / len(target_keywords)

    def calculate_semantic_score(self, student_answer, q_num):
        #Calculate semantic similarity score
        model_answer = self._get_model_answer(q_num)
        embeddings = self.model.encode([model_answer, student_answer])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def calculate_total_score(self, student_answer, q_num):
        weights = self._get_weights(q_num)

        # Already correct - weights are normalized to 0-1 scale

        g = self.calculate_grammar_score(student_answer) * (weights['grammarwt']/100)
        k = self.calculate_keyword_score(student_answer, q_num) * (weights['Keywordswt']/100)
        s = self.calculate_semantic_score(student_answer, q_num) * (weights['similarityWt']/100)


        print(f'Question no. {q_num}')
        print(f'Grammar Score is {g} out of {weights["grammarwt"]/100.0}')
        print(f'Keyword Score is {k} out of {weights["Keywordswt"]/100.0}')
        print(f'Semantic Score is {s} out of {weights["similarityWt"]/100.0}')

        return (g + k + s)

import pandas as pd


class ResultGenerator:
    def __init__(self):
        self.results_df = pd.DataFrame(columns=['RollNo', 'Name', 'Total', 'Percentage'])
        self.metadata_df = pd.DataFrame()

    def add_student(self, rollno, name, scores):
        # Add default 0 scores for missing questions
        for q_num in range(1, 201):
            if q_num not in scores:
                scores[q_num] = 0

        # Calculate total and percentage CORRECTLY
        max_possible = 200  # 200 questions
        total = sum(scores.values())
        perc = (total / max_possible) * 100  # Fixed here

        self.results_df.loc[len(self.results_df)] = [rollno, name, total, perc]

        # Update metadata
        student_data = {'RollNo': rollno, 'Name': name}
        student_data.update({'Total': total, 'Percentage': perc})

        self.metadata_df = pd.concat([self.metadata_df, pd.DataFrame([student_data])], ignore_index=True)

        return [rollno, name, total, perc]

    def save_results(self):
        self.results_df.to_csv(PATHS['output']['results'], mode='a', header=False, index=False)
        self.metadata_df.to_csv(PATHS['output']['metadata'], mode='a', header=False, index=False)


#Main.py

logging.getLogger("pdfminer").setLevel(logging.ERROR)

def evaluate_student(pdf_path, rollno, name, start_page=1):
    # Initialize components
    processor = PDFProcessor(start_page)
    parser = AnswerParser()
    scorer = ScoringEngine()
    reporter = ResultGenerator()

    # Process PDF
    raw_text = processor.extract_text(pdf_path)
    # clean_text = parser.preprocess_pdf_text(raw_text)  # Static method call
    answers = parser.parse(pdf_path)  # Instance method

    # Score answers with detailed logging
    scores = {}
    for q_num, answer_text in answers.items():
        try:
            # answer_text = answer_data['answer']
            score = scorer.calculate_total_score(answer_text, q_num)
            print(f"Q{q_num}: {score:.2f}/1.0")  # Print individual scores
            scores[q_num] = score
        except KeyError as e:
            print(f"\n⚠️ Skipping Q{q_num}: {str(e)}")
        except Exception as e:
            print(f"\n❌ Error processing Q{q_num}: {str(e)}")

    # Generate reports
    reporter.add_student(rollno, name, scores)
    reporter.save_results()
    total = sum(scores.values())
    max_score = len(answers)
    print(f"\nProcessed {name} ({rollno}). Total: {total}/{max_score} ({total/max_score*100:.2f}%)")



def colab_evaluate_student():
    # Input parameters
    rollno = input("Enter Roll Number: ")
    name = input("Enter Student Name: ")
    start_page = int(input("Start page [1]: ") or 1)

    # try:
        # Verify PDF path
    pdf_path = PATHS['data']
    if not os.path.isfile(pdf_path):
      raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Process evaluation
    evaluate_student(pdf_path, rollno, name, start_page)
    print("\n✅ Evaluation Complete!")

    # except Exception as e:
        # print(f"\n❌ Error: {str(e)}")


colab_evaluate_student()

/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System
Enter Roll Number: 7
Enter Student Name: G
Start page [1]: 1

⚠️ Skipping Q0: 'No weights found for Q0'
Question no. 1
Grammar Score is 0.1 out of 0.1
Keyword Score is 0.0 out of 0.4
Semantic Score is 0.1341467797756195 out of 0.5
Q1: 0.23/1.0
Question no. 2
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.5 out of 0.5
Semantic Score is 0.3 out of 0.3
Q2: 1.00/1.0
Question no. 3
Grammar Score is 0.16923076923076924 out of 0.2
Keyword Score is 0.0 out of 0.5
Semantic Score is 0.018051613122224808 out of 0.3
Q3: 0.19/1.0
Question no. 4
Grammar Score is 0.16363636363636364 out of 0.2
Keyword Score is 0.0 out of 0.5
Semantic Score is 0.007958633825182915 out of 0.3
Q4: 0.17/1.0
Question no. 5
Grammar Score is 0.17500000000000002 out of 0.2
Keyword Score is 0.0 out of 0.5
Semantic Score is 0.045598626136779785 out of 0.3
Q5: 0.22/1.0
Question no. 6
Grammar Score is 0.17500000000000002 out of 0.2
Keyword Score is 0.0 out

# Test 4

The most erronous pdf. It even has errors in answers's text that is keywords missing, grammatical error, non-similarity etc.

In [None]:
!pip install pymupdf pdfplumber pandas numpy easyocr pdf2image nltk scikit-learn sentence-transformers flask reportlab textblob
!pip install --upgrade pymupdf
import fitz
import pdfplumber
import re
import os
import logging
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import os
import argparse
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Project/Project/Answer\ Sheet\ Evaluation\ System/

PATHS = {
    # Core datasets
    'questions': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/QuestionAnswersDataSet.csv',
    'keywords': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Keywords.csv',
    'weights': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/data/Evaluation_Weightage.csv',

    # Output files
    'output': {
        'results': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/results.csv',
        'metadata': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/results/result_metadata.csv'
    },

    # PDF input
    'data': '/content/drive/MyDrive/Project/Project/Answer Sheet Evaluation System/Test/test_anomalous_refined (1).pdf'
}
import fitz
class PDFProcessor:
    def __init__(self, start_page=1):
        self.start_page = start_page - 1  # 0-based index
        logging.getLogger("pdfminer").setLevel(logging.ERROR)

    def extract_text(self, pdf_path):
        doc = fitz.open(pdf_path)
        full_text = ""
        for page_num in range(self.start_page, len(doc)):
            full_text += doc[page_num].get_text()
        return full_text


import pdfplumber
import re

class AnswerParser:
    def __init__(self):
        # Comprehensive question pattern
        self.question_pattern = re.compile(
            r'(?:^|\n)(?:Q|Question|Problem|\d+)[\s.)-]*\s*(\d+)[\s:)-]*',
            re.IGNORECASE
        )

    def parse(self, pdf_path):
        """Parse PDF into {q_num: answer_text} using layout analysis"""
        answers = {}
        current_q = None
        answer_buffer = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text with layout preservation
                text = page.extract_text(layout=True, x_tolerance=5, y_tolerance=2)

                # Process text in order of appearance
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue

                    # Check for new question
                    match = self.question_pattern.search(line)
                    if match:
                        # Save previous answer
                        if current_q is not None:
                            answers[current_q] = ' '.join(answer_buffer).strip()
                            answer_buffer = []

                        current_q = int(match.group(1))
                        answer_start = match.end()
                        answer_buffer.append(line[answer_start:].strip())
                    else:
                        # Continue current answer
                        if current_q is not None:
                            answer_buffer.append(line)

        # Save last answer
        if current_q is not None:
            answers[current_q] = ' '.join(answer_buffer).strip()

        return self._merge_split_answers(answers)

    def _merge_split_answers(self, answers):
        """Merge answers split across pages/pages"""
        merged = {}
        sorted_qs = sorted(answers.keys())

        for q_num in sorted_qs:
            answer = answers[q_num]

            # Check if previous answer ends with continuation marker
            if q_num-1 in merged and merged[q_num-1].endswith((' ', '-')):
                merged[q_num-1] += ' ' + answer
            else:
                merged[q_num] = answer

        return merged

import os
import pandas as pd
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class ScoringEngine:
    def __init__(self):
        #Initialize with paths from config
        self.weights = self._load_data('weights')
        self.keywords = self._load_data('keywords')
        self.teacher_answers = self._load_answers('questions')
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_data(self, path_key):
        path = PATHS[path_key]
        if not path or not os.path.exists(path):
            raise FileNotFoundError(f"Path not found: {path}")
        return pd.read_csv(path, encoding='latin1')

    def _load_answers(self, path_key):
        #Specialized answer loader with column renaming
        df = self._load_data(path_key)
        return df.rename(columns={'Question Number': 'Question_Number'})[['Question_Number', 'Answer']]#We are only interested in these two columns
        #Ensure the column names are consistent with the rest of the code

    def _get_weights(self, q_num):
        matches = self.weights[self.weights['question'] == q_num]
        if matches.empty:
            raise KeyError(f"No weights found for Q{q_num}")  # Changed to KeyError
        return matches.iloc[0]

    def _get_model_answer(self, q_num):
        matches = self.teacher_answers[self.teacher_answers['Question_Number'] == q_num]
        if matches.empty:
            raise KeyError(f"No model answer found for Q{q_num}")  # Changed to KeyError
        return matches['Answer'].values[0]

    # def _get_keywords(self, q_num):
    #     #Retrieve and clean keywords
    #     keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
    #     return [
    #         str(kw).strip().lower()
    #         for col in keywords.values
    #         for kw in col
    #         if pd.notna(kw) and str(kw).strip()
    #     ]

    def _get_keywords(self, q_num):
        keywords = self.keywords[self.keywords['Question Number'] == q_num].iloc[:, 1:]
        return [
            ' '.join(str(kw).strip().lower().split())
            for col in keywords.values
            for kw in col
            if pd.notna(kw) and str(kw).strip()
        ]


    def calculate_grammar_score(self, text):
        #Calculate grammar score with error normalization
        if not text.strip():
            return 0.0

        blob = TextBlob(text)
        total_words = len(blob.words)
        if total_words == 0:
            return 0.0

        error_count = sum(
            1 for word in blob.words
            if word.spellcheck()[0][0].lower() != word.lower()
        )
        return 1 - (error_count / total_words)

    # def calculate_keyword_score(self, student_answer, q_num):
    #     #Calculate keyword match score
    #     target_keywords = self._get_keywords(q_num)
    #     if not target_keywords:
    #         return 0.0

    #     student_words = student_answer.lower().split()
    #     matches = sum(1 for kw in target_keywords if any(kw in word for word in student_words))
    #     return matches / len(target_keywords)

    def calculate_keyword_score(self, student_answer, q_num):
        target_keywords = self._get_keywords(q_num)
        if not target_keywords:
            return 0.0

        # Split student answer into words
        student_words = student_answer.lower().split()

        # Convert keywords to word sequences
        keyword_phrases = [kw.split() for kw in target_keywords]

        matches = 0
        for phrase in keyword_phrases:
            # Check for phrase using sliding window
            phrase_length = len(phrase)
            found = False
            for i in range(len(student_words) - phrase_length + 1):
                if student_words[i:i+phrase_length] == phrase:
                    found = True
                    break
            if found:
                matches += 1

        return matches / len(target_keywords)

    def calculate_semantic_score(self, student_answer, q_num):
        #Calculate semantic similarity score
        model_answer = self._get_model_answer(q_num)
        embeddings = self.model.encode([model_answer, student_answer])
        return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    def calculate_total_score(self, student_answer, q_num):
        weights = self._get_weights(q_num)

        # Already correct - weights are normalized to 0-1 scale

        g = self.calculate_grammar_score(student_answer) * (weights['grammarwt']/100)
        k = self.calculate_keyword_score(student_answer, q_num) * (weights['Keywordswt']/100)
        s = self.calculate_semantic_score(student_answer, q_num) * (weights['similarityWt']/100)


        print(f'Question no. {q_num}')
        print(f'Grammar Score is {g} out of {weights["grammarwt"]/100.0}')
        print(f'Keyword Score is {k} out of {weights["Keywordswt"]/100.0}')
        print(f'Semantic Score is {s} out of {weights["similarityWt"]/100.0}')

        return (g + k + s)

import pandas as pd


class ResultGenerator:
    def __init__(self):
        self.results_df = pd.DataFrame(columns=['RollNo', 'Name', 'Total', 'Percentage'])
        self.metadata_df = pd.DataFrame()

    def add_student(self, rollno, name, scores):
        # Add default 0 scores for missing questions
        for q_num in range(1, 201):
            if q_num not in scores:
                scores[q_num] = 0

        # Calculate total and percentage CORRECTLY
        max_possible = 200  # 200 questions 
        total = sum(scores.values())
        perc = (total / max_possible) * 100  # Fixed here

        self.results_df.loc[len(self.results_df)] = [rollno, name, total, perc]

        # Update metadata
        student_data = {'RollNo': rollno, 'Name': name}
        student_data.update({'Total': total, 'Percentage': perc})

        self.metadata_df = pd.concat([self.metadata_df, pd.DataFrame([student_data])], ignore_index=True)

        return [rollno, name, total, perc]

    def save_results(self):
        self.results_df.to_csv(PATHS['output']['results'], mode='a', header=False, index=False)
        self.metadata_df.to_csv(PATHS['output']['metadata'], mode='a', header=False, index=False)


#Main.py

logging.getLogger("pdfminer").setLevel(logging.ERROR)

def evaluate_student(pdf_path, rollno, name, start_page=1):
    # Initialize components
    processor = PDFProcessor(start_page)
    parser = AnswerParser()
    scorer = ScoringEngine()
    reporter = ResultGenerator()

    # Process PDF
    raw_text = processor.extract_text(pdf_path)
    # clean_text = parser.preprocess_pdf_text(raw_text)  # Static method call
    answers = parser.parse(pdf_path)  # Instance method

    # Score answers with detailed logging
    scores = {}
    for q_num, answer_text in answers.items():
        try:
            # answer_text = answer_data['answer']
            score = scorer.calculate_total_score(answer_text, q_num)
            print(f"Q{q_num}: {score:.2f}/1.0")  # Print individual scores
            scores[q_num] = score
        except KeyError as e:
            print(f"\n⚠️ Skipping Q{q_num}: {str(e)}")
        except Exception as e:
            print(f"\n❌ Error processing Q{q_num}: {str(e)}")

    # Generate reports
    reporter.add_student(rollno, name, scores)
    reporter.save_results()
    total = sum(scores.values())
    max_score = len(answers)
    print(f"\nProcessed {name} ({rollno}). Total: {total}/{max_score} ({total/max_score*100:.2f}%)")



def colab_evaluate_student():
    # Input parameters
    rollno = input("Enter Roll Number: ")
    name = input("Enter Student Name: ")
    start_page = int(input("Start page [1]: ") or 1)

    # try:
        # Verify PDF path
    pdf_path = PATHS['data']
    if not os.path.isfile(pdf_path):
      raise FileNotFoundError(f"PDF not found: {pdf_path}")

        # Process evaluation
    evaluate_student(pdf_path, rollno, name, start_page)
    print("\n✅ Evaluation Complete!")

    # except Exception as e:
        # print(f"\n❌ Error: {str(e)}")


colab_evaluate_student()

Enter Roll Number: 8
Enter Student Name: H
Start page [1]: 1
Question no. 1
Grammar Score is 0.07727272727272727 out of 0.1
Keyword Score is 0.4 out of 0.4
Semantic Score is 0.5000000596046448 out of 0.5
Q1: 0.98/1.0
Question no. 2
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.5 out of 0.5
Semantic Score is 0.3 out of 0.3
Q2: 1.00/1.0
Question no. 3
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.3333333333333333 out of 0.5
Semantic Score is 0.29999996423721315 out of 0.3
Q3: 0.83/1.0
Question no. 4
Grammar Score is 0.16363636363636364 out of 0.2
Keyword Score is 0.25 out of 0.5
Semantic Score is 0.29999996423721315 out of 0.3
Q4: 0.71/1.0
Question no. 5
Grammar Score is 0.2 out of 0.2
Keyword Score is 0.08333333333333333 out of 0.5
Semantic Score is 0.3 out of 0.3
Q5: 0.58/1.0
Question no. 6
Grammar Score is 0.16666666666666669 out of 0.2
Keyword Score is 0.4166666666666667 out of 0.5
Semantic Score is 0.29999998211860657 out of 0.3
Q6: 0.88/1.0
Question no. 8
Grammar Score is