In [None]:
# Environment setup (Colab).
# System packages first: poppler-utils backs pdf2image and tesseract-ocr backs
# pytesseract. -y keeps apt from stopping at a confirmation prompt.
!apt-get install -y -q poppler-utils tesseract-ocr
# The PyPI name `sklearn` is a deprecated placeholder whose build fails; the
# real distribution is `scikit-learn`, so only that name is installed, once.
# torch, xgboost and pandas are imported by the grading cell and are listed
# here so the notebook does not depend on them being preinstalled.
%pip install -q scikit-learn pytesseract pymupdf pdf2image spacy transformers torch xgboost pandas openpyxl
!python -m spacy download en_core_web_sm


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_

In [None]:
import fitz
import spacy
import pandas as pd
from google.colab import files
import os
import torch
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from xgboost import XGBRegressor
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import re

# Shared language resources, loaded once up front: the small English spaCy
# pipeline and the pretrained BERT tokenizer/encoder pair. Both BERT objects
# come from the same checkpoint, named in a single place.
BERT_CHECKPOINT = 'bert-base-uncased'

nlp = spacy.load("en_core_web_sm")
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Preprocess text
def preprocess(text):
    """Normalise text for comparison.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space), then the result is lower-cased.
    """
    letters_digits_spaces = re.compile(r"[^a-zA-Z0-9\s]").sub("", text)
    return letters_digits_spaces.lower()

# Extract text from scanned PDFs using OCR
def extract_text_from_scanned_pdf(pdf_path, dpi=300):
    """OCR every page of a scanned PDF and return the concatenated text.

    Args:
        pdf_path: Path of the PDF to rasterise.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults to
            300, the value that was previously hard-coded.

    Returns:
        The OCR output of all pages joined in page order, with nothing
        inserted between pages.
    """
    pages = convert_from_path(pdf_path, dpi)
    # join() builds the result once instead of re-copying a growing string
    # for every page.
    return "".join(pytesseract.image_to_string(page) for page in pages)

# Extract text from normal PDF (answer key)
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of a document, one page after another.

    Args:
        pdf_path: Path of the file to open with PyMuPDF (``fitz``).

    Returns:
        The text of every page, joined with a newline between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

# BERT similarity
def compute_bert_similarity(text1, text2):
    """Return the cosine similarity of two texts' BERT embeddings, times 100.

    Each text is tokenized (truncated to at most 512 tokens), passed through
    ``bert_model`` without gradient tracking, and represented by the mean of
    the last hidden layer over the token axis.
    """
    def _mean_pooled(text):
        encoded = bert_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        with torch.no_grad():
            hidden = bert_model(**encoded).last_hidden_state
        return hidden.mean(dim=1).squeeze()

    vec_a = _mean_pooled(text1)
    vec_b = _mean_pooled(text2)
    cosine = torch.nn.functional.cosine_similarity(vec_a, vec_b, dim=0)
    return cosine.item() * 100

# LDA similarity
def lda_topic_similarity(text1, text2, n_topics=2):
    """Compare the LDA topic mixtures of two texts on a 0-100 scale.

    An LDA model is fitted on the two texts and the similarity is
    ``1 - total variation distance`` between their topic distributions.
    For two topics this is exactly ``1 - |p0 - q0|``, the expression used
    previously. The previous single-topic model gave every document the
    distribution ``[1.0]`` and therefore always reported 100.

    Args:
        text1: First text.
        text2: Second text.
        n_topics: Number of LDA topics; must be at least 2 for the
            comparison to carry any information.

    Returns:
        0 when either text has no tokens left after English stop-word
        removal, otherwise a float in [0, 100].

    Raises:
        ValueError: If ``n_topics`` is smaller than 2.
    """
    if n_topics < 2:
        raise ValueError("n_topics must be at least 2")

    vectorizer = CountVectorizer(stop_words='english')
    analyze = vectorizer.build_analyzer()
    # Checking tokens first avoids fitting on an empty vocabulary.
    if not analyze(text1) or not analyze(text2):
        return 0

    dtm = vectorizer.fit_transform([text1, text2])
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(dtm)
    topics = lda.transform(dtm)
    # Rows of `topics` are per-document topic distributions; half of their L1
    # distance is the total variation distance, which lies in [0, 1].
    # NOTE(review): a model fitted on only two documents is presumably a
    # coarse estimate - treat this score as a weak signal.
    total_variation = 0.5 * abs(topics[0] - topics[1]).sum()
    return float((1 - total_variation) * 100)

# Grade assignment
def assign_grade(score, max_score):
    """Map a score out of ``max_score`` to a letter grade.

    The percentage is compared against descending lower bounds:
    90 -> "A+", 80 -> "A", 70 -> "B", 60 -> "C", 50 -> "D"; anything
    below 50 is "F".
    """
    percent = (score / max_score) * 100
    grade_floors = (
        (90, "A+"),
        (80, "A"),
        (70, "B"),
        (60, "C"),
        (50, "D"),
    )
    # Floors are ordered from highest to lowest, so the first match wins.
    for floor, letter in grade_floors:
        if percent >= floor:
            return letter
    return "F"

# Save to Excel
def save_grading_to_excel(student_name, student_id, marks_obtained, total_marks, grade,
                          filename="grading_results_XGBoost.xlsx"):
    """Append one student's result to an Excel gradebook and return its path.

    If ``filename`` already exists, its rows are read back and the new row is
    appended; otherwise the workbook is created with just this row.

    Args:
        student_name: Value for the "Student Name" column.
        student_id: Value for the "Student ID" column.
        marks_obtained: Value for the "Marks Obtained" column.
        total_marks: Value for the "Total Marks" column.
        grade: Value for the "Grade" column.
        filename: Workbook to write. Defaults to the name that was previously
            hard-coded, so existing calls behave as before.

    Returns:
        The path that was written (``filename``).
    """
    record = pd.DataFrame([{
        "Student Name": student_name,
        "Student ID": student_id,
        "Marks Obtained": marks_obtained,
        "Total Marks": total_marks,
        "Grade": grade
    }])
    if os.path.exists(filename):
        gradebook = pd.concat([pd.read_excel(filename), record], ignore_index=True)
    else:
        gradebook = record
    gradebook.to_excel(filename, index=False)
    return filename

# Upload PDFs
# The mapping returned by files.upload() is keyed by the uploaded file names.
# If nothing was uploaded (presumably when the dialog is dismissed) it is empty,
# so stop with a clear message instead of an IndexError on the first key.
print("\U0001F4C2 Upload scanned student answer script (PDF):")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No student answer script was uploaded.")
student_file = next(iter(uploaded))
student_text = extract_text_from_scanned_pdf(student_file)

print("\U0001F4C2 Upload answer key (PDF):")
uploaded = files.upload()
if not uploaded:
    raise ValueError("No answer key was uploaded.")
answer_key_file = next(iter(uploaded))
answer_text = extract_text_from_pdf(answer_key_file)

# Parse answer key
# Non-blank lines are read as alternating question / answer rows. A trailing
# unpaired line is ignored and a repeated question keeps its last answer.
# NOTE(review): an answer that spans several lines would shift every later
# pair - confirm the key always has one line per question and per answer.
lines = [stripped for stripped in map(str.strip, answer_text.split('\n')) if stripped]
answer_key = {}
for question, answer in zip(lines[0::2], lines[1::2]):
    answer_key[question] = answer

# Choose total mark (50 or 100)
# Keep asking until the reply parses as an integer and is one of the two
# supported totals.
max_score = None
while max_score not in (50, 100):
    try:
        max_score = int(input("Enter total marks (50 or 100): "))
    except ValueError:
        print("❌ Invalid input. Enter a number.")
        continue
    if max_score not in (50, 100):
        print("❌ Please enter either 50 or 100.")

# Generate features and per-question marks
#
# The earlier version labelled every question with the full per-question mark,
# fitted XGBRegressor on those rows and predicted on the same rows. With a
# constant target the prediction is that constant, so every script scored
# exactly max_score. No human-graded scripts are available here to train on,
# so marks are now derived directly from the similarity to the model answer.
# NOTE(review): to bring XGBoost back, fit it on features from scripts with
# real awarded marks and predict only on unseen scripts.
features = []
scaled_marks = []

if answer_key:
    per_question = max_score / len(answer_key)
    # The student script is the same for every question, so clean it once.
    student_clean = preprocess(student_text)
    # A script with no recognisable words (e.g. failed OCR) must not earn marks.
    student_has_text = bool(student_clean.split())

    for q, model_answer in answer_key.items():
        model_clean = preprocess(model_answer)

        bert_score = compute_bert_similarity(student_clean, model_clean)
        lda_score = lda_topic_similarity(student_clean, model_clean)

        try:
            tfidf = TfidfVectorizer()
            X = tfidf.fit_transform([model_clean, student_clean])
            tfidf_diff = abs(X[0] - X[1]).sum()
        except ValueError:
            # Raised when neither text yields a token (empty vocabulary).
            tfidf_diff = 0.0
        length_diff = abs(len(model_clean.split()) - len(student_clean.split()))

        features.append([bert_score, lda_score, tfidf_diff, length_diff])

        # Award the share of this question's marks given by the BERT cosine
        # similarity (reported on a 0-100 scale), clamped to [0, 1].
        # NOTE(review): this mapping is uncalibrated - check it against
        # manually graded scripts before relying on the grades.
        if student_has_text:
            match_fraction = min(max(bert_score / 100.0, 0.0), 1.0)
        else:
            match_fraction = 0.0
        scaled_marks.append(per_question * match_fraction)

# Total the awarded marks and grade them
if features:
    # Feature table kept for inspection and as future training input.
    df = pd.DataFrame(features, columns=["bert", "lda", "tfidf_diff", "length_diff"])
    df["marks"] = scaled_marks
    predicted_total = round(float(sum(scaled_marks)), 2)
    grade = assign_grade(predicted_total, max_score)
else:
    predicted_total = 0
    grade = "F"

# Get student info
# Surrounding whitespace is stripped so that, for example, "naveen " and
# "naveen" are stored as the same name in the gradebook.
student_name = input("Enter student name: ").strip()
student_id = input("Enter student ID: ").strip()

# Save results: append the row to the workbook, report the outcome, then
# hand the workbook to the browser for download.
result_file = save_grading_to_excel(student_name, student_id, predicted_total, max_score, grade)
print("✅ Grading complete. Score:", predicted_total, "| Grade:", grade)
files.download(result_file)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

📂 Upload scanned student answer script (PDF):


Saving stuent script.pdf to stuent script.pdf
📂 Upload answer key (PDF):


Saving answer key.docx to answer key.docx
Enter total marks (50 or 100): 50
Enter student name: naveen 
Enter student ID: 12345
✅ Grading complete. Score: 50.0 | Grade: A+


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>