In [8]:
import spacy
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [9]:
# spaCy English pipeline; preprocess_text calls it to lemmatize tokens.
nlp = spacy.load('en_core_web_lg')
# DistilGPT2 tokenizer + model used by compute_distilgpt2_similarity.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token  # Reuse EOS as the pad token so tokenizer calls with padding=True do not error
gpt_model = GPT2Model.from_pretrained('distilgpt2')
# Pretrained GoogleNews Word2Vec vectors in binary format, read from a path
# relative to the current working directory.
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [10]:
NUM_STUDENTS = 5  # Number of "Student{i} Answers" columns to grade (i = 1..NUM_STUDENTS)
SCORE_SCALE = 5   # Maximum score; the averaged similarity is multiplied by this for the final output

_STOP_WORDS_CACHE = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it from the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """
    Normalize a text cell for similarity scoring.

    Steps: lowercase, tokenize with NLTK, keep only alphanumeric tokens,
    drop English stop words, then lemmatize the remainder with spaCy.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example a
            missing cell) is treated as an empty answer.

    Returns:
        str: Space-joined lemmas, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    tokens = word_tokenize(text.lower())
    # The stop-word list is loop-invariant: fetch the cached set instead of
    # re-reading the corpus and rebuilding the set on every call.
    stop_words = _english_stop_words()
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    # Lemmatize after filtering so spaCy only processes the retained tokens.
    doc = nlp(" ".join(tokens))
    return " ".join(token.lemma_ for token in doc)

def compute_distilgpt2_similarity(sent1, sent2):
    """
    Compute cosine similarity using DistilGPT2 embeddings by averaging token vectors.

    Args:
        sent1: First text (e.g. the preprocessed expected answer).
        sent2: Second text (e.g. the preprocessed student answer).

    Returns:
        float: Cosine similarity of the two mean-pooled embeddings, clamped to
        [0, 1]. Returns 0.0 when either text is empty or whitespace-only.
    """
    # preprocess_text returns "" for missing cells. An empty string yields no
    # token vectors to average, so score it as "no similarity" instead of
    # letting the whole grading run fail on one blank answer.
    if not (sent1 and sent1.strip()) or not (sent2 and sent2.strip()):
        return 0.0

    inputs1 = tokenizer(sent1, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs2 = tokenizer(sent2, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs1 = gpt_model(**inputs1)
        outputs2 = gpt_model(**inputs2)
    # Mean-pool over the token axis (dim=1) to get one vector per text.
    embedding1 = outputs1.last_hidden_state.mean(dim=1).squeeze(0).numpy()
    embedding2 = outputs2.last_hidden_state.mean(dim=1).squeeze(0).numpy()
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    # Clamp negatives to 0 and always hand back a plain Python float.
    return float(max(0.0, min(1.0, similarity)))

def compute_word2vec_similarity(sent1, sent2):
    """
    Compute cosine similarity using pretrained GoogleNews Word2Vec embeddings.
    Provides word-level similarity with richer vocabulary.

    Each text is represented by the mean of the vectors of its
    whitespace-separated words that exist in ``word2vec_model``.

    Args:
        sent1: First text (space-separated words).
        sent2: Second text (space-separated words).

    Returns:
        float: Cosine similarity clamped to [0, 1]; 0.0 when either text has
        no word known to the model.
    """
    def sentence_vector(words):
        # Mean of the in-vocabulary word vectors, or None if there are none.
        vectors = [word2vec_model[word] for word in words if word in word2vec_model]
        if not vectors:
            return None
        return np.mean(vectors, axis=0)

    vec1 = sentence_vector(sent1.split())
    vec2 = sentence_vector(sent2.split())
    # No known words on either side means nothing to compare. Returning 0.0
    # directly gives the same score a zero vector would, without hard-coding
    # the embedding dimensionality of one particular model.
    if vec1 is None or vec2 is None:
        return 0.0
    similarity = cosine_similarity([vec1], [vec2])[0][0]
    # Clamp negatives to 0 and always hand back a plain Python float.
    return float(max(0.0, min(1.0, similarity)))

In [11]:
# Load the question/answer sheet.
df = pd.read_excel("QnA.xlsx")

# Validate that every column the grading code reads by name is present.
student_answer_columns = [f"Student{i} Answers" for i in range(1, NUM_STUDENTS + 1)]
expected_columns = ["S.No", "Questions", "Expected Answers"] + student_answer_columns
if not all(col in df.columns for col in expected_columns):
    raise ValueError("Input Excel file is missing required columns!")

# Prepare TF-IDF vectorizer with all answer text for a consistent vocabulary.
# Answer columns are selected by NAME rather than by position: the check above
# only guarantees the names exist, not their order, so positional slicing could
# pull in unrelated columns or miss answer columns if the sheet layout differs.
answer_columns = ["Expected Answers"] + student_answer_columns
tfidf_vectorizer = TfidfVectorizer()
tfidf_corpus = df[answer_columns].map(preprocess_text).values.flatten()
tfidf_vectorizer.fit(tfidf_corpus)  # Fit on the entire answer corpus for better term weighting

In [None]:
# One bucket of per-question score lists for every student.
student_keys = [f"Student{i}" for i in range(1, NUM_STUDENTS + 1)]
results = {
    key: {metric: [] for metric in ("TF-IDF", "Word2Vec", "Semantic", "Total", "RawScore")}
    for key in student_keys
}

# Score each student's answer against the expected answer, one question (row) at a time.
for _, row in df.iterrows():
    expected_answer = preprocess_text(row["Expected Answers"])
    for key in student_keys:
        student_answer = preprocess_text(row[f"{key} Answers"])

        # Lexical overlap: cosine similarity of the two TF-IDF rows.
        tfidf_pair = tfidf_vectorizer.transform([expected_answer, student_answer])
        tfidf_similarity = cosine_similarity(tfidf_pair[0], tfidf_pair[1])[0][0]
        # Word-level and contextual embedding similarity.
        word2vec_similarity = compute_word2vec_similarity(expected_answer, student_answer)
        semantic_similarity = compute_distilgpt2_similarity(expected_answer, student_answer)

        # Final score: unweighted mean of the three similarities, scaled and rounded to 1 decimal.
        avg_score = (tfidf_similarity + word2vec_similarity + semantic_similarity) / 3
        total_score_numeric = round(avg_score * SCORE_SCALE, 1)

        bucket = results[key]
        bucket["TF-IDF"].append(tfidf_similarity)
        bucket["Word2Vec"].append(word2vec_similarity)
        bucket["Semantic"].append(semantic_similarity)
        bucket["Total"].append(f"{total_score_numeric}/{SCORE_SCALE}")
        bucket["RawScore"].append(total_score_numeric)

# Attach the collected scores to the DataFrame, four columns per student.
for key in student_keys:
    bucket = results[key]
    df[f"{key} TF-IDF Similarity"] = bucket["TF-IDF"]
    df[f"{key} Word2Vec Similarity"] = bucket["Word2Vec"]
    df[f"{key} Semantic Similarity"] = bucket["Semantic"]
    df[f"{key} Total Score"] = bucket["Total"]

In [None]:
def get_highest_scorer(row):
    """
    Name the student(s) with the highest total score in one row.

    Args:
        row: Mapping-like row holding a "Student{i} Total Score" entry for
            i = 1..NUM_STUDENTS, each formatted like "4.3/5".

    Returns:
        str: The top student's label, or all tied labels joined by ", "
        (e.g. "Student1, Student3").
    """
    score_by_student = {}
    for i in range(1, NUM_STUDENTS + 1):
        label = f"Student{i}"
        # Keep only the numeric part before the slash (4.3 from "4.3/5").
        score_by_student[label] = float(row[f"{label} Total Score"].split('/')[0])

    best = max(score_by_student.values())
    # Dict order follows student number, so ties are listed in ascending order.
    return ", ".join(label for label, score in score_by_student.items() if score == best)

# Add a column naming the top-scoring student(s) for each question;
# get_highest_scorer is applied row-wise (axis=1).
df["Highest Scoring Student"] = df.apply(get_highest_scorer, axis=1)
# ---------------------------------------

In [None]:
# Write the DataFrame (input columns plus the added score columns) to a new workbook.
output_file = "distilgpt_output.xlsx"
df.to_excel(output_file, index=False)  # index=False: do not write the DataFrame index as a column
print(f"Output File '{output_file}' is Saved Successfully")