In [None]:
import spacy
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the pretrained resources once; the helper functions defined later in the
# notebook read these three names as globals.
# spaCy pipeline: used by preprocess_text to lemmatize tokens.
nlp = spacy.load('en_core_web_lg') 
# Sentence-BERT encoder: used by compute_sbert_similarity.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
# Word2Vec vectors read from a binary word2vec-format file; used by compute_word2vec_similarity.
# NOTE(review): the path is relative, so the .bin file presumably has to sit in the
# working directory — confirm before running the notebook top to bottom.
word2vec_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
# Number of student answer columns to grade. The notebook looks for columns named
# "Student1 Answers" .. "Student{NUM_STUDENTS} Answers" in the input sheet.
NUM_STUDENTS = 5  # Number of students to process
# The averaged similarity is multiplied by this value, and the same value is shown
# as the denominator in the "<score>/<SCORE_SCALE>" total-score strings.
SCORE_SCALE = 5   # Maximum score for final output

# Cache for NLTK stop-word sets so the corpus file is read once instead of on
# every preprocess_text call (the function runs for every answer in the sheet).
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text):
    """
    Normalize a free-text answer for similarity scoring.

    Steps: lowercase, tokenize with NLTK, keep only alphanumeric tokens, drop
    English stop words, then lemmatize the remaining tokens with spaCy.

    Args:
        text: The raw answer. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as a missing answer.

    Returns:
        str: The lemmas joined by single spaces, or "" when the input is not a
        string or no token survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    stop_words = _english_stop_words()
    # Single pass: alphanumeric filter first, then the stop-word filter.
    tokens = [
        word for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]
    if not tokens:
        # Nothing left to lemmatize; skip the spaCy pipeline call entirely.
        return ""

    doc = nlp(" ".join(tokens))
    return " ".join(token.lemma_ for token in doc)

def compute_sbert_similarity(sent1, sent2):
    """
    Compute the Sentence-BERT cosine similarity of two sentences.

    Args:
        sent1: First sentence (here: the preprocessed expected answer).
        sent2: Second sentence (here: the preprocessed student answer).

    Returns:
        float: Cosine similarity of the two sentence embeddings, clamped to
        [0.0, 1.0]. Returns 0.0 when either sentence is empty or whitespace-only.
    """
    # A blank answer has no meaning to compare. Without this guard the encoder
    # would still produce an embedding for "" (presumably from its special
    # tokens — not verified here), so the cosine is not guaranteed to be 0 and
    # a blank answer could earn semantic credit.
    if not sent1 or not sent1.strip() or not sent2 or not sent2.strip():
        return 0.0

    embedding1 = sbert_model.encode(sent1, convert_to_tensor=True)  # sentence -> embedding tensor
    embedding2 = sbert_model.encode(sent2, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()  # 1x1 tensor -> Python float
    # Cosine can be negative; clamp so the score stays in [0, 1]. Float bounds
    # keep the return type a float even when the value is clamped.
    return max(0.0, min(1.0, similarity))

def compute_word2vec_similarity(sent1, sent2):
    """
    Compute cosine similarity of two sentences using the pretrained Word2Vec vectors.

    Each sentence is split on whitespace and represented by the mean of the
    vectors of its words that exist in ``word2vec_model``.

    Args:
        sent1: First sentence as a whitespace-separated string.
        sent2: Second sentence as a whitespace-separated string.

    Returns:
        float: Cosine similarity of the two mean vectors, clamped to [0.0, 1.0].
        Returns 0.0 when either sentence has no word in the model's vocabulary.
    """
    def sentence_vector(words):
        """Return the mean vector of the in-vocabulary words, or None if there are none."""
        valid_words = [word for word in words if word in word2vec_model]  # keep only known words
        if not valid_words:
            return None
        return np.mean([word2vec_model[word] for word in valid_words], axis=0)

    # Bail out as soon as one side has nothing to compare. This replaces the
    # former zero-vector fallback, which hard-coded the embedding dimension.
    vec1 = sentence_vector(sent1.split())
    if vec1 is None:
        return 0.0
    vec2 = sentence_vector(sent2.split())
    if vec2 is None:
        return 0.0

    similarity = cosine_similarity([vec1], [vec2])[0][0]
    # Clamp negatives and rounding overshoot; always return a plain Python float.
    return float(max(0.0, min(1.0, similarity)))

In [None]:
df = pd.read_excel("QnA.xlsx")

# Columns holding free-text answers (reference answer + one per student).
student_answer_columns = [f"Student{i} Answers" for i in range(1, NUM_STUDENTS + 1)]
answer_columns = ["Expected Answers"] + student_answer_columns
expected_columns = ["S.No", "Questions"] + answer_columns

# Fail early, and say which columns are absent so the sheet can be fixed quickly.
missing_columns = [col for col in expected_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Input Excel file is missing required columns! Missing: {missing_columns}")

# Prepare TF-IDF vectorizer with all answer text for a consistent vocabulary.
# The answer columns are selected by name rather than by position, so extra or
# reordered columns in the sheet cannot leak into (or drop out of) the corpus.
tfidf_vectorizer = TfidfVectorizer()
tfidf_corpus = (
    df[answer_columns]
    .apply(lambda column: column.map(preprocess_text))  # element-wise, column by column
    .to_numpy()
    .flatten()
)
tfidf_vectorizer.fit(tfidf_corpus)  # Fit on the entire corpus for better term weighting

In [None]:
# Per-student accumulators: one list per metric, one entry per question row.
# "RawScore" keeps the numeric total alongside the formatted "x/5" string; it is
# collected here but not written to the DataFrame below.
student_ids = range(1, NUM_STUDENTS + 1)
results = {
    f"Student{i}": {
        "TF-IDF": [],
        "Word2Vec": [],
        "Semantic": [],
        "Total": [],
        "RawScore": []
    } for i in student_ids
}

# Process each question row
for _, row in df.iterrows():
    expected_answer = preprocess_text(row["Expected Answers"])  # Preprocess reference answer
    # The reference answer is the same for every student in this row, so it is
    # vectorized once here instead of once per student.
    expected_tfidf = tfidf_vectorizer.transform([expected_answer])

    # Compare each student's answer to the expected answer
    for i in student_ids:
        student_scores = results[f"Student{i}"]
        student_answer = preprocess_text(row[f"Student{i} Answers"])

        # TF-IDF Similarity: focuses on exact word overlap and frequency
        student_tfidf = tfidf_vectorizer.transform([student_answer])
        tfidf_similarity = cosine_similarity(expected_tfidf, student_tfidf)[0][0]
        student_scores["TF-IDF"].append(tfidf_similarity)

        # Word2Vec Similarity: mean word-vector comparison of the pair
        word2vec_similarity = compute_word2vec_similarity(expected_answer, student_answer)
        student_scores["Word2Vec"].append(word2vec_similarity)

        # SBERT Similarity: sentence-embedding comparison of the pair
        semantic_similarity = compute_sbert_similarity(expected_answer, student_answer)
        student_scores["Semantic"].append(semantic_similarity)

        # Total score: unweighted mean of the three metrics, scaled to SCORE_SCALE
        avg_score = (tfidf_similarity + word2vec_similarity + semantic_similarity) / 3
        total_score_numeric = round(avg_score * SCORE_SCALE, 1)  # Numeric score for comparison
        total_score_str = f"{total_score_numeric}/{SCORE_SCALE}"  # Formatted string for display
        student_scores["Total"].append(total_score_str)
        student_scores["RawScore"].append(total_score_numeric)  # Store numeric score

# Add results to DataFrame with descriptive column names
for i in student_ids:
    prefix = f"Student{i}"
    df[f"{prefix} TF-IDF Similarity"] = results[prefix]["TF-IDF"]
    df[f"{prefix} Word2Vec Similarity"] = results[prefix]["Word2Vec"]
    df[f"{prefix} Semantic Similarity"] = results[prefix]["Semantic"]
    df[f"{prefix} Total Score"] = results[prefix]["Total"]

In [None]:
def get_highest_scorer(row, num_students=None):
    """
    Name the student(s) with the highest total score in one result row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding a
            "Student{i} Total Score" entry for each student, formatted as
            "<score>/<scale>" (e.g. "4.3/5").
        num_students: How many students to consider, numbered from 1.
            Defaults to the notebook-level NUM_STUDENTS.

    Returns:
        str: The top-scoring student name, or all tied names joined by ", "
        (e.g. "Student1, Student3").
    """
    if num_students is None:
        num_students = NUM_STUDENTS

    students = [f"Student{i}" for i in range(1, num_students + 1)]
    # Extract the numeric part before the slash (e.g. 4.3 from "4.3/5").
    numeric_scores = [
        float(row[f"{student} Total Score"].split('/')[0]) for student in students
    ]
    max_score = max(numeric_scores)

    # Keep every student that reaches the maximum, so ties are reported.
    return ", ".join(
        student for student, score in zip(students, numeric_scores)
        if score == max_score
    )

# Add the new column to the DataFrame: get_highest_scorer is applied row by row
# (axis=1), so each question gets the name(s) of its top-scoring student(s).
df["Highest Scoring Student"] = df.apply(get_highest_scorer, axis=1)
# ---------------------------------------

In [None]:
# Write the DataFrame, including the similarity and score columns added above,
# to an Excel workbook. index=False leaves the DataFrame index out of the file.
output_file = "sbert_output.xlsx"
df.to_excel(output_file, index=False)
print(f"Output File '{output_file}' is Saved Successfully")