In [1]:
#Trắc Nghiệm VNHSGE - Lý, Hóa, Sinh
# Dataset: https://github.com/Xdao85/VNHSGE

In [2]:
# Import và Setup 
import pandas as pd
import numpy as np
import json
import re
import random
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle
import streamlit as st
import warnings
warnings.filterwarnings('ignore')

print("Import completed successfully")

Import completed successfully


In [3]:
# Load Data Functions 
def load_vnhsge_data(data_folder='Dataset'):
    """Load dữ liệu từ thư mục Dataset"""
    subjects = ['Biology', 'Chemistry', 'Physics']
    all_data = []
    
    for subject in subjects:
        subject_path = os.path.join(data_folder, subject)
        if not os.path.exists(subject_path):
            continue
            
        json_files = glob.glob(os.path.join(subject_path, "*.json"))
        
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                for item in data:
                    if 'Question' in item and 'Choice' in item:
                        question_text, options = parse_question(item['Question'])
                        
                        question_data = {
                            'id': item.get('ID', ''),
                            'question': question_text,
                            'options': options,
                            'answer': item['Choice'],
                            'subject': subject.lower(),
                            'explanation': item.get('Explanation', '')
                        }
                        all_data.append(question_data)
            except:
                continue
    
    return pd.DataFrame(all_data)

def parse_question(question_full):
    """Tách câu hỏi và đáp án"""
    lines = question_full.split('\n')
    
    question = lines[0]
    if question.startswith('Câu'):
        question = re.sub(r'^Câu \d+:\s*', '', question)
    
    options = []
    for line in lines[1:]:
        line = line.strip()
        if line and (line.startswith('A.') or line.startswith('B.') or 
                    line.startswith('C.') or line.startswith('D.')):
            options.append(line)
    
    return question.strip(), options

# Test load data
data = load_vnhsge_data()
print(f"Loaded {len(data)} questions")

Loaded 600 questions


In [4]:
# Data Statistics 
def get_data_stats(data):
    """Thống kê dữ liệu"""
    stats = {}
    for subject in ['biology', 'chemistry', 'physics']:
        subject_data = data[data['subject'] == subject]
        stats[subject] = len(subject_data)
    
    return stats

stats = get_data_stats(data)
print("Data statistics:")
for subject, count in stats.items():
    print(f"{subject}: {count} questions")

Data statistics:
biology: 200 questions
chemistry: 200 questions
physics: 200 questions


In [5]:
# Text Preprocessing 
def preprocess_text(text):
    """Tiền xử lý văn bản"""
    text = text.lower()
    text = re.sub(r'[^\w\sàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđ]', ' ', text)
    text = ' '.join(text.split())
    return text

# Test preprocessing
sample_text = "Câu 1: Chất béo là gì?"
processed = preprocess_text(sample_text)
print(f"Original: {sample_text}")
print(f"Processed: {processed}")

Original: Câu 1: Chất béo là gì?
Processed: câu 1 chất béo là gì


In [6]:
# Prepare Training Data 
def prepare_training_data(data):
    """Chuẩn bị dữ liệu training"""
    questions = []
    subjects = []
    
    for _, row in data.iterrows():
        full_text = row['question'] + ' ' + ' '.join(row['options']) if row['options'] else row['question']
        processed_question = preprocess_text(full_text)
        questions.append(processed_question)
        subjects.append(row['subject'])
    
    return questions, subjects

questions, subjects = prepare_training_data(data)
print(f"Prepared {len(questions)} training samples")

Prepared 600 training samples


In [7]:
# Split Train/Test Data 
X_train, X_test, y_train, y_test = train_test_split(
    questions, subjects, 
    test_size=0.2, 
    random_state=42, 
    stratify=subjects
)

print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Train samples: 480
Test samples: 120


In [8]:
# Create and Train Model =====
def create_and_train_model(X_train, y_train, X_test, y_test):
    """Tạo và train model"""
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95
    )
    
    classifier = MultinomialNB(alpha=0.1)
    
    # Vectorize và train
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    
    classifier.fit(X_train_vec, y_train)
    
    # Đánh giá
    y_pred = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Classification report chi tiết
    report = classification_report(y_test, y_pred, target_names=['biology', 'chemistry', 'physics'], output_dict=True)
    
    return vectorizer, classifier, accuracy, report

vectorizer, classifier, accuracy, report = create_and_train_model(X_train, y_train, X_test, y_test)
print(f"Model accuracy: {accuracy:.3f}")

# Hiển thị Precision, Recall, F1-score cho từng môn
print("\nDetailed metrics by subject:")
for subject in ['biology', 'chemistry', 'physics']:
    metrics = report[subject]
    print(f"{subject.capitalize()}:")
    print(f"  Precision: {metrics['precision']:.3f}")
    print(f"  Recall: {metrics['recall']:.3f}")
    print(f"  F1-score: {metrics['f1-score']:.3f}")
    print(f"  Support: {int(metrics['support'])}")
    
print(f"\nMacro avg F1-score: {report['macro avg']['f1-score']:.3f}")
print(f"Weighted avg F1-score: {report['weighted avg']['f1-score']:.3f}")

Model accuracy: 0.992

Detailed metrics by subject:
Biology:
  Precision: 1.000
  Recall: 0.975
  F1-score: 0.987
  Support: 40
Chemistry:
  Precision: 1.000
  Recall: 1.000
  F1-score: 1.000
  Support: 40
Physics:
  Precision: 0.976
  Recall: 1.000
  F1-score: 0.988
  Support: 40

Macro avg F1-score: 0.992
Weighted avg F1-score: 0.992


In [9]:
# Save/Load Model
def save_model(vectorizer, classifier, path='vnhsge_model.pkl'):
    """Lưu model"""
    model_data = {
        'vectorizer': vectorizer,
        'classifier': classifier
    }
    with open(path, 'wb') as f:
        pickle.dump(model_data, f)
    return True

def load_model(path='vnhsge_model.pkl'):
    """Load model"""
    try:
        with open(path, 'rb') as f:
            model_data = pickle.load(f)
        return model_data['vectorizer'], model_data['classifier']
    except:
        return None, None

# Save model
save_status = save_model(vectorizer, classifier)
print(f"Model saved: {save_status}")


Model saved: True


In [10]:
# Prediction Function 
def predict_subject(question, vectorizer, classifier):
    """Dự đoán môn học"""
    processed = preprocess_text(question)
    question_vec = vectorizer.transform([processed])
    prediction = classifier.predict(question_vec)[0]
    return prediction

# Test prediction
test_question = "Axit amin là đơn phân cấu tạo nên phân tử nào?"
predicted = predict_subject(test_question, vectorizer, classifier)
print(f"Predicted subject: {predicted}")

Predicted subject: biology


In [11]:
# Question Selection 
def get_random_question(data, subject=None):
    """Lấy câu hỏi ngẫu nhiên"""
    if subject:
        filtered_data = data[data['subject'] == subject]
    else:
        filtered_data = data
    
    if len(filtered_data) == 0:
        return None
    
    return filtered_data.sample(1).iloc[0]

# Test question selection
random_question = get_random_question(data, 'biology')
print(f"Random question: {random_question['question'][:50]}...")

Random question: Câu 119. Phát biểu nào sau đây về quan hệ cạnh tra...


In [12]:
# Answer Checking 
def check_answer(user_answer, correct_answer):
    """Kiểm tra đáp án"""
    user_answer = user_answer.upper().strip()
    correct_answer = correct_answer.upper().strip()
    
    is_correct = user_answer == correct_answer
    return is_correct

# Test answer checking
result = check_answer('A', 'A')
print(f"Answer check result: {result}")

Answer check result: True


In [13]:
# Score Tracking 
class ScoreTracker:
    def __init__(self):
        self.correct = 0
        self.total = 0
    
    def add_result(self, is_correct):
        self.total += 1
        if is_correct:
            self.correct += 1
    
    def get_accuracy(self):
        if self.total == 0:
            return 0
        return (self.correct / self.total) * 100
    
    def reset(self):
        self.correct = 0
        self.total = 0

# Test score tracker
score_tracker = ScoreTracker()
score_tracker.add_result(True)
score_tracker.add_result(False)
print(f"Score: {score_tracker.correct}/{score_tracker.total} ({score_tracker.get_accuracy():.1f}%)")

Score: 1/2 (50.0%)


In [14]:
# Streamlit 
def create_streamlit_app():
    """Tạo giao diện Streamlit"""
    
    st.title("Trắc Nghiệm VNHSGE")
    st.write("Hệ thống hỏi đáp trắc nghiệm Lý - Hóa - Sinh")
    
    # Initialize session state
    if 'data' not in st.session_state:
        st.session_state.data = load_vnhsge_data()
        st.session_state.vectorizer, st.session_state.classifier = load_model()
        st.session_state.score_tracker = ScoreTracker()
        st.session_state.current_question = None
    
    # Subject selection
    subject_options = {
        'Ngẫu nhiên': None,
        'Sinh học': 'biology',
        'Hóa học': 'chemistry', 
        'Vật lý': 'physics'
    }
    
    selected_subject = st.selectbox("Chọn môn học:", list(subject_options.keys()))
    subject_code = subject_options[selected_subject]
    
    # Get new question button
    if st.button("Câu hỏi mới"):
        st.session_state.current_question = get_random_question(st.session_state.data, subject_code)
    
    # Display current question
    if st.session_state.current_question is not None:
        question = st.session_state.current_question
        
        st.write("**Câu hỏi:**")
        st.write(question['question'])
        
        st.write("**Các đáp án:**")
        for option in question['options']:
            st.write(option)
        
        # Answer input
        user_answer = st.radio("Chọn đáp án:", ['A', 'B', 'C', 'D'])
        
        # Submit answer
        if st.button("Gửi đáp án"):
            is_correct = check_answer(user_answer, question['answer'])
            st.session_state.score_tracker.add_result(is_correct)
            
            if is_correct:
                st.success("Đúng!")
            else:
                st.error(f"Sai! Đáp án đúng là: {question['answer']}")
            
            if question['explanation']:
                st.info(f"Giải thích: {question['explanation']}")
        
        # Display score
        st.write("---")
        st.write(f"**Điểm số:** {st.session_state.score_tracker.correct}/{st.session_state.score_tracker.total}")
        st.write(f"**Độ chính xác:** {st.session_state.score_tracker.get_accuracy():.1f}%")
    
    else:
        st.write("Nhấn 'Câu hỏi mới' để bắt đầu!")

In [15]:
# Run Streamlit App 
if __name__ == "__main__":
    # Để chạy app streamlit, sử dụng lệnh:
    # streamlit run main.py
    create_streamlit_app()

print("Streamlit app ready. Run with: streamlit run main.py")

2025-07-25 01:26:30.629 
  command:

    streamlit run C:\Users\thanh\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-07-25 01:26:30.636 Session state does not function when running a script without `streamlit run`


Streamlit app ready. Run with: streamlit run main.py
