In [1]:
# Cell 1 - Import th∆∞ vi·ªán c·∫ßn thi·∫øt
import pandas as pd
import numpy as np
import json
import re
import os
import glob
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
warnings.filterwarnings('ignore')

In [2]:
# Cell 2 - Vietnamese NLP Functions
class VietnameseNLP:
    def __init__(self):
        self.stopwords = {
            'v√†', 'c·ªßa', 'c√≥', 'l√†', 'trong', 'v·ªõi', 'ƒë∆∞·ª£c', 'cho', 't·ª´', 'c√°c', 'm·ªôt', 'nh·ªØng',
            'n√†y', 'ƒë√≥', 'khi', 'ƒë·ªÉ', 'kh√¥ng', 'v·ªÅ', 'sau', 'tr∆∞·ªõc', 'hay', 'ho·∫∑c', 'n·∫øu', 'nh∆∞'
        }
        
    def normalize_vietnamese(self, text):
        return text.lower() if isinstance(text, str) else ""
    
    def remove_stopwords(self, text):
        if not isinstance(text, str):
            return ""
        words = text.split()
        filtered_words = [word for word in words if word not in self.stopwords]
        return ' '.join(filtered_words)
    
    def clean_text_advanced(self, text, remove_stopwords=True, normalize=True):
        if not isinstance(text, str):
            return ""
        text = re.sub(r'[^\w\s√†√°·∫°·∫£√£√¢·∫ß·∫•·∫≠·∫©·∫´ƒÉ·∫±·∫Ø·∫∑·∫≥·∫µ√®√©·∫π·∫ª·∫Ω√™·ªÅ·∫ø·ªá·ªÉ·ªÖ√¨√≠·ªã·ªâƒ©√≤√≥·ªç·ªè√µ√¥·ªì·ªë·ªô·ªï·ªó∆°·ªù·ªõ·ª£·ªü·ª°√π√∫·ª•·ªß≈©∆∞·ª´·ª©·ª±·ª≠·ªØ·ª≥√Ω·ªµ·ª∑·ªπƒë]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        if normalize:
            text = self.normalize_vietnamese(text)
        if remove_stopwords:
            text = self.remove_stopwords(text)
        return text

def clean_vietnamese_text(text, remove_stopwords=True, normalize=True):
    nlp = VietnameseNLP()
    return nlp.clean_text_advanced(text, remove_stopwords, normalize)

In [3]:
# Cell 3 - Utility Functions
def convert_latex_to_text(text):
    if not isinstance(text, str):
        return text
    text = text.replace('\\n', '\n')
    replacements = {
        r'\\frac\{([^}]+)\}\{([^}]+)\}': r'(\1)/(\2)',
        r'\^{([^}]+)}': r'^(\1)',
        r'_{([^}]+)}': r'_(\1)',
        r'\\times': '√ó', r'\\div': '√∑', r'\\pm': '¬±',
    }
    for pattern, replacement in replacements.items():
        text = re.sub(pattern, replacement, text)
    return text.strip()

def parse_question(question_full):
    lines = question_full.split('\n')
    question = lines[0]
    if question.startswith('C√¢u'):
        question = re.sub(r'^C√¢u \d+:\s*', '', question)
    
    options = []
    for line in lines[1:]:
        line = line.strip()
        if line and line.startswith(('A.', 'B.', 'C.', 'D.')):
            options.append(line)
    
    return question.strip(), options

class ScoreTracker:
    def __init__(self):
        self.correct = 0
        self.total = 0
    
    def add_result(self, is_correct):
        self.total += 1
        if is_correct:
            self.correct += 1
    
    def get_accuracy(self):
        return (self.correct / self.total * 100) if self.total > 0 else 0

In [4]:
# Cell 4 - Load d·ªØ li·ªáu VNHSGE
def load_vnhsge_data(data_folder='Dataset'):
    subjects = ['Biology', 'Chemistry', 'Physics']
    all_data = []
    
    for subject in subjects:
        subject_path = os.path.join(data_folder, subject)
        if not os.path.exists(subject_path):
            continue
            
        json_files = glob.glob(os.path.join(subject_path, "*.json"))
        
        for json_file in json_files:
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                
                for item in data:
                    if 'Question' in item and 'Choice' in item:
                        question_text, options = parse_question(item['Question'])
                        
                        question_data = {
                            'id': item.get('ID', ''),
                            'question': question_text,
                            'options': options,
                            'answer': item['Choice'],
                            'subject': subject.lower(),
                            'explanation': convert_latex_to_text(item.get('Explanation', ''))
                        }
                        all_data.append(question_data)
            except:
                continue
    
    return pd.DataFrame(all_data)

raw_data = load_vnhsge_data()
print(f"üìä ƒê√£ t·∫£i {len(raw_data)} c√¢u h·ªèi")
print(f"üìã C·ªôt d·ªØ li·ªáu: {raw_data.columns.tolist()}")

üìä ƒê√£ t·∫£i 600 c√¢u h·ªèi
üìã C·ªôt d·ªØ li·ªáu: ['id', 'question', 'options', 'answer', 'subject', 'explanation']


In [5]:
# Cell 5 - Ph√¢n t√≠ch d·ªØ li·ªáu c∆° b·∫£n
print("üìà TH·ªêNG K√ä D·ªÆ LI·ªÜU:")
print(f"T·ªïng s·ªë c√¢u h·ªèi: {len(raw_data)}")
print(f"S·ªë m√¥n h·ªçc: {raw_data['subject'].nunique()}")
print("\nüìä Ph√¢n ph·ªëi theo m√¥n:")
print(raw_data['subject'].value_counts())

print("\nüîç M·∫´u d·ªØ li·ªáu:")
raw_data.head()

üìà TH·ªêNG K√ä D·ªÆ LI·ªÜU:
T·ªïng s·ªë c√¢u h·ªèi: 600
S·ªë m√¥n h·ªçc: 3

üìä Ph√¢n ph·ªëi theo m√¥n:
subject
biology      200
chemistry    200
physics      200
Name: count, dtype: int64

üîç M·∫´u d·ªØ li·ªáu:


Unnamed: 0,id,question,options,answer,subject,explanation
0,MET_Bio_IE_2019_1,C√≥ th·ªÉ s·ª≠ d·ª•ng h√≥a ch·∫•t n√†o sau ƒë√¢y ƒë·ªÉ ph√°t hi...,"[A. Dung d·ªãch NaCl., B. Dung d·ªãch Ca(OH)2., C....",B,biology,C√≥ th·ªÉ ƒë∆∞·ª£c s·ª≠ d·ª•ng Ca(OH)2 ƒë·ªÉ ph√°t hi·ªán qu√° t...
1,MET_Bio_IE_2019_2,ƒê·ªông v·∫≠t n√†o sau ƒë√¢y trao ƒë·ªïi kh√≠ v·ªõi m√¥i tr∆∞·ªù...,"[A. Ch√¢u ch·∫•u., B. S∆∞ t·ª≠., C. Chu·ªôt., D. ·∫æch ƒë...",A,biology,Ch√¢u ch·∫•u trao ƒë·ªïi kh√≠ v·ªõi m√¥i tr∆∞·ªùng th√¥ng qu...
2,MET_Bio_IE_2019_3,Axit amin l√† ƒë∆°n ph√¢n c·∫•u t·∫°o n√™n ph√¢n t·ª≠ n√†o ...,"[A. ADN., B. mARN., C. tARN., D. Pr√¥t√™in.]",D,biology,Axit amin l√† ƒë∆°n ph√¢n c·∫•u t·∫°o n√™n ph√¢n t·ª≠ Pr√¥t...
3,MET_Bio_IE_2019_4,Ph√¢n t·ª≠ n√†o sau ƒë√¢y tr·ª±c ti·∫øp l√†m khu√¥n cho qu...,"[A. ADN., B. mARN., C. tARN., D. rARN.]",B,biology,Ph√¢n t·ª≠ mARM tr·ª±c ti·∫øp l√†m khu√¥n cho qu√° tr√¨nh...
4,MET_Bio_IE_2019_5,M·ªôt ph√¢n t·ª≠ ADN ·ªü vi khu·∫©n c√≥ 10% s·ªë nucl√™√¥tit...,"[A. 10%., B. 30%., C. 20%., D. 40%.]",D,biology,"Theo nguy√™n t·∫Øc b·ªï sung A = T, G = X n√™n %A +%..."


In [6]:
# Cell 6 - Ki·ªÉm tra ch·∫•t l∆∞·ª£ng d·ªØ li·ªáu
print("üîç KI·ªÇM TRA CH·∫§T L∆Ø·ª¢NG D·ªÆ LI·ªÜU:")
print(f"Null values:")
print(raw_data.isnull().sum())

print(f"\nüìè ƒê·ªô d√†i c√¢u h·ªèi:")
question_lengths = raw_data['question'].str.len()
print(f"Trung b√¨nh: {question_lengths.mean():.1f} k√Ω t·ª±")
print(f"Min: {question_lengths.min()}, Max: {question_lengths.max()}")

print(f"\nüìù Ph√¢n ph·ªëi ƒë√°p √°n:")
print(raw_data['answer'].value_counts())

üîç KI·ªÇM TRA CH·∫§T L∆Ø·ª¢NG D·ªÆ LI·ªÜU:
Null values:
id             0
question       0
options        0
answer         0
subject        0
explanation    0
dtype: int64

üìè ƒê·ªô d√†i c√¢u h·ªèi:
Trung b√¨nh: 167.3 k√Ω t·ª±
Min: 21, Max: 754

üìù Ph√¢n ph·ªëi ƒë√°p √°n:
answer
A    188
C    144
B    140
D    128
Name: count, dtype: int64


In [7]:
# Cell 7 - DifficultyClassifier Class
class DifficultyClassifier:
    def __init__(self):
        self.text_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2), min_df=2, max_df=0.9)
        self.model = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42, n_jobs=-1)
        self.is_trained = False
        self.evaluation_results = {}
    
    def _extract_features(self, question_text, options_text):
        full_text = question_text + " " + options_text
        full_lower = full_text.lower()
        
        patterns = {
            'analysis': ['ph√¢n t√≠ch', 'so s√°nh', 'ƒë√°nh gi√°', 'gi·∫£i th√≠ch'],
            'calculation': ['t√≠nh', 'to√°n', 'c√¥ng th·ª©c', 'mol', 'gam'],
            'synthesis': ['t·ªïng h·ª£p', 'ph·∫£n ·ª©ng', 'c∆° ch·∫ø', 'qu√° tr√¨nh'],
            'evaluation': ['·∫£nh h∆∞·ªüng', 't√°c ƒë·ªông', 'nguy√™n nh√¢n'],
            'definition': ['l√† g√¨', 't√™n g·ªçi', 'thu·ªôc'],
            'identification': ['m√†u', 'tr·∫°ng th√°i', 't√≠nh ch·∫•t']
        }
        
        features = []
        for category in ['analysis', 'calculation', 'synthesis', 'evaluation', 'definition', 'identification']:
            count = sum(1 for word in patterns[category] if word in full_lower)
            features.append(count)
        
        features.extend([
            len(question_text.split()),
            len(options_text.split()),
            full_text.count('.'),
            sum(1 for c in full_text if c in '+-*/=()$^_'),
            sum(1 for c in full_text if c.isupper()),
            sum(1 for w in full_text.split() if len(w) > 8)
        ])
        
        features.extend([
            1 if 't·∫°i sao' in full_lower or 'v√¨ sao' in full_lower else 0,
            1 if 'nh∆∞ th·∫ø n√†o' in full_lower else 0,
            1 if 'bao nhi√™u' in full_lower else 0
        ])
        
        return np.array(features).reshape(1, -1)
    
    def _create_labels(self, data):
        difficulties = []
        for _, row in data.iterrows():
            options_text = ' '.join(row['options']) if row['options'] else ''
            features = self._extract_features(row['question'], options_text).flatten()
            
            score = (features[0] + features[1] + features[2]) * 2
            score -= (features[4] + features[5])
            score += features[6] * 0.1 + features[9] * 0.2
            score += features[12] + features[13] * 2
            
            if score <= 2:
                difficulty = 'easy'
            elif score <= 5:
                difficulty = 'medium'
            else:
                difficulty = 'hard'
            
            difficulties.append(difficulty)
        
        return difficulties

In [8]:
# Cell 8 - Kh·ªüi t·∫°o v√† train DifficultyClassifier
difficulty_classifier = DifficultyClassifier()
print("ü§ñ ƒêang t·∫°o nh√£n ƒë·ªô kh√≥...")
difficulties = difficulty_classifier._create_labels(raw_data)

print("üìä PH√ÇN PH·ªêI ƒê·ªò KH√ì:")
difficulty_counts = pd.Series(difficulties).value_counts()
print(difficulty_counts)
print(f"\nT·ª∑ l·ªá ph·∫ßn trƒÉm:")
print((difficulty_counts / len(difficulties) * 100).round(1))

ü§ñ ƒêang t·∫°o nh√£n ƒë·ªô kh√≥...
üìä PH√ÇN PH·ªêI ƒê·ªò KH√ì:
hard      282
medium    202
easy      116
Name: count, dtype: int64

T·ª∑ l·ªá ph·∫ßn trƒÉm:
hard      47.0
medium    33.7
easy      19.3
Name: count, dtype: float64


In [9]:
# Cell 9 - Ph√¢n t√≠ch features cho ƒë·ªô kh√≥
print("üîç PH√ÇN T√çCH FEATURES CHO ƒê·ªò KH√ì:")
sample_questions = raw_data.head(10)
for i, (_, row) in enumerate(sample_questions.iterrows()):
    options_text = ' '.join(row['options']) if row['options'] else ''
    features = difficulty_classifier._extract_features(row['question'], options_text).flatten()
    print(f"C√¢u {i+1}: {difficulties[i]} - Features: {features[:6]}")  # Show first 6 features


üîç PH√ÇN T√çCH FEATURES CHO ƒê·ªò KH√ì:
C√¢u 1: medium - Features: [0 0 1 0 0 0]
C√¢u 2: easy - Features: [0 0 0 0 0 0]
C√¢u 3: easy - Features: [0 0 0 0 0 0]
C√¢u 4: medium - Features: [0 0 1 0 0 0]
C√¢u 5: medium - Features: [0 0 0 0 0 0]
C√¢u 6: easy - Features: [0 0 0 0 0 0]
C√¢u 7: medium - Features: [0 0 0 0 0 0]
C√¢u 8: easy - Features: [0 0 0 0 0 0]
C√¢u 9: medium - Features: [0 0 0 0 0 0]
C√¢u 10: medium - Features: [0 0 0 0 0 0]


In [10]:
# Cell 10 - Train difficulty model v·ªõi evaluation
texts = []
for _, row in raw_data.iterrows():
    options_text = ' '.join(row['options']) if row['options'] else ''
    full_text = row['question'] + ' ' + options_text
    processed_text = clean_vietnamese_text(full_text, remove_stopwords=True, normalize=True)
    texts.append(processed_text)

X_train_text, X_test_text, y_train, y_test = train_test_split(
    texts, difficulties, test_size=0.2, random_state=42, stratify=difficulties
)

X_train_vec = difficulty_classifier.text_vectorizer.fit_transform(X_train_text)
X_test_vec = difficulty_classifier.text_vectorizer.transform(X_test_text)

difficulty_classifier.model.fit(X_train_vec, y_train)
y_pred = difficulty_classifier.model.predict(X_test_vec)

print("üéØ K·∫æT QU·∫¢ DIFFICULTY CLASSIFICATION:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")

üéØ K·∫æT QU·∫¢ DIFFICULTY CLASSIFICATION:
Accuracy: 0.6583
F1-Score: 0.6199


In [11]:
# Cell 11 - Confusion Matrix cho Difficulty
cm = confusion_matrix(y_test, y_pred)
labels = sorted(list(set(y_test) | set(y_pred)))

print("üìä CONFUSION MATRIX - DIFFICULTY:")
print(f"{'':>8}", end="")
for label in labels:
    print(f"{label:>8}", end="")
print()

for i, true_label in enumerate(labels):
    print(f"{true_label:>8}", end="")
    for j in range(len(labels)):
        print(f"{cm[i][j]:>8}", end="")
    print()

üìä CONFUSION MATRIX - DIFFICULTY:
            easy    hard  medium
    easy       3       3      17
    hard       0      50       7
  medium       0      14      26


In [12]:
# Cell 12 - Classification Report cho Difficulty
print("üìã CLASSIFICATION REPORT - DIFFICULTY:")
print(classification_report(y_test, y_pred))

üìã CLASSIFICATION REPORT - DIFFICULTY:
              precision    recall  f1-score   support

        easy       1.00      0.13      0.23        23
        hard       0.75      0.88      0.81        57
      medium       0.52      0.65      0.58        40

    accuracy                           0.66       120
   macro avg       0.76      0.55      0.54       120
weighted avg       0.72      0.66      0.62       120



In [13]:
# Cell 13 - TopicClassifier Class
class TopicClassifier:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 3), min_df=2, max_df=0.8)
        self.model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
        self.label_to_id = {}
        self.id_to_label = {}
        self.is_trained = False
        
        self.subject_topics = {
            'physics': ['Dao ƒë·ªông c∆°', 'S√≥ng c∆°', 'ƒêi·ªán xoay chi·ªÅu', 'T·ª´ tr∆∞·ªùng', 'ƒêi·ªán tr∆∞·ªùng', 'Quang h·ªçc', 'C∆° h·ªçc', 'Nhi·ªát h·ªçc'],
            'chemistry': ['H√≥a h·ªØu c∆°', 'Este ‚Äì Lipit', 'ƒêi·ªán ph√¢n', 'C√¢n b·∫±ng h√≥a h·ªçc', 'Axit - Baz∆°', 'Oxi h√≥a - Kh·ª≠', 'Polime', 'Kim lo·∫°i'],
            'biology': ['Di truy·ªÅn h·ªçc', 'Ti·∫øn h√≥a', 'Sinh th√°i h·ªçc', 'T·∫ø b√†o h·ªçc', 'Sinh l√Ω h·ªçc', 'Ph√¢n lo·∫°i sinh v·∫≠t', 'Sinh h·ªçc ph√¢n t·ª≠', 'Mi·ªÖn d·ªãch h·ªçc']
        }
    
    def _create_topic_labels(self, data):
        topic_keywords = {
            'Dao ƒë·ªông c∆°': ['dao ƒë·ªông', 'chu k·ª≥', 't·∫ßn s·ªë', 'bi√™n ƒë·ªô', 'con l·∫Øc', 'l√≤ xo'],
            'ƒêi·ªán xoay chi·ªÅu': ['xoay chi·ªÅu', 'ƒëi·ªán √°p hi·ªáu d·ª•ng', 'd√≤ng ƒëi·ªán xoay chi·ªÅu', 'm√°y bi·∫øn √°p'],
            'H√≥a h·ªØu c∆°': ['ankan', 'anken', 'ankin', 'benzen', 'ancol', 'phenol', 'carbon'],
            'Di truy·ªÅn h·ªçc': ['gen', 'alen', 'NST', 'nhi·ªÖm s·∫Øc th·ªÉ', 'ADN', 'ARN', 'ƒë·ªôt bi·∫øn', 'lai']
        }
        
        topics = []
        for _, row in data.iterrows():
            subject = row['subject']
            question_text = row['question'].lower()
            
            subject_topic_list = self.subject_topics.get(subject, [])
            best_topic = 'Kh√°c'
            max_score = 0
            
            for topic in subject_topic_list:
                if topic in topic_keywords:
                    keywords = topic_keywords[topic]
                    score = sum(1 for keyword in keywords if keyword in question_text)
                    
                    if score > max_score:
                        max_score = score
                        best_topic = topic
            
            if best_topic == 'Kh√°c':
                if subject == 'physics':
                    best_topic = 'C∆° h·ªçc'
                elif subject == 'chemistry':
                    best_topic = 'H√≥a h·ªØu c∆°'
                elif subject == 'biology':
                    best_topic = 'T·∫ø b√†o h·ªçc'
            
            topics.append(best_topic)
        
        return topics
    
    def get_topics_by_subject(self, subject):
        return self.subject_topics.get(subject, ['Kh√°c'])

In [14]:
# Cell 14 - Kh·ªüi t·∫°o v√† train TopicClassifier
topic_classifier = TopicClassifier()
print("üè∑Ô∏è ƒêang t·∫°o nh√£n ch·ªß ƒë·ªÅ...")
topics = topic_classifier._create_topic_labels(raw_data)

print("üìä PH√ÇN PH·ªêI CH·ª¶ ƒê·ªÄ:")
topic_counts = pd.Series(topics).value_counts()
print(topic_counts.head(10))

üè∑Ô∏è ƒêang t·∫°o nh√£n ch·ªß ƒë·ªÅ...
üìä PH√ÇN PH·ªêI CH·ª¶ ƒê·ªÄ:
H√≥a h·ªØu c∆°         200
C∆° h·ªçc             103
T·∫ø b√†o h·ªçc         100
Di truy·ªÅn h·ªçc      100
Dao ƒë·ªông c∆°         72
ƒêi·ªán xoay chi·ªÅu     25
Name: count, dtype: int64


In [15]:
# Cell 15 - Ph√¢n t√≠ch topics theo subject
print("üìã CH·ª¶ ƒê·ªÄ THEO M√îN H·ªåC:")
data_with_topics = raw_data.copy()
data_with_topics['topic'] = topics

for subject in ['physics', 'chemistry', 'biology']:
    subject_data = data_with_topics[data_with_topics['subject'] == subject]
    print(f"\n{subject.upper()}:")
    print(subject_data['topic'].value_counts().head(5))

üìã CH·ª¶ ƒê·ªÄ THEO M√îN H·ªåC:

PHYSICS:
topic
C∆° h·ªçc             103
Dao ƒë·ªông c∆°         72
ƒêi·ªán xoay chi·ªÅu     25
Name: count, dtype: int64

CHEMISTRY:
topic
H√≥a h·ªØu c∆°    200
Name: count, dtype: int64

BIOLOGY:
topic
T·∫ø b√†o h·ªçc       100
Di truy·ªÅn h·ªçc    100
Name: count, dtype: int64


In [16]:
# Cell 16 - Train topic classification model
topic_texts = []
for _, row in raw_data.iterrows():
    options_text = ' '.join(row['options']) if row['options'] else ''
    full_text = row['question'] + ' ' + options_text
    topic_texts.append(full_text)

unique_topics = sorted(list(set(topics)))
topic_classifier.label_to_id = {label: idx for idx, label in enumerate(unique_topics)}
topic_classifier.id_to_label = {idx: label for label, idx in topic_classifier.label_to_id.items()}

X_topic = topic_classifier.vectorizer.fit_transform(topic_texts)
y_topic = [topic_classifier.label_to_id[topic] for topic in topics]

X_train_topic, X_test_topic, y_train_topic, y_test_topic = train_test_split(
    X_topic, y_topic, test_size=0.2, random_state=42, stratify=y_topic
)

topic_classifier.model.fit(X_train_topic, y_train_topic)
y_pred_topic = topic_classifier.model.predict(X_test_topic)

print("üéØ K·∫æT QU·∫¢ TOPIC CLASSIFICATION:")
print(f"Accuracy: {accuracy_score(y_test_topic, y_pred_topic):.4f}")
print(f"F1-Score: {f1_score(y_test_topic, y_pred_topic, average='weighted'):.4f}")

üéØ K·∫æT QU·∫¢ TOPIC CLASSIFICATION:
Accuracy: 0.8667
F1-Score: 0.8648


In [17]:
# Cell 17 - SimilarQuestionFinder Class
class SimilarQuestionFinder:
    def __init__(self, data):
        self.data = data
        self.train_data, self.test_data = train_test_split(
            data, test_size=0.2, random_state=42, stratify=data['subject']
        )
        
        self.vectorizer = TfidfVectorizer(
            max_features=2000, ngram_range=(1, 2), min_df=2, max_df=0.85, sublinear_tf=True
        )
        
        self.question_vectors = None
        self._prepare_vectors()
    
    def _prepare_vectors(self):
        train_texts = []
        for _, row in self.train_data.iterrows():
            full_text = row['question'] + ' ' + ' '.join(row['options']) if row['options'] else row['question']
            processed = clean_vietnamese_text(full_text, remove_stopwords=True, normalize=True)
            train_texts.append(processed)
        
        self.vectorizer.fit(train_texts)
        
        all_texts = []
        for _, row in self.data.iterrows():
            full_text = row['question'] + ' ' + ' '.join(row['options']) if row['options'] else row['question']
            processed = clean_vietnamese_text(full_text, remove_stopwords=True, normalize=True)
            all_texts.append(processed)
            
        self.question_vectors = self.vectorizer.transform(all_texts)
    
    def find_similar_questions(self, current_question_id, n_similar=3):
        try:
            current_idx = None
            for idx, (_, row) in enumerate(self.data.iterrows()):
                if row['id'] == current_question_id:
                    current_idx = idx
                    break
            
            if current_idx is None:
                return []
            
            current_vector = self.question_vectors[current_idx]
            similarities = cosine_similarity(current_vector, self.question_vectors).flatten()
            
            similar_questions = []
            for idx, similarity in enumerate(similarities):
                if idx != current_idx:
                    question_data = self.data.iloc[idx]
                    similar_questions.append({
                        'question_data': question_data,
                        'similarity': similarity,
                        'index': idx
                    })
            
            similar_questions.sort(key=lambda x: x['similarity'], reverse=True)
            return similar_questions[:n_similar]
            
        except Exception:
            return []

In [18]:
# Cell 18 - Kh·ªüi t·∫°o enhanced dataset
data_enhanced = raw_data.copy()
data_enhanced['difficulty'] = difficulties
data_enhanced['topic'] = topics

print("üìä ENHANCED DATASET:")
print(f"Shape: {data_enhanced.shape}")
print(f"Columns: {data_enhanced.columns.tolist()}")
data_enhanced.head()

üìä ENHANCED DATASET:
Shape: (600, 8)
Columns: ['id', 'question', 'options', 'answer', 'subject', 'explanation', 'difficulty', 'topic']


Unnamed: 0,id,question,options,answer,subject,explanation,difficulty,topic
0,MET_Bio_IE_2019_1,C√≥ th·ªÉ s·ª≠ d·ª•ng h√≥a ch·∫•t n√†o sau ƒë√¢y ƒë·ªÉ ph√°t hi...,"[A. Dung d·ªãch NaCl., B. Dung d·ªãch Ca(OH)2., C....",B,biology,C√≥ th·ªÉ ƒë∆∞·ª£c s·ª≠ d·ª•ng Ca(OH)2 ƒë·ªÉ ph√°t hi·ªán qu√° t...,medium,T·∫ø b√†o h·ªçc
1,MET_Bio_IE_2019_2,ƒê·ªông v·∫≠t n√†o sau ƒë√¢y trao ƒë·ªïi kh√≠ v·ªõi m√¥i tr∆∞·ªù...,"[A. Ch√¢u ch·∫•u., B. S∆∞ t·ª≠., C. Chu·ªôt., D. ·∫æch ƒë...",A,biology,Ch√¢u ch·∫•u trao ƒë·ªïi kh√≠ v·ªõi m√¥i tr∆∞·ªùng th√¥ng qu...,easy,T·∫ø b√†o h·ªçc
2,MET_Bio_IE_2019_3,Axit amin l√† ƒë∆°n ph√¢n c·∫•u t·∫°o n√™n ph√¢n t·ª≠ n√†o ...,"[A. ADN., B. mARN., C. tARN., D. Pr√¥t√™in.]",D,biology,Axit amin l√† ƒë∆°n ph√¢n c·∫•u t·∫°o n√™n ph√¢n t·ª≠ Pr√¥t...,easy,T·∫ø b√†o h·ªçc
3,MET_Bio_IE_2019_4,Ph√¢n t·ª≠ n√†o sau ƒë√¢y tr·ª±c ti·∫øp l√†m khu√¥n cho qu...,"[A. ADN., B. mARN., C. tARN., D. rARN.]",B,biology,Ph√¢n t·ª≠ mARM tr·ª±c ti·∫øp l√†m khu√¥n cho qu√° tr√¨nh...,medium,T·∫ø b√†o h·ªçc
4,MET_Bio_IE_2019_5,M·ªôt ph√¢n t·ª≠ ADN ·ªü vi khu·∫©n c√≥ 10% s·ªë nucl√™√¥tit...,"[A. 10%., B. 30%., C. 20%., D. 40%.]",D,biology,"Theo nguy√™n t·∫Øc b·ªï sung A = T, G = X n√™n %A +%...",medium,T·∫ø b√†o h·ªçc


In [19]:
# Cell 19 - Kh·ªüi t·∫°o SimilarQuestionFinder
similar_finder = SimilarQuestionFinder(data_enhanced)
print("üîç Similar Question Finder ƒë√£ ƒë∆∞·ª£c kh·ªüi t·∫°o")
print(f"üìä Training data: {len(similar_finder.train_data)} questions")
print(f"üìä Test data: {len(similar_finder.test_data)} questions")
print(f"üî§ Vocabulary size: {len(similar_finder.vectorizer.vocabulary_)}")

üîç Similar Question Finder ƒë√£ ƒë∆∞·ª£c kh·ªüi t·∫°o
üìä Training data: 480 questions
üìä Test data: 120 questions
üî§ Vocabulary size: 2000


In [20]:
# Cell 20 - Test similarity v·ªõi sample questions
print("üß™ TEST SIMILARITY V·ªöI SAMPLE QUESTIONS:")
sample_ids = data_enhanced['id'].head(3).tolist()

for i, question_id in enumerate(sample_ids):
    current_q = data_enhanced[data_enhanced['id'] == question_id].iloc[0]
    print(f"\n--- C√¢u {i+1} ---")
    print(f"ID: {question_id}")
    print(f"Subject: {current_q['subject']}")
    print(f"Difficulty: {current_q['difficulty']}")
    print(f"Topic: {current_q['topic']}")
    print(f"Question: {current_q['question'][:80]}...")
    
    similar_questions = similar_finder.find_similar_questions(question_id, n_similar=2)
    if similar_questions:
        print("Similar questions:")
        for j, similar in enumerate(similar_questions):
            sim_q = similar['question_data']
            print(f"  {j+1}. Similarity: {similar['similarity']:.3f} | Subject: {sim_q['subject']} | {sim_q['question'][:50]}...")

üß™ TEST SIMILARITY V·ªöI SAMPLE QUESTIONS:

--- C√¢u 1 ---
ID: MET_Bio_IE_2019_1
Subject: biology
Difficulty: medium
Topic: T·∫ø b√†o h·ªçc
Question: C√≥ th·ªÉ s·ª≠ d·ª•ng h√≥a ch·∫•t n√†o sau ƒë√¢y ƒë·ªÉ ph√°t hi·ªán qu√° tr√¨nh h√¥ h·∫•p ·ªü th·ª±c v·∫≠t th·∫£...
Similar questions:
  1. Similarity: 0.381 | Subject: biology | C√¢u 85. Nh√≥m th·ª±c v·∫≠t n√†o sau ƒë√¢y x·∫£y ra qu√° tr√¨nh...
  2. Similarity: 0.300 | Subject: biology | C√¢u 109. Khi n√≥i v·ªÅ h√¥ h·∫•p ·ªü th·ª±c v·∫≠t, c√≥ bao nhi√™...

--- C√¢u 2 ---
ID: MET_Bio_IE_2019_2
Subject: biology
Difficulty: easy
Topic: T·∫ø b√†o h·ªçc
Question: ƒê·ªông v·∫≠t n√†o sau ƒë√¢y trao ƒë·ªïi kh√≠ v·ªõi m√¥i tr∆∞·ªùng th√¥ng qua h·ªá th·ªëng ·ªëng kh√≠?...
Similar questions:
  1. Similarity: 0.616 | Subject: biology | ƒê·ªông v·∫≠t n√†o sau ƒë√¢y h√¥ h·∫•p b·∫±ng h·ªá th·ªëng ·ªëng kh√≠?...
  2. Similarity: 0.418 | Subject: biology | C√¢u 95. Sinh v·∫≠t n√†o sau ƒë√¢y c√≥ qu√° tr√¨nh trao ƒë·ªïi...

--- C√¢u 3 ---
ID: MET_Bio_IE_

In [21]:
# Cell 21 - Evaluate similarity performance
test_sample = similar_finder.test_data.sample(min(50, len(similar_finder.test_data)), random_state=42)
same_subject_correct = 0
total_tests = 0
within_subject_similarity = []
cross_subject_similarity = []

print("üìä ƒê√ÅNH GI√Å SIMILARITY PERFORMANCE:")

for _, test_question in test_sample.iterrows():
    similar_questions = similar_finder.find_similar_questions(test_question['id'], n_similar=3)
    
    if similar_questions:
        most_similar = similar_questions[0]
        if most_similar['question_data']['subject'] == test_question['subject']:
            same_subject_correct += 1
        
        for similar in similar_questions:
            sim_score = similar['similarity']
            if similar['question_data']['subject'] == test_question['subject']:
                within_subject_similarity.append(sim_score)
            else:
                cross_subject_similarity.append(sim_score)
        
        total_tests += 1

subject_accuracy = same_subject_correct / total_tests if total_tests > 0 else 0
print(f"Subject Accuracy: {subject_accuracy:.4f} ({same_subject_correct}/{total_tests})")

if within_subject_similarity:
    print(f"Within Subject Similarity: {np.mean(within_subject_similarity):.4f} ¬± {np.std(within_subject_similarity):.4f}")
if cross_subject_similarity:
    print(f"Cross Subject Similarity: {np.mean(cross_subject_similarity):.4f} ¬± {np.std(cross_subject_similarity):.4f}")

üìä ƒê√ÅNH GI√Å SIMILARITY PERFORMANCE:
Subject Accuracy: 0.9800 (49/50)
Within Subject Similarity: 0.4839 ¬± 0.1394
Cross Subject Similarity: 0.2409 ¬± 0.0483


In [22]:
# Cell 22 - Utility functions cho quiz
def get_random_question(data, subject=None, year=None, difficulty=None, topic=None):
    filtered_data = data.copy()
    
    if subject:
        filtered_data = filtered_data[filtered_data['subject'] == subject]
    if year:
        filtered_data = filtered_data[filtered_data['id'].str.contains(str(year), na=False)]
    if difficulty and 'difficulty' in filtered_data.columns:
        filtered_data = filtered_data[filtered_data['difficulty'] == difficulty]
    if topic and 'topic' in filtered_data.columns:
        filtered_data = filtered_data[filtered_data['topic'] == topic]
    
    if len(filtered_data) == 0:
        return None
    
    return filtered_data.sample(1).iloc[0]

def check_answer(user_answer, correct_answer):
    return user_answer.upper().strip() == correct_answer.upper().strip()

print("‚úÖ Utility functions ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a")

‚úÖ Utility functions ƒë√£ ƒë∆∞·ª£c ƒë·ªãnh nghƒ©a


In [23]:
# Cell 23 - Test filter functionality
print("üîç TEST FILTER FUNCTIONALITY:")

filter_tests = [
    {'subject': 'physics', 'difficulty': 'easy'},
    {'subject': 'chemistry', 'topic': 'H√≥a h·ªØu c∆°'},
    {'difficulty': 'hard'},
    {'subject': 'biology', 'difficulty': 'medium'}
]

for i, filters in enumerate(filter_tests):
    print(f"\nTest {i+1}: {filters}")
    question = get_random_question(data_enhanced, **filters)
    if question is not None:
        print(f"  ‚úÖ Found: {question['subject']} | {question['difficulty']} | {question['topic']}")
        print(f"  Question: {question['question'][:60]}...")
    else:
        print("  ‚ùå No questions found")

üîç TEST FILTER FUNCTIONALITY:

Test 1: {'subject': 'physics', 'difficulty': 'easy'}
  ‚úÖ Found: physics | easy | C∆° h·ªçc
  Question: C√¢u 17. Khi n√≥i v·ªÅ thuy·∫øt l∆∞·ª£ng t·ª≠ √°nh s√°ng, ph√°t bi·ªÉu n√†o s...

Test 2: {'subject': 'chemistry', 'topic': 'H√≥a h·ªØu c∆°'}
  ‚úÖ Found: chemistry | hard | H√≥a h·ªØu c∆°
  Question: Th√†nh ph·∫ßn ch√≠nh c·ªßa ƒë√° v√¥i l√† canxi cacbonat. C√¥ng th·ª©c c·ªßa...

Test 3: {'difficulty': 'hard'}
  ‚úÖ Found: chemistry | hard | H√≥a h·ªØu c∆°
  Question: ƒê·ªÉ ${m}$ gam h·ªón h·ª£p ${E}$ g·ªìm ${Al}$, ${Fe}$ v√† ${Cu}$ tron...

Test 4: {'subject': 'biology', 'difficulty': 'medium'}
  ‚úÖ Found: biology | medium | T·∫ø b√†o h·ªçc
  Question: M·ªôt lo√†i th·ª±c v·∫≠t, x√©t 2 c·∫∑p NST k√≠ hi·ªáu l√† A, a v√† B, b. C∆°...


In [24]:
# Cell 24 - Demo quiz flow
print("üéÆ DEMO QUIZ FLOW:")

# Get random question
current_question = get_random_question(data_enhanced, subject='physics')
if current_question is not None:
    print(f"üìù C√¢u h·ªèi ID: {current_question['id']}")
    print(f"üìö M√¥n: {current_question['subject']}")
    print(f"‚≠ê ƒê·ªô kh√≥: {current_question['difficulty']}")
    print(f"üìñ Ch·ªß ƒë·ªÅ: {current_question['topic']}")
    print(f"\n‚ùì C√¢u h·ªèi: {current_question['question']}")
    print(f"\nüìã C√°c ƒë√°p √°n:")
    for option in current_question['options']:
        print(f"  {option}")
    print(f"\n‚úÖ ƒê√°p √°n ƒë√∫ng: {current_question['answer']}")

üéÆ DEMO QUIZ FLOW:
üìù C√¢u h·ªèi ID: MET_Phy_IE_2019_17
üìö M√¥n: physics
‚≠ê ƒê·ªô kh√≥: hard
üìñ Ch·ªß ƒë·ªÅ: C∆° h·ªçc

‚ùì C√¢u h·ªèi: C√¢u 17. ƒê·∫∑t ƒëi·ªán √°p u = 200*cos (100*\pi*t) (V) v√†o hai ƒë·∫ßu ƒëo·∫°n m·∫°ch g·ªìm ƒëi·ªán tr·ªü 100 Ohm, cu·ªôn c·∫£m thu·∫ßn v√† t·ª• ƒëi·ªán m·∫Øc n·ªëi ti·∫øp. Bi·∫øt trong ƒëo·∫°n m·∫°ch c√≥ c·ªông h∆∞·ªüng ƒëi·ªán. C∆∞·ªùng ƒë·ªô hi·ªáu d·ª•ng c·ªßa d√≤ng ƒëi·ªán trong ƒëo·∫°n m·∫°ch l√†

üìã C√°c ƒë√°p √°n:
  A. 2\sqrt{2} A.
  B. \sqrt{2} A.
  C. 2 A.
  D. 1 A.

‚úÖ ƒê√°p √°n ƒë√∫ng: B


In [25]:
# Cell 25 - Test answer checking
user_answers = ['A', 'B', 'C', 'D']
correct_answer = current_question['answer']

print(f"\nüß™ TEST ANSWER CHECKING:")
print(f"Correct answer: {correct_answer}")

for answer in user_answers:
    is_correct = check_answer(answer, correct_answer)
    result = "‚úÖ ƒê√∫ng" if is_correct else "‚ùå Sai"
    print(f"User answer '{answer}': {result}")


üß™ TEST ANSWER CHECKING:
Correct answer: B
User answer 'A': ‚ùå Sai
User answer 'B': ‚úÖ ƒê√∫ng
User answer 'C': ‚ùå Sai
User answer 'D': ‚ùå Sai


In [26]:
# Cell 26 - Test similar questions cho demo question
print(f"\nüîç T√åM C√ÇU H·ªéI T∆Ø∆†NG T·ª∞:")
similar_questions = similar_finder.find_similar_questions(current_question['id'], n_similar=3)

if similar_questions:
    print(f"T√¨m th·∫•y {len(similar_questions)} c√¢u h·ªèi t∆∞∆°ng t·ª±:")
    for i, similar in enumerate(similar_questions, 1):
        similar_q = similar['question_data']
        similarity_score = similar['similarity']
        
        print(f"\n{i}. ƒê·ªô t∆∞∆°ng ƒë·ªìng: {similarity_score:.3f}")
        print(f"   Subject: {similar_q['subject']} | Difficulty: {similar_q['difficulty']} | Topic: {similar_q['topic']}")
        print(f"   Question: {similar_q['question'][:80]}...")
        print(f"   Answer: {similar_q['answer']}")
else:
    print("‚ùå Kh√¥ng t√¨m th·∫•y c√¢u h·ªèi t∆∞∆°ng t·ª±")


üîç T√åM C√ÇU H·ªéI T∆Ø∆†NG T·ª∞:
T√¨m th·∫•y 3 c√¢u h·ªèi t∆∞∆°ng t·ª±:

1. ƒê·ªô t∆∞∆°ng ƒë·ªìng: 0.630
   Subject: physics | Difficulty: hard | Topic: ƒêi·ªán xoay chi·ªÅu
   Question: C√¢u 36. ƒê·∫∑t ƒëi·ªán √°p xoay chi·ªÅu u = 60*\sqrt {2}*cos(100*\pi*t) (V) (t t√≠nh b·∫±ng ...
   Answer: A

2. ƒê·ªô t∆∞∆°ng ƒë·ªìng: 0.615
   Subject: physics | Difficulty: hard | Topic: C∆° h·ªçc
   Question: C√¢u 39. Cho ƒëo·∫°n m·∫°ch AB g·ªìm cu·ªôn c·∫£m thu·∫ßn L, ƒëi·ªán tr·ªü R = 50 Ohm v√† t·ª• ƒëi·ªán m·∫Ø...
   Answer: D

3. ƒê·ªô t∆∞∆°ng ƒë·ªìng: 0.603
   Subject: physics | Difficulty: hard | Topic: ƒêi·ªán xoay chi·ªÅu
   Question: ƒê·∫∑t ƒëi·ªán √°p xoay chi·ªÅu u = U\sqrt {2}cos (\omega t) (\omega>0) v√†o hai ƒë·∫ßu m·ªôt ƒë...
   Answer: C


In [27]:
# Cell 27 - Score tracking demo
score_tracker = ScoreTracker()
print("üìä DEMO SCORE TRACKING:")

# Simulate some answers
test_results = [True, False, True, True, False, True, False, True, True, False]

for i, result in enumerate(test_results, 1):
    score_tracker.add_result(result)
    print(f"C√¢u {i}: {'‚úÖ' if result else '‚ùå'} | Score: {score_tracker.correct}/{score_tracker.total} ({score_tracker.get_accuracy():.1f}%)")

üìä DEMO SCORE TRACKING:
C√¢u 1: ‚úÖ | Score: 1/1 (100.0%)
C√¢u 2: ‚ùå | Score: 1/2 (50.0%)
C√¢u 3: ‚úÖ | Score: 2/3 (66.7%)
C√¢u 4: ‚úÖ | Score: 3/4 (75.0%)
C√¢u 5: ‚ùå | Score: 3/5 (60.0%)
C√¢u 6: ‚úÖ | Score: 4/6 (66.7%)
C√¢u 7: ‚ùå | Score: 4/7 (57.1%)
C√¢u 8: ‚úÖ | Score: 5/8 (62.5%)
C√¢u 9: ‚úÖ | Score: 6/9 (66.7%)
C√¢u 10: ‚ùå | Score: 6/10 (60.0%)


In [28]:
# Cell 28 - Statistics summary
print("üìà TH·ªêNG K√ä T·ªîNG QUAN H·ªÜ TH·ªêNG:")
print(f"üìä T·ªïng c√¢u h·ªèi: {len(data_enhanced)}")
print(f"üìö S·ªë m√¥n h·ªçc: {data_enhanced['subject'].nunique()}")
print(f"‚≠ê S·ªë m·ª©c ƒë·ªô kh√≥: {data_enhanced['difficulty'].nunique()}")
print(f"üìñ S·ªë ch·ªß ƒë·ªÅ: {data_enhanced['topic'].nunique()}")

print(f"\nüìä Ph√¢n ph·ªëi m√¥n h·ªçc:")
print(data_enhanced['subject'].value_counts())

print(f"\n‚≠ê Ph√¢n ph·ªëi ƒë·ªô kh√≥:")
print(data_enhanced['difficulty'].value_counts())

print(f"\nüìñ Top 10 ch·ªß ƒë·ªÅ ph·ªï bi·∫øn:")
print(data_enhanced['topic'].value_counts().head(10))

üìà TH·ªêNG K√ä T·ªîNG QUAN H·ªÜ TH·ªêNG:
üìä T·ªïng c√¢u h·ªèi: 600
üìö S·ªë m√¥n h·ªçc: 3
‚≠ê S·ªë m·ª©c ƒë·ªô kh√≥: 3
üìñ S·ªë ch·ªß ƒë·ªÅ: 6

üìä Ph√¢n ph·ªëi m√¥n h·ªçc:
subject
biology      200
chemistry    200
physics      200
Name: count, dtype: int64

‚≠ê Ph√¢n ph·ªëi ƒë·ªô kh√≥:
difficulty
hard      282
medium    202
easy      116
Name: count, dtype: int64

üìñ Top 10 ch·ªß ƒë·ªÅ ph·ªï bi·∫øn:
topic
H√≥a h·ªØu c∆°         200
C∆° h·ªçc             103
T·∫ø b√†o h·ªçc         100
Di truy·ªÅn h·ªçc      100
Dao ƒë·ªông c∆°         72
ƒêi·ªán xoay chi·ªÅu     25
Name: count, dtype: int64


In [29]:
# Cell 29 - Model performance summary
print("ü§ñ T·ªîNG K·∫æT PERFORMANCE C√ÅC MODEL:")

print(f"\nüéØ DIFFICULTY CLASSIFIER:")
print(f"  - Algorithm: Random Forest + TF-IDF")
print(f"  - Features: 15 NLP features + text vectorization")
print(f"  - Classes: {difficulty_counts.index.tolist()}")
print(f"  - Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"  - F1-Score (weighted): {f1_score(y_test, y_pred, average='weighted'):.4f}")

print(f"\nüè∑Ô∏è TOPIC CLASSIFIER:")
print(f"  - Algorithm: Random Forest + TF-IDF")
print(f"  - Topics per subject: Physics({len(topic_classifier.subject_topics['physics'])}), Chemistry({len(topic_classifier.subject_topics['chemistry'])}), Biology({len(topic_classifier.subject_topics['biology'])})")
print(f"  - Total unique topics: {len(unique_topics)}")
print(f"  - Test Accuracy: {accuracy_score(y_test_topic, y_pred_topic):.4f}")
print(f"  - F1-Score (weighted): {f1_score(y_test_topic, y_pred_topic, average='weighted'):.4f}")

print(f"\nüîç SIMILAR QUESTION FINDER:")
print(f"  - Algorithm: TF-IDF + Cosine Similarity")
print(f"  - Vocabulary size: {len(similar_finder.vectorizer.vocabulary_)}")
print(f"  - Subject accuracy: {subject_accuracy:.4f}")
if within_subject_similarity:
    print(f"  - Within-subject similarity: {np.mean(within_subject_similarity):.4f}")
if cross_subject_similarity:
    print(f"  - Cross-subject similarity: {np.mean(cross_subject_similarity):.4f}")

ü§ñ T·ªîNG K·∫æT PERFORMANCE C√ÅC MODEL:

üéØ DIFFICULTY CLASSIFIER:
  - Algorithm: Random Forest + TF-IDF
  - Features: 15 NLP features + text vectorization
  - Classes: ['hard', 'medium', 'easy']
  - Test Accuracy: 0.6583
  - F1-Score (weighted): 0.6199

üè∑Ô∏è TOPIC CLASSIFIER:
  - Algorithm: Random Forest + TF-IDF
  - Topics per subject: Physics(8), Chemistry(8), Biology(8)
  - Total unique topics: 6
  - Test Accuracy: 0.8667
  - F1-Score (weighted): 0.8648

üîç SIMILAR QUESTION FINDER:
  - Algorithm: TF-IDF + Cosine Similarity
  - Vocabulary size: 2000
  - Subject accuracy: 0.9800
  - Within-subject similarity: 0.4839
  - Cross-subject similarity: 0.2409


In [30]:
# Cell 30 - System readiness check
print("‚úÖ KI·ªÇM TRA S·∫¥N S√ÄNG H·ªÜ TH·ªêNG:")

checks = [
    ("Data loaded", len(data_enhanced) > 0),
    ("Difficulty labels created", len(difficulties) == len(data_enhanced)),
    ("Topic labels created", len(topics) == len(data_enhanced)),
    ("Difficulty model trained", difficulty_classifier.model is not None),
    ("Topic model trained", topic_classifier.model is not None),
    ("Similar finder ready", similar_finder.question_vectors is not None),
    ("Enhanced dataset ready", 'difficulty' in data_enhanced.columns and 'topic' in data_enhanced.columns),
    ("Filter functions work", get_random_question(data_enhanced) is not None),
    ("Answer checking works", check_answer('A', 'A') == True),
    ("Score tracking works", score_tracker.get_accuracy() > 0)
]

all_passed = True
for check_name, passed in checks:
    status = "‚úÖ" if passed else "‚ùå"
    print(f"{status} {check_name}")
    if not passed:
        all_passed = False

print(f"\n{'üéâ H·ªÜ TH·ªêNG S·∫¥N S√ÄNG!' if all_passed else '‚ö†Ô∏è C√ì L·ªñI C·∫¶N KH·∫ÆC PH·ª§C!'}")

if all_passed:
    print("\nüöÄ C√≥ th·ªÉ ch·∫°y Streamlit app v·ªõi l·ªánh:")
    print("streamlit run main.py")

‚úÖ KI·ªÇM TRA S·∫¥N S√ÄNG H·ªÜ TH·ªêNG:
‚úÖ Data loaded
‚úÖ Difficulty labels created
‚úÖ Topic labels created
‚úÖ Difficulty model trained
‚úÖ Topic model trained
‚úÖ Similar finder ready
‚úÖ Enhanced dataset ready
‚úÖ Filter functions work
‚úÖ Answer checking works
‚úÖ Score tracking works

üéâ H·ªÜ TH·ªêNG S·∫¥N S√ÄNG!

üöÄ C√≥ th·ªÉ ch·∫°y Streamlit app v·ªõi l·ªánh:
streamlit run main.py
