# 🚀 Complete Dataset Training - Advanced Question Generation AI

Notebook này sẽ train toàn bộ dữ liệu từ:
- **datasets/**: Hàng trăm file CSV với form data
- **question_datasets/**: Hàng trăm file CSV với real questions

Mục tiêu: Tạo AI model mạnh mẽ có thể generate câu hỏi thông minh từ keywords!

## 📚 Bước 1: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
import json
from datetime import datetime
import time
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Text Processing
import re
from collections import Counter, defaultdict

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8')

# Progress tracking
from tqdm import tqdm
tqdm.pandas()

print("‚úÖ All libraries imported successfully!")
print(f"üìÖ Training started at: {datetime.now()}")

## 🗂️ Bước 2: Set Up Data Paths and Configuration

In [None]:
# Define data paths
# NOTE(review): the project root is a hard-coded absolute path and contains a
# space right after "/home/dtu/" - confirm this matches the real directory name.
BASE_PATH = Path("/home/dtu/ form-agent-AI-project/form-agent-AI-project")
DATASETS_PATH = BASE_PATH / "datasets"
QUESTION_DATASETS_PATH = BASE_PATH / "question_datasets"
MODELS_PATH = BASE_PATH / "models"

# Create models directory if not exists
# (parents=True is not passed, so this raises FileNotFoundError when BASE_PATH
# itself does not exist)
MODELS_PATH.mkdir(exist_ok=True)

# Training configuration
CONFIG = {
    'max_samples_per_batch': 50000,  # Cap on rows sampled from each file, to avoid memory overflow
    'total_max_samples': 1000000,    # Upper bound on the total number of samples loaded
    'test_size': 0.2,                # Fraction of the data held out for evaluation
    'random_state': 42,              # Seed passed to the train/test split
    'min_question_length': 10,       # Minimum cleaned question length, in characters
    'max_question_length': 200,      # Maximum cleaned question length, in characters
    'categories': ['it', 'economics', 'marketing']  # Only rows with these categories are kept for training
}

# Echo the resolved paths and settings so the run is self-describing
print(f"üìÇ Dataset path: {DATASETS_PATH}")
print(f"üìÇ Question datasets path: {QUESTION_DATASETS_PATH}")
print(f"üíæ Models will be saved to: {MODELS_PATH}")
print(f"‚öôÔ∏è Configuration: {CONFIG}")

## 🔍 Bước 3: Load and Explore Dataset Structure

In [None]:
# Find all CSV files in both directories.
# Path.glob() yields entries in arbitrary, filesystem-dependent order; sorting
# makes "the first file" below - and any later slice of these lists - the same
# on every run and every machine.
dataset_files = sorted(DATASETS_PATH.glob("*.csv"))
question_files = sorted(QUESTION_DATASETS_PATH.glob("*.csv"))

print(f"üìä Found {len(dataset_files)} files in datasets/")
print(f"üìä Found {len(question_files)} files in question_datasets/")

# Load sample files to understand structure
# (`display` is the rich-output helper provided by the notebook environment)
if dataset_files:
    sample_dataset = pd.read_csv(dataset_files[0])
    print(f"\nüîç Sample from datasets/ ({dataset_files[0].name}):")
    print(f"   Shape: {sample_dataset.shape}")
    print(f"   Columns: {list(sample_dataset.columns)}")
    display(sample_dataset.head(3))

if question_files:
    sample_questions = pd.read_csv(question_files[0])
    print(f"\nüîç Sample from question_datasets/ ({question_files[0].name}):")
    print(f"   Shape: {sample_questions.shape}")
    print(f"   Columns: {list(sample_questions.columns)}")
    display(sample_questions.head(3))

## 🧹 Bước 4: Data Preprocessing and Cleaning Functions

In [None]:
def clean_text(text):
    """Clean and normalize a single text value.

    The value is converted to ``str``, lower-cased, stripped of every character
    that is not a word character, whitespace, a Latin/Vietnamese letter in the
    range U+00C0-U+1EF9, ``?`` or ``.``, and finally has all whitespace runs
    collapsed to a single space.

    Args:
        text: Any scalar value (string, number, ``None``, ``NaN``).

    Returns:
        The normalized string, or ``""`` for missing or empty input.
    """
    if pd.isna(text) or text == "":
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Remove special characters but keep Vietnamese
    text = re.sub(r'[^\w\s\u00C0-\u1EF9\?\.]', ' ', text)

    # Collapse whitespace AFTER the substitution above: each removed character
    # becomes a space, so collapsing first would leave runs of spaces behind
    # (e.g. "a - b" -> "a   b").
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def extract_questions_from_form_data(df):
    """Build template questions from a form-data frame.

    For every row whose cleaned ``field_name`` is longer than 5 characters,
    three template questions are emitted, all sharing that row's keyword and
    lower-cased category.

    Args:
        df: DataFrame expected to contain ``category`` and ``field_name``
            columns.

    Returns:
        DataFrame with columns ``question``, ``keyword``, ``category`` and
        ``source`` (always ``'form_data'``). It is empty, but still carries
        those columns, when the input lacks a required column or yields no
        usable field names.
    """
    output_columns = ['question', 'keyword', 'category', 'source']

    # Column presence is a property of the frame, not of each row: check once
    if 'category' not in df.columns or 'field_name' not in df.columns:
        return pd.DataFrame(columns=output_columns)

    questions_data = []
    skipped_rows = 0

    for raw_category, raw_field_name in zip(df['category'], df['field_name']):
        try:
            category = str(raw_category).lower()
            field_name = clean_text(raw_field_name)
        except (TypeError, ValueError):
            # Best-effort: a cell that cannot be normalized is skipped rather
            # than aborting the whole file, but it is counted, not hidden
            skipped_rows += 1
            continue

        # Generate questions only from reasonably descriptive field names
        if len(field_name) <= 5:
            continue

        for template in (
            f"What is {field_name}?",
            f"How do you define {field_name}?",
            f"What are the key aspects of {field_name}?",
        ):
            questions_data.append({
                'question': template,
                'keyword': field_name,
                'category': category,
                'source': 'form_data'
            })

    if skipped_rows:
        print(f"   Skipped {skipped_rows:,} form rows that could not be parsed")

    return pd.DataFrame(questions_data, columns=output_columns)

def process_question_data(df):
    """Clean and validate rows of a direct question dataset.

    A row is kept when its cleaned question and keyword are both non-empty and
    the question length lies within ``CONFIG['min_question_length']`` and
    ``CONFIG['max_question_length']`` (inclusive).

    Args:
        df: DataFrame expected to contain ``question`` and ``keyword`` columns.
            A missing ``category`` column defaults every row to ``'it'``.

    Returns:
        DataFrame with columns ``question``, ``keyword``, ``category`` and
        ``source`` (always ``'direct_questions'``). It is empty, but still
        carries those columns, when no row validates.

    Raises:
        KeyError: If the length limits are missing from ``CONFIG``.
    """
    output_columns = ['question', 'keyword', 'category', 'source']

    # Without both text columns no row can pass validation
    if 'question' not in df.columns or 'keyword' not in df.columns:
        return pd.DataFrame(columns=output_columns)

    # Loop-invariant configuration, looked up once
    min_length = CONFIG['min_question_length']
    max_length = CONFIG['max_question_length']

    if 'category' in df.columns:
        raw_categories = df['category']
    else:
        raw_categories = ['it'] * len(df)

    processed_data = []
    skipped_rows = 0

    for raw_question, raw_keyword, raw_category in zip(
            df['question'], df['keyword'], raw_categories):
        try:
            question = clean_text(raw_question)
            keyword = clean_text(raw_keyword)
            category = str(raw_category).lower()
        except (TypeError, ValueError):
            # Best-effort: skip cells that cannot be normalized, but count them
            skipped_rows += 1
            continue

        # Validate data
        if not (question and keyword):
            continue
        if not (min_length <= len(question) <= max_length):
            continue

        processed_data.append({
            'question': question,
            'keyword': keyword,
            'category': category,
            'source': 'direct_questions'
        })

    if skipped_rows:
        print(f"   Skipped {skipped_rows:,} question rows that could not be parsed")

    return pd.DataFrame(processed_data, columns=output_columns)

print("‚úÖ Data preprocessing functions defined!")

## 📂 Bước 5: Load All CSV Files - Batch Processing

In [None]:
def _load_file_group(file_paths, processor, done_label, collected, total_loaded):
    """Read each CSV in ``file_paths``, run ``processor`` on it and collect the result.

    Args:
        file_paths: Iterable of ``Path`` objects pointing at CSV files.
        processor: Callable turning a raw DataFrame into a question DataFrame.
        done_label: Word used in the per-file progress line
            (``"extracted"`` or ``"loaded"``).
        collected: List that non-empty processed frames are appended to.
        total_loaded: Running count of records loaded so far.

    Returns:
        The updated running count of loaded records.
    """
    for file_path in tqdm(file_paths):
        try:
            df = pd.read_csv(file_path)

            # Limit samples per file
            if len(df) > CONFIG['max_samples_per_batch']:
                df = df.sample(
                    n=CONFIG['max_samples_per_batch'],
                    random_state=CONFIG['random_state'],
                )

            processed_df = processor(df)

            if not processed_df.empty:
                collected.append(processed_df)
                total_loaded += len(processed_df)

            print(f"   ‚úÖ {file_path.name}: {len(processed_df):,} questions {done_label}")

            # Stop if we've reached max samples
            if total_loaded >= CONFIG['total_max_samples']:
                print(f"   üõë Reached maximum samples limit: {CONFIG['total_max_samples']:,}")
                break

        except Exception as e:
            # Top-level boundary for one file: report it and move on to the next
            print(f"   ‚ùå Error processing {file_path.name}: {e}")
            continue

    return total_loaded


def load_all_datasets(max_dataset_files=20, max_question_files=10):
    """Load and combine CSV files from both data directories.

    Form-data files (``dataset_files``) are converted with
    ``extract_questions_from_form_data``; direct question files
    (``question_files``) with ``process_question_data``. Loading stops as soon
    as ``CONFIG['total_max_samples']`` records have been collected, and the
    question files are not read at all when the form files already reached
    that limit.

    Args:
        max_dataset_files: Number of form-data files to read (default 20, the
            original demo limit). ``None`` reads every file.
        max_question_files: Number of question files to read (default 10).
            ``None`` reads every file.

    Returns:
        One DataFrame with all processed records, or an empty DataFrame when
        nothing could be loaded.
    """
    all_data = []

    print("üöÄ Loading datasets from datasets/ folder...")
    total_loaded = _load_file_group(
        dataset_files[:max_dataset_files],
        extract_questions_from_form_data,
        "extracted",
        all_data,
        0,
    )

    print("\nüöÄ Loading question datasets from question_datasets/ folder...")
    if total_loaded < CONFIG['total_max_samples']:
        total_loaded = _load_file_group(
            question_files[:max_question_files],
            process_question_data,
            "loaded",
            all_data,
            total_loaded,
        )

    # Combine all data
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        print(f"\n‚úÖ Total combined data: {len(combined_df):,} records")
        return combined_df

    print("‚ùå No data loaded!")
    return pd.DataFrame()

# Load all data
print("üöÄ Starting comprehensive data loading...")
start_time = time.time()

# Empty DataFrame when no file produced any record
master_dataset = load_all_datasets()

end_time = time.time()
print(f"\n‚è±Ô∏è Data loading completed in {end_time - start_time:.2f} seconds")

# Summarize only when something was loaded; an empty frame has none of the
# columns referenced below
if not master_dataset.empty:
    print(f"\nüìä Master Dataset Summary:")
    print(f"   Total records: {len(master_dataset):,}")
    print(f"   Unique questions: {master_dataset['question'].nunique():,}")
    print(f"   Unique keywords: {master_dataset['keyword'].nunique():,}")
    print(f"   Categories: {master_dataset['category'].value_counts().to_dict()}")
    print(f"   Sources: {master_dataset['source'].value_counts().to_dict()}")

## 🔧 Bước 6: Feature Engineering and Text Processing

In [None]:
# Remove duplicates and clean data
print("üßπ Cleaning and deduplicating data...")

# An empty frame has no 'question' column; fail with a clear message instead
# of an opaque KeyError from drop_duplicates below
if master_dataset.empty:
    raise ValueError(
        "master_dataset is empty - no records were loaded; "
        "check the data paths and the loading step"
    )

initial_size = len(master_dataset)

# Remove duplicates based on question content
master_dataset = master_dataset.drop_duplicates(subset=['question'], keep='first')
print(f"   Removed {initial_size - len(master_dataset):,} duplicate questions")

# Filter valid categories.
# .copy() makes the filtered frame independent, so the column assignments
# below write to a real frame rather than to a slice of the previous one.
master_dataset = master_dataset[master_dataset['category'].isin(CONFIG['categories'])].copy()
print(f"   Filtered to valid categories: {len(master_dataset):,} records")

# Create features for machine learning
print("\nüîß Creating features for ML training...")

# Text features from questions
master_dataset['question_length'] = master_dataset['question'].str.len()
master_dataset['word_count'] = master_dataset['question'].str.split().str.len()
master_dataset['keyword_length'] = master_dataset['keyword'].str.len()

# Encode categories as integers (stored alongside the string labels)
label_encoder = LabelEncoder()
master_dataset['category_encoded'] = label_encoder.fit_transform(master_dataset['category'])

print(f"‚úÖ Feature engineering completed!")
print(f"   Question length stats: {master_dataset['question_length'].describe()}")
print(f"   Word count stats: {master_dataset['word_count'].describe()}")

# Display final dataset info
display(master_dataset.head())
print(f"\nüìä Final dataset shape: {master_dataset.shape}")

## 📊 Bước 7: Data Visualization and Analysis

In [None]:
# Create visualizations: a 2x2 grid of summary plots for the cleaned dataset
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Category distribution (top-left)
master_dataset['category'].value_counts().plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Distribution of Categories')
axes[0,0].set_xlabel('Category')
axes[0,0].set_ylabel('Count')

# Question length distribution (top-right)
master_dataset['question_length'].hist(bins=50, ax=axes[0,1], color='lightgreen')
axes[0,1].set_title('Distribution of Question Lengths')
axes[0,1].set_xlabel('Question Length (characters)')
axes[0,1].set_ylabel('Frequency')

# Word count distribution (bottom-left)
master_dataset['word_count'].hist(bins=30, ax=axes[1,0], color='orange')
axes[1,0].set_title('Distribution of Word Count per Question')
axes[1,0].set_xlabel('Word Count')
axes[1,0].set_ylabel('Frequency')

# Source distribution (bottom-right)
master_dataset['source'].value_counts().plot(kind='pie', ax=axes[1,1], autopct='%1.1f%%')
axes[1,1].set_title('Distribution of Data Sources')

plt.tight_layout()
plt.show()

# Print detailed statistics, one section per configured category
print("\nüìà Detailed Dataset Statistics:")
print("=" * 50)
for category in CONFIG['categories']:
    cat_data = master_dataset[master_dataset['category'] == category]
    print(f"\n{category.upper()}:")
    print(f"   Records: {len(cat_data):,}")
    # The means below are NaN when the category has no records
    print(f"   Avg question length: {cat_data['question_length'].mean():.1f}")
    print(f"   Avg word count: {cat_data['word_count'].mean():.1f}")
    print(f"   Unique keywords: {cat_data['keyword'].nunique():,}")

## 🔀 Bước 8: Prepare Training and Validation Sets

In [None]:
# Prepare features and targets
print("üîÄ Splitting data into training and validation sets...")

# Features: use keywords and questions; the target is the string category label
X = master_dataset[['keyword', 'question']]
y = master_dataset['category']

# Split the data (stratified on the category label)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=CONFIG['test_size'], 
    random_state=CONFIG['random_state'],
    stratify=y
)

print(f"‚úÖ Data split completed:")
print(f"   Training set: {len(X_train):,} samples")
print(f"   Test set: {len(X_test):,} samples")
print(f"   Training categories: {y_train.value_counts().to_dict()}")
print(f"   Test categories: {y_test.value_counts().to_dict()}")

# Create TF-IDF features for keywords and questions
print("\nüîß Creating TF-IDF features...")

# NOTE(review): stop_words='english' only filters English stop words, while the
# cleaning step deliberately keeps Vietnamese characters - confirm the corpus
# language before relying on this setting.

# TF-IDF for keywords (unigrams and bigrams)
keyword_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

# TF-IDF for questions (unigrams up to trigrams)
question_vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    stop_words='english'
)

# Fit and transform: vocabularies are learned from the training split only,
# then applied unchanged to the test split
X_train_keyword_tfidf = keyword_vectorizer.fit_transform(X_train['keyword'])
X_train_question_tfidf = question_vectorizer.fit_transform(X_train['question'])

X_test_keyword_tfidf = keyword_vectorizer.transform(X_test['keyword'])
X_test_question_tfidf = question_vectorizer.transform(X_test['question'])

print(f"   Keyword TF-IDF shape: {X_train_keyword_tfidf.shape}")
print(f"   Question TF-IDF shape: {X_train_question_tfidf.shape}")

# Combine features: keyword columns first, question columns second
from scipy.sparse import hstack
X_train_combined = hstack([X_train_keyword_tfidf, X_train_question_tfidf])
X_test_combined = hstack([X_test_keyword_tfidf, X_test_question_tfidf])

print(f"   Combined features shape: {X_train_combined.shape}")
print("‚úÖ Feature preparation completed!")

## 🤖 Bước 9: Model Architecture Setup and Training

In [None]:
# Initialize multiple models for comparison.
# Seeds come from CONFIG so the whole notebook is controlled by one setting.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=20,
        random_state=CONFIG['random_state'],
        n_jobs=-1
    ),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=CONFIG['random_state'],
        n_jobs=-1
    )
}

print("ü§ñ Training multiple models...")
print("=" * 50)

trained_models = {}   # model name -> fitted estimator
model_results = {}    # model name -> accuracy, training time, test predictions

for model_name, model in models.items():
    print(f"\nüî• Training {model_name}...")

    # Time the fit alone: the reported "training time" must not include
    # prediction and scoring on the test set
    fit_started = time.perf_counter()
    model.fit(X_train_combined, y_train)
    training_time = time.perf_counter() - fit_started

    # Make predictions
    y_pred = model.predict(X_test_combined)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    trained_models[model_name] = model
    model_results[model_name] = {
        'accuracy': accuracy,
        'training_time': training_time,
        'predictions': y_pred
    }

    print(f"   ‚úÖ {model_name} completed!")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Training time: {training_time:.2f} seconds")

# Find best model (highest test accuracy)
best_model_name = max(model_results, key=lambda x: model_results[x]['accuracy'])
best_model = trained_models[best_model_name]
best_accuracy = model_results[best_model_name]['accuracy']

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"üéØ Best Accuracy: {best_accuracy:.4f}")

## 📊 Bước 10: Model Evaluation and Detailed Analysis

In [None]:
# Detailed evaluation of the best model
print(f"üìä Detailed Evaluation of {best_model_name}:")
print("=" * 60)

best_predictions = model_results[best_model_name]['predictions']

# Classification Report
print("\nüìã Classification Report:")
print(classification_report(y_test, best_predictions))

# Confusion Matrix
# Pin the row/column order to CONFIG['categories']. Without `labels`,
# confusion_matrix orders classes by sorted label value, which does not match
# the tick labels used below and silently mislabels the heatmap.
cm = confusion_matrix(y_test, best_predictions, labels=CONFIG['categories'])
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=CONFIG['categories'], 
            yticklabels=CONFIG['categories'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Model comparison visualization
plt.figure(figsize=(12, 6))

# Accuracy comparison
plt.subplot(1, 2, 1)
model_names = list(model_results.keys())
accuracies = [model_results[name]['accuracy'] for name in model_names]
plt.bar(model_names, accuracies, color=['skyblue', 'lightgreen'])
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Training time comparison
plt.subplot(1, 2, 2)
training_times = [model_results[name]['training_time'] for name in model_names]
plt.bar(model_names, training_times, color=['orange', 'pink'])
plt.title('Training Time Comparison')
plt.ylabel('Time (seconds)')

plt.tight_layout()
plt.show()

# Performance by category
print("\nüìà Performance by Category:")
for category in CONFIG['categories']:
    # Plain boolean array so it indexes both the pandas labels and the
    # numpy predictions positionally
    category_mask = (y_test == category).to_numpy()
    category_count = category_mask.sum()

    # A configured category may have no test samples; accuracy is undefined
    if category_count == 0:
        print(f"   {category.upper()}: n/a (n=0)")
        continue

    category_accuracy = accuracy_score(y_test[category_mask], best_predictions[category_mask])
    print(f"   {category.upper()}: {category_accuracy:.4f} (n={category_count})")

## 🔮 Bước 11: Create Question Generation System

In [None]:
# Create a comprehensive question generation system
class AdvancedQuestionGenerator:
    """Generate questions for a keyword from similar training examples.

    A category classifier labels the keyword, a cosine nearest-neighbour index
    over keyword TF-IDF vectors retrieves training questions to adapt, and
    category-specific templates are used as a fallback.
    """

    def __init__(self, classifier, keyword_vectorizer, question_vectorizer, 
                 training_data, label_encoder):
        """Store the fitted components and build the keyword similarity index.

        Args:
            classifier: Fitted estimator exposing ``predict`` and
                ``predict_proba``.
            keyword_vectorizer: Fitted vectorizer for keywords.
            question_vectorizer: Fitted vectorizer for questions.
            training_data: DataFrame with ``keyword``, ``question`` and
                ``category`` columns.
            label_encoder: Fitted label encoder (stored, not used by the
                methods of this class).
        """
        self.classifier = classifier
        self.keyword_vectorizer = keyword_vectorizer
        self.question_vectorizer = question_vectorizer
        self.training_data = training_data
        self.label_encoder = label_encoder

        # Build similarity index
        self.similarity_model = NearestNeighbors(
            n_neighbors=10, 
            metric='cosine'
        )

        # Fit on training keywords
        training_keyword_tfidf = keyword_vectorizer.transform(training_data['keyword'])
        self.similarity_model.fit(training_keyword_tfidf)

    def predict_category(self, keyword):
        """Predict the category of a keyword.

        Returns:
            Tuple ``(category, confidence)`` where ``confidence`` is the
            highest class probability reported by the classifier.
        """
        # Imported locally so the class does not depend on a name that an
        # earlier notebook cell happened to import
        from scipy.sparse import hstack

        # Create dummy question for prediction
        dummy_question = f"What is {keyword}?"

        # Vectorize
        keyword_tfidf = self.keyword_vectorizer.transform([keyword])
        question_tfidf = self.question_vectorizer.transform([dummy_question])

        # Combine features in the same order used for training:
        # keyword columns first, question columns second
        combined_features = hstack([keyword_tfidf, question_tfidf], format='csr')

        # Predict
        prediction = self.classifier.predict(combined_features)[0]
        probabilities = self.classifier.predict_proba(combined_features)[0]
        confidence = max(probabilities)

        return prediction, confidence

    def find_similar_keywords(self, keyword, n_similar=5):
        """Find the training rows whose keywords are most similar to ``keyword``.

        Args:
            keyword: Keyword to look up.
            n_similar: Maximum number of neighbours to return. It is capped at
                the number of training rows, because asking the index for more
                neighbours than it holds raises an error.

        Returns:
            List of dicts with ``keyword``, ``question``, ``category`` and
            ``similarity`` (1 - cosine distance), nearest first.
        """
        n_neighbors = min(n_similar, len(self.training_data))
        if n_neighbors < 1:
            return []

        keyword_tfidf = self.keyword_vectorizer.transform([keyword])
        distances, indices = self.similarity_model.kneighbors(
            keyword_tfidf, n_neighbors=n_neighbors
        )

        similar_data = []
        for distance, idx in zip(distances[0], indices[0]):
            # The index was fitted on training_data rows in order, so the
            # returned positions map back with iloc
            similar_row = self.training_data.iloc[idx]
            similar_data.append({
                'keyword': similar_row['keyword'],
                'question': similar_row['question'],
                'category': similar_row['category'],
                'similarity': 1 - distance
            })

        return similar_data

    def generate_questions(self, keyword, num_questions=5):
        """Generate up to ``num_questions`` distinct questions for a keyword.

        Similar training questions are adapted first; category templates fill
        any remaining slots.

        Returns:
            List of dicts with ``question``, ``category``, ``confidence``,
            ``similarity`` and ``source`` (``'ml_similarity'`` or
            ``'template'``).
        """
        # Predict category
        predicted_category, confidence = self.predict_category(keyword)

        # Find similar keywords and their questions
        similar_data = self.find_similar_keywords(keyword, n_similar=max(10, num_questions))

        generated_questions = []
        seen_questions = set()

        # Use similar questions as templates
        for item in similar_data:
            if len(generated_questions) >= num_questions:
                break

            # Zero similarity means the keyword shares no term with this
            # neighbour, so its question would be unrelated
            if item['similarity'] <= 0:
                continue

            # Adapt question by replacing similar keyword with target keyword
            adapted_question = self.adapt_question(item['question'], item['keyword'], keyword)

            # Several neighbours can adapt to the same sentence; keep one
            if adapted_question in seen_questions:
                continue
            seen_questions.add(adapted_question)

            generated_questions.append({
                'question': adapted_question,
                'category': predicted_category,
                'confidence': confidence,
                'similarity': item['similarity'],
                'source': 'ml_similarity'
            })

        # If we need more questions, generate template-based ones
        if len(generated_questions) < num_questions:
            for template_q in self.generate_template_questions(keyword, predicted_category):
                if len(generated_questions) >= num_questions:
                    break
                if template_q in seen_questions:
                    continue
                seen_questions.add(template_q)

                generated_questions.append({
                    'question': template_q,
                    'category': predicted_category,
                    'confidence': confidence,
                    'similarity': 0.5,
                    'source': 'template'
                })

        return generated_questions[:num_questions]

    def adapt_question(self, original_question, original_keyword, target_keyword):
        """Replace ``original_keyword`` with ``target_keyword`` in a question.

        Both keywords are lower-cased before substitution and the first
        character of the result is upper-cased. The question is returned
        unchanged (apart from capitalization) when the original keyword is
        empty or does not occur in it.
        """
        source_keyword = original_keyword.lower()

        # str.replace with an empty search string inserts the replacement
        # between every character, so an empty keyword must be skipped
        if source_keyword:
            adapted = original_question.replace(source_keyword, target_keyword.lower())
        else:
            adapted = original_question

        # Capitalize first letter (slicing keeps the empty string safe)
        return adapted[:1].upper() + adapted[1:]

    def generate_template_questions(self, keyword, category):
        """Return the five template questions for ``category``.

        Unknown categories fall back to the ``'it'`` templates.
        """
        templates = {
            'it': [
                f"What is {keyword}?",
                f"How does {keyword} work?",
                f"What are the benefits of {keyword}?",
                f"How to implement {keyword}?",
                f"What are the best practices for {keyword}?"
            ],
            'economics': [
                f"What is {keyword}?",
                f"How does {keyword} affect the economy?",
                f"What are the economic implications of {keyword}?",
                f"How to analyze {keyword}?",
                f"What strategies work for {keyword}?"
            ],
            'marketing': [
                f"What is {keyword}?",
                f"How to use {keyword} in marketing?",
                f"What are the best {keyword} strategies?",
                f"How to measure {keyword} effectiveness?",
                f"What tools help with {keyword}?"
            ]
        }

        return templates.get(category, templates['it'])

# Initialize the question generator
print("üîÆ Creating Advanced Question Generator...")
# The similarity index is built over the full cleaned dataset (not only the
# training split), using the vectorizers fitted earlier
question_generator = AdvancedQuestionGenerator(
    classifier=best_model,
    keyword_vectorizer=keyword_vectorizer,
    question_vectorizer=question_vectorizer,
    training_data=master_dataset,
    label_encoder=label_encoder
)

print("‚úÖ Question Generator created successfully!")

## 🧪 Bước 12: Test Question Generation System

In [None]:
# Test the question generation system
print("üß™ Testing Advanced Question Generation System")
print("=" * 60)

# Hand-picked keywords used for a qualitative smoke test
test_keywords = [
    "artificial intelligence",
    "blockchain technology",
    "cryptocurrency investment",
    "digital marketing automation",
    "cloud computing security",
    "machine learning algorithms",
    "social media advertising",
    "financial portfolio management",
    "data science",
    "e-commerce optimization"
]

for keyword in test_keywords:
    print(f"\nüéØ Testing keyword: '{keyword}'")
    
    # Predict category first
    predicted_category, confidence = question_generator.predict_category(keyword)
    print(f"   üìÇ Predicted category: {predicted_category} (confidence: {confidence:.3f})")
    
    # Generate questions
    questions = question_generator.generate_questions(keyword, num_questions=4)
    
    # Show each question with where it came from and its similarity score
    print(f"   üéØ Generated {len(questions)} questions:")
    for i, q in enumerate(questions, 1):
        source = q['source']
        similarity = q['similarity']
        print(f"      {i}. {q['question']} [{source}, sim: {similarity:.2f}]")

print("\n‚úÖ Question generation testing completed!")

## 💾 Bước 13: Save Trained Model and Results

In [None]:
import pickle
import json

# Save the complete trained system
print("üíæ Saving trained models and components...")

# DataFrame.sample raises when n exceeds the number of rows, so cap both
# sample sizes at the dataset size; seeding makes the saved samples repeatable
similarity_sample_size = min(1000, len(master_dataset))
reference_sample_size = min(5000, len(master_dataset))

# Create a comprehensive model package
model_package = {
    'classifier': best_model,
    'keyword_vectorizer': keyword_vectorizer,
    'question_vectorizer': question_vectorizer,
    'label_encoder': label_encoder,
    # Save sample for similarity
    'training_data_sample': master_dataset.sample(
        n=similarity_sample_size,
        random_state=CONFIG['random_state']
    ),
    'model_info': {
        'best_model_name': best_model_name,
        'accuracy': best_accuracy,
        'training_date': datetime.now().isoformat(),
        'total_training_samples': len(master_dataset),
        'categories': CONFIG['categories']
    }
}

# Save the complete model
model_file = MODELS_PATH / 'complete_question_ai_model.pkl'
with open(model_file, 'wb') as f:
    pickle.dump(model_package, f)

print(f"   ‚úÖ Complete model saved to: {model_file}")

# Save training results summary (plain Python types only, so it is JSON-safe)
training_summary = {
    'training_completed': datetime.now().isoformat(),
    'dataset_info': {
        'total_records': len(master_dataset),
        'unique_questions': master_dataset['question'].nunique(),
        'unique_keywords': master_dataset['keyword'].nunique(),
        'category_distribution': master_dataset['category'].value_counts().to_dict(),
        'source_distribution': master_dataset['source'].value_counts().to_dict()
    },
    'model_performance': {
        'best_model': best_model_name,
        'best_accuracy': float(best_accuracy),
        'all_model_results': {
            name: {
                'accuracy': float(results['accuracy']),
                'training_time': float(results['training_time'])
            }
            for name, results in model_results.items()
        }
    },
    'configuration': CONFIG
}

# Save training summary
summary_file = MODELS_PATH / 'training_summary.json'
with open(summary_file, 'w') as f:
    json.dump(training_summary, f, indent=2)

print(f"   ‚úÖ Training summary saved to: {summary_file}")

# Save sample dataset for reference
sample_file = MODELS_PATH / 'training_data_sample.csv'
master_dataset.sample(
    n=reference_sample_size,
    random_state=CONFIG['random_state']
).to_csv(sample_file, index=False)
print(f"   ‚úÖ Sample training data saved to: {sample_file}")

print("\nüéâ Model training and saving completed successfully!")
print(f"üìä Final Results Summary:")
print(f"   Best Model: {best_model_name}")
print(f"   Accuracy: {best_accuracy:.4f}")
print(f"   Training Samples: {len(master_dataset):,}")
print(f"   Categories: {CONFIG['categories']}")
print(f"   Model Files: {MODELS_PATH}")

# Create a simple test function
def test_saved_model(keyword, model_path=None, num_questions=3):
    """Reload a saved model package and generate questions with it.

    Args:
        keyword: Keyword to generate questions for.
        model_path: Path of the pickled package. Defaults to the notebook's
            ``model_file`` when omitted.
        num_questions: Number of questions to generate (default 3).

    Returns:
        The list of question dicts produced by
        ``AdvancedQuestionGenerator.generate_questions``.
    """
    package_path = model_file if model_path is None else model_path

    # SECURITY: pickle.load can execute arbitrary code from the file. Only load
    # packages written by this notebook or another trusted source.
    with open(package_path, 'rb') as f:
        loaded_package = pickle.load(f)

    # Create question generator from loaded components; the saved training
    # sample backs the similarity index
    generator = AdvancedQuestionGenerator(
        classifier=loaded_package['classifier'],
        keyword_vectorizer=loaded_package['keyword_vectorizer'],
        question_vectorizer=loaded_package['question_vectorizer'],
        training_data=loaded_package['training_data_sample'],
        label_encoder=loaded_package['label_encoder']
    )

    return generator.generate_questions(keyword, num_questions=num_questions)

# Round-trip check: reload the pickled package from disk and generate
# questions for one keyword
print("\nüß™ Testing saved model with 'machine learning':")
test_result = test_saved_model("machine learning")
for i, q in enumerate(test_result, 1):
    print(f"   {i}. {q['question']} [{q['source']}]")

print("\n‚úÖ All training completed successfully! üéä")