# 🧠 AI Chatbot - Deep Learning Training Notebook

Notebook này giúp bạn huấn luyện mô hình LSTM/GRU cho Chatbot trên Google Colab.

### 📝 Hướng dẫn sử dụng:
1.  **Upload file `train_data_full.csv`** (được tạo từ script `app/export_data.py`) lên Colab.
2.  Chạy lần lượt các ô code bên dưới (nhấn nút Play ▶️).
3.  Sau khi chạy xong, notebook sẽ tự động nén và tải về file `models.zip`.
4.  Giải nén file này và chép vào thư mục `models/` trong dự án của bạn.

In [None]:
# 📦 1. Cài đặt thư viện cần thiết
!pip install pyvi tensorflow pandas scikit-learn

In [None]:
# 📚 2. Import thư viện
import os
import pickle
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Status message for the notebook user (emoji repaired from a mis-encoded byte sequence).
print("✅ Libraries imported successfully.")

In [None]:
# 🧹 3. Định nghĩa Class Preprocessor (Copy từ app/preprocess.py)
# ---------------------------------------------------------

class TextPreprocessor:
    """Singleton that cleans Vietnamese text before tokenization.

    Every call to ``TextPreprocessor()`` returns the same instance; the
    regular expressions and the stopword set are built once, on first use.
    """

    _instance = None

    def __new__(cls):
        # Create and initialise the shared instance only on the first call.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialize()
        return cls._instance

    def _initialize(self):
        """Compile the cleaning patterns and load the stopword set."""
        # Anything that is neither a word character nor whitespace.
        self.re_special_chars = re.compile(r'[^\w\s]')
        # Runs of digits.
        self.re_numbers = re.compile(r'\d+')
        self._load_dictionaries()

    def _load_dictionaries(self):
        """Populate ``VIETNAMESE_STOPWORDS``.

        The entries are stored as properly encoded (NFC) Vietnamese; the
        previous mis-encoded literals could never match real input.
        """
        self.VIETNAMESE_STOPWORDS = {
            'thì', 'là', 'mà', 'và', 'của', 'những', 'các', 'như', 'thế', 'nào',
            'được', 'về', 'với', 'trong', 'có', 'không', 'cho', 'tôi', 'bạn',
            'cậu', 'tớ', 'mình', 'nó', 'hắn', 'gì', 'cái', 'con', 'người',
            'sự', 'việc', 'đó', 'đây', 'kia', 'này', 'nhé', 'ạ', 'ơi', 'đi',
            'làm', 'khi', 'lúc', 'nơi', 'tại', 'đã', 'đang', 'sẽ', 'muốn',
            'phải', 'biết', 'hãy', 'rồi', 'chứ', 'nhỉ',
        }

    def preprocess_text(self, text: str) -> str:
        """Return ``text`` cleaned, word-segmented and stripped of stopwords.

        Args:
            text: Raw input. Empty strings and non-string values (for example
                ``None`` or the float NaN pandas uses for empty cells) yield
                an empty string instead of raising.

        Returns:
            The remaining tokens joined by single spaces.
        """
        if not isinstance(text, str) or not text:
            return ""
        # Compose to NFC first: in decomposed text the tone marks are separate
        # combining characters, which the special-character pattern would strip.
        text = unicodedata.normalize('NFC', text).lower()
        text = self.re_special_chars.sub('', text)
        text = self.re_numbers.sub('', text)
        tokens = ViTokenizer.tokenize(text).split()
        # Drop stopwords and single-character tokens.
        return ' '.join(
            token for token in tokens
            if token not in self.VIETNAMESE_STOPWORDS and len(token) > 1
        )

# Shared preprocessor instance used by the module-level helper below.
preprocessor = TextPreprocessor()


def preprocess_text(text: str) -> str:
    """Clean ``text`` by delegating to the shared ``preprocessor`` instance."""
    return preprocessor.preprocess_text(text)


# Status message for the notebook user (emoji repaired from a mis-encoded byte sequence).
print("✅ Preprocessor initialized.")

In [None]:
# 🏗️ 4. Định nghĩa Model Deep Learning (Copy từ app/dl_model.py)
# ---------------------------------------------------------

def create_model(vocab_size, embedding_dim, max_length, num_classes):
    """Build and compile a bidirectional-LSTM text classifier.

    Args:
        vocab_size: Size of the embedding vocabulary (embedding input dimension).
        embedding_dim: Size of each embedding vector.
        max_length: Length of every padded input sequence.
        num_classes: Number of output classes for the softmax layer.

    Returns:
        A compiled ``Sequential`` model that expects integer class labels
        (sparse categorical cross-entropy loss).
    """
    model = Sequential([
        # Declare the input shape explicitly rather than through the deprecated
        # ``input_length`` argument of ``Embedding``; the model is then built
        # right away, so ``model.summary()`` can report shapes and parameters.
        tf.keras.Input(shape=(max_length,), dtype='int32'),
        Embedding(vocab_size, embedding_dim),
        SpatialDropout1D(0.2),
        # NOTE(review): per the Keras LSTM docs, a non-zero recurrent_dropout
        # rules out the cuDNN kernel, so GPU training is slower. Left unchanged
        # to keep the model definition identical.
        Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


# Status message for the notebook user (emoji repaired from a mis-encoded byte sequence).
print("✅ Model architecture defined.")

In [None]:
# 🔥 5. Huấn luyện Mô hình (Logic từ app/train_dl.py)
# ---------------------------------------------------------

# Training configuration.
_MODELS_DIR = 'models'  # directory prefix shared by the three artifact paths

# Text / model dimensions.
MAX_NUM_WORDS = 5_000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

# Optimisation settings.
EPOCHS = 10
BATCH_SIZE = 32

# Output locations of the saved artifacts.
MODEL_PATH = f'{_MODELS_DIR}/dl_model.h5'
TOKENIZER_PATH = f'{_MODELS_DIR}/tokenizer.pickle'
LABEL_ENCODER_PATH = f'{_MODELS_DIR}/label_encoder.pickle'

def train_dl_model(data_path='train_data_full.csv'):
    """Train the classifier and save the model, tokenizer and label encoder.

    Args:
        data_path: CSV file that must contain a ``question`` column (input
            text) and a ``topic`` column (class label).

    Returns:
        None. A missing file, a missing column or a dataset without usable
        rows is reported with a printed message followed by an early return.
    """
    print("🚀 Starting Deep Learning Model Training...")

    # 1. Load data and validate it before doing any expensive work.
    if not os.path.exists(data_path):
        print(f"❌ Error: {data_path} not found. Please upload it!")
        return

    df = pd.read_csv(data_path)

    missing_columns = [name for name in ('question', 'topic') if name not in df.columns]
    if missing_columns:
        for name in missing_columns:
            print(f"❌ Error: '{name}' column missing in dataset!")
        return
    target_column = 'topic'

    # Rows without text or label cannot be used: NaN breaks text preprocessing
    # and mixes floats into the label column.
    df = df.dropna(subset=['question', target_column])
    if df.empty:
        print("❌ Error: dataset contains no rows with both 'question' and 'topic'!")
        return

    # Keras takes the validation split from the *last* rows, before shuffling.
    # Shuffle here (fixed seed for reproducibility) so a CSV sorted by topic
    # does not produce a validation set made of only the final classes.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # 2. Preprocess text.
    print("🧹 Preprocessing text...")
    df['clean_text'] = df['question'].astype(str).apply(preprocess_text)

    # 3. Tokenization and padding.
    print("🔠 Tokenizing...")
    # The filter list has no underscore, so tokens containing '_' stay intact.
    tokenizer = Tokenizer(
        num_words=MAX_NUM_WORDS,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',
        lower=True,
    )
    clean_texts = df['clean_text'].values
    tokenizer.fit_on_texts(clean_texts)

    X = tokenizer.texts_to_sequences(clean_texts)
    X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

    # 4. Encode labels as integers.
    print("🏷️ Encoding labels...")
    le = LabelEncoder()
    Y = le.fit_transform(df[target_column])
    num_classes = len(le.classes_)
    print(f"Number of classes: {num_classes}")

    # 5. Build model.
    print("🏗️ Building model...")
    model = create_model(MAX_NUM_WORDS, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, num_classes)
    model.summary()

    # 6. Train model.
    print("🔥 Training...")
    model.fit(X, Y, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1, verbose=1)

    # 7. Save artifacts. Create the directory of every configured path instead
    # of assuming they all live in a hard-coded 'models' folder.
    print("💾 Saving artifacts...")
    for artifact_path in (MODEL_PATH, TOKENIZER_PATH, LABEL_ENCODER_PATH):
        directory = os.path.dirname(artifact_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    model.save(MODEL_PATH)

    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(LABEL_ENCODER_PATH, 'wb') as handle:
        pickle.dump(le, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("✅ Training complete! Model saved to models/")

# Run the training pipeline immediately when this cell executes.
train_dl_model()

In [None]:
# 📦 6. Nén và Tải về Model
# Shell command: recursively archive the models/ directory into models.zip.
!zip -r models.zip models/
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import files
# Pass the archive to Colab's download helper.
files.download('models.zip')