# Download Dataset + EDA

In [None]:
# pip install pandas numpy scikit-learn matplotlib seaborn underthesea requests joblib imbalanced-learn

import os
import re
import unicodedata
import requests
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression   # <- đã import
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
import joblib

# Try to import RandomOverSampler (imbalanced-learn). When the import fails,
# ROS_AVAILABLE is set to False so the rest of the code can fall back.
try:
    from imblearn.over_sampling import RandomOverSampler
except Exception:
    print("⚠ imbalanced-learn (RandomOverSampler) không có. Install bằng: pip install imbalanced-learn")
    ROS_AVAILABLE = False
else:
    ROS_AVAILABLE = True

# Set the matplotlib font family (in case Vietnamese text needs to be displayed).
plt.rcParams['font.family'] = 'DejaVu Sans'

# ---------------------------
# 1) Download & Prepare Data
# ---------------------------
def download_from_drive(drive_url, local_path):
    """Download ``drive_url`` and store the response body at ``local_path``.

    The body is first written to ``<local_path>.part`` and only renamed to
    ``local_path`` once the write has finished, so an interrupted run never
    leaves a truncated file under the final name.

    Args:
        drive_url: URL to fetch with a plain HTTP GET (30 s timeout).
        local_path: Destination file path; its parent directory is created
            when the path has one.

    Returns:
        True when the file was written, False on any failure (the error is
        printed, not raised).
    """
    partial_path = f"{local_path}.part"
    try:
        response = requests.get(drive_url, timeout=30)
        response.raise_for_status()

        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, so only create the directory when there is one.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(partial_path, 'wb') as f:
            f.write(response.content)
        os.replace(partial_path, local_path)

        print(f"✓ Downloaded: {local_path}")
        return True
    except Exception as e:
        print(f"✗ Error downloading {local_path}: {e}")
        # Best-effort cleanup of a half-written temporary file.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        return False

def prepare_dataset():
    """Fetch the train/validation/test text files and convert them to CSV.

    For every split, three line-aligned text files (sentences, sentiments,
    topics) are downloaded into ``dataset_feedback_students/`` unless a file
    with the expected name is already present. When all three files are
    available and have the same number of lines, they are combined into a
    DataFrame with columns ``sentence``, ``sentiment`` and ``topic`` and
    written to ``dataset_feedback_students/csv/<split>.csv``.

    Returns:
        dict mapping split name -> DataFrame for every split that was
        processed successfully; failed splits are reported and left out.
    """

    def _read_stripped_lines(path):
        # One list entry per line of the file, surrounding whitespace removed.
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw_line.strip() for raw_line in handle]

    urls = {
        "train": {
            "sentences": "https://drive.google.com/uc?id=1nzak5OkrheRV1ltOGCXkT671bmjODLhP&export=download",
            "sentiments": "https://drive.google.com/uc?id=1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv&export=download",
            "topics": "https://drive.google.com/uc?id=14MuDtwMnNOcr4z_8KdpxprjbwaQ7lJ_C&export=download",
        },
        "validation": {
            "sentences": "https://drive.google.com/uc?id=1sMJSR3oRfPc3fe1gK-V3W5F24tov_517&export=download",
            "sentiments": "https://drive.google.com/uc?id=1GiY1AOp41dLXIIkgES4422AuDwmbUseL&export=download",
            "topics": "https://drive.google.com/uc?id=1DwLgDEaFWQe8mOd7EpF-xqMEbDLfdT-W&export=download",
        },
        "test": {
            "sentences": "https://drive.google.com/uc?id=1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n&export=download",
            "sentiments": "https://drive.google.com/uc?id=1vkQS5gI0is4ACU58-AbWusnemw7KZNfO&export=download",
            "topics": "https://drive.google.com/uc?id=1_ArMpDguVsbUGl-xSMkTF_p5KpZrmpSB&export=download",
        }
    }

    # Numeric sentiment codes are mapped to text; any other value is kept as is.
    sentiment_map = {'0': 'negative', '1': 'neutral', '2': 'positive'}

    output_dir = "dataset_feedback_students"
    csv_dir = os.path.join(output_dir, "csv")
    os.makedirs(csv_dir, exist_ok=True)

    datasets = {}

    for split_name, split_urls in urls.items():
        print(f"\n=== Processing {split_name} split ===")

        # Collect the local path of every file that is present or was fetched.
        paths = {}
        for data_type, url in split_urls.items():
            local_path = os.path.join(output_dir, f"{split_name}_{data_type}.txt")
            if os.path.exists(local_path):
                print(f"✓ File already exists: {local_path}")
                paths[data_type] = local_path
            elif download_from_drive(url, local_path):
                paths[data_type] = local_path
            else:
                print(f"Failed to download {data_type} for {split_name}")

        if len(paths) != 3:
            print(f"Missing files for {split_name}, skipping...")
            continue

        try:
            sentences = _read_stripped_lines(paths["sentences"])
            sentiments = _read_stripped_lines(paths["sentiments"])
            topics = _read_stripped_lines(paths["topics"])

            # The three files are line-aligned; refuse to build a frame otherwise.
            if not (len(sentences) == len(sentiments) == len(topics)):
                print(f"✗ Data length mismatch in {split_name}: sentences={len(sentences)}, sentiments={len(sentiments)}, topics={len(topics)}")
                continue

            sentiments = [sentiment_map.get(code.strip(), code.strip()) for code in sentiments]

            df = pd.DataFrame({
                'sentence': sentences,
                'sentiment': sentiments,
                'topic': topics
            })

            csv_path = os.path.join(csv_dir, f"{split_name}.csv")
            df.to_csv(csv_path, index=False, encoding='utf-8')
            datasets[split_name] = df
            print(f"✓ Saved {split_name}.csv with {len(df)} records")
        except Exception as e:
            print(f"✗ Error processing {split_name}: {e}")

    return datasets

# ---------------------------
# 2) Text preprocessing
# ---------------------------
# Optional dependency: underthesea provides Vietnamese word segmentation.
# USE_UNDERTHESEA records whether the import succeeded.
try:
    from underthesea import word_tokenize
except Exception:
    USE_UNDERTHESEA = False
    print("⚠ underthesea not available. Using simple tokenization.")
else:
    USE_UNDERTHESEA = True
    print("✓ underthesea available")

def clean_text(text):
    """Normalise a raw sentence into lowercase, space-separated word characters.

    Missing values (as judged by ``pd.isna``) become an empty string. Otherwise
    the value is stringified, lowercased and NFC-normalised, then URLs,
    @mentions/#hashtags, punctuation and digits are each replaced by a space
    before whitespace runs are collapsed and the ends are trimmed.
    """
    if pd.isna(text):
        return ""

    cleaned = unicodedata.normalize('NFC', str(text).lower())

    # Applied in this order: URLs, mentions/hashtags, punctuation, digits.
    noise_patterns = (
        r'http\S+|www\S+',
        r'[@#]\w+',
        r'[^\w\s]',
        r'\d+',
    )
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned, flags=re.UNICODE)

    return re.sub(r'\s+', ' ', cleaned).strip()

def tokenize_vietnamese(text):
    """Clean ``text`` and return it as a space-separated token string.

    When underthesea was imported successfully and the cleaned text is not
    empty, ``word_tokenize(..., format="text")`` is used. If that call fails,
    or underthesea is unavailable, the cleaned text is split on whitespace and
    re-joined with single spaces.
    """
    text = clean_text(text)
    if USE_UNDERTHESEA and text:
        try:
            return word_tokenize(text, format="text")
        except Exception:
            # Best-effort: fall through to whitespace tokenisation. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
    return " ".join(text.split())

# ---------------------------
# 3) EDA helper
# ---------------------------
def perform_eda(df, name="dataset"):
    """Print summary statistics for ``df`` and show three overview plots.

    Prints the shape, the sentiment counts and the word-count statistics, then
    draws a sentiment bar chart, a horizontal bar chart of the ten most
    frequent topics and a sentence-length histogram.

    Note: a ``sentence_length`` column (whitespace-separated word count) is
    added to ``df`` in place; the same DataFrame is returned.
    """
    print(f"\n=== EDA for {name} ===")
    print(f"Dataset shape: {df.shape}")

    sentiment_counts = df['sentiment'].value_counts().sort_index()
    print("\nSentiment distribution:")
    print(sentiment_counts)

    df['sentence_length'] = df['sentence'].astype(str).map(lambda sentence: len(sentence.split()))
    print(f"\nSentence length statistics:")
    print(df['sentence_length'].describe())

    fig, axes = plt.subplots(1, 3, figsize=(18,5))
    sentiment_ax, topic_ax, length_ax = axes[0], axes[1], axes[2]

    # Panel 1: number of samples per sentiment label.
    sentiment_ax.bar(sentiment_counts.index, sentiment_counts.values)
    sentiment_ax.set_title(f'{name} - Sentiment Distribution')
    sentiment_ax.set_xlabel('Sentiment')
    sentiment_ax.set_ylabel('Count')

    # Panel 2: the ten most frequent topics.
    top_topics = df['topic'].value_counts().head(10)
    bar_positions = range(len(top_topics))
    topic_ax.barh(bar_positions, top_topics.values)
    topic_ax.set_yticks(bar_positions)
    topic_ax.set_yticklabels(top_topics.index)
    topic_ax.set_title(f'{name} - Top 10 Topics')
    topic_ax.set_xlabel('Count')

    # Panel 3: distribution of sentence lengths in words.
    length_ax.hist(df['sentence_length'], bins=30, alpha=0.7)
    length_ax.set_title(f'{name} - Sentence Length Distribution')
    length_ax.set_xlabel('Sentence Length (words)')
    length_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
    return df

def predict_sentiment(text, model, vectorizer):
    """Predict the sentiment of a single raw sentence.

    Args:
        text: Raw input sentence; it is cleaned and tokenised with
            ``tokenize_vietnamese`` before vectorising.
        model: Fitted classifier exposing ``predict`` and, optionally,
            ``predict_proba`` and ``classes_``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        dict with keys ``prediction`` (label name), ``confidence`` (highest
        class probability, or None) and ``probabilities`` (label name ->
        probability, or None when the model offers no usable
        ``predict_proba``).
    """
    label_names = ['negative', 'neutral', 'positive']

    def _to_label(value):
        # A model trained on text labels already returns the name; a model
        # trained on encoded labels returns an index into label_names.
        if isinstance(value, str):
            return str(value)
        return label_names[int(value)]

    features = vectorizer.transform([tokenize_vietnamese(text)])
    pred = model.predict(features)[0]

    try:
        prob = model.predict_proba(features)[0]
    except Exception:
        # Probabilities are optional (some classifiers have no predict_proba);
        # Exception rather than a bare except keeps Ctrl-C working.
        prob = None

    confidence = None
    probabilities = None
    if prob is not None:
        # Probability columns follow model.classes_, which need not be the
        # full 0/1/2 range; fall back to positional naming if it is unusable.
        names = label_names
        classes = getattr(model, 'classes_', None)
        if classes is not None:
            try:
                if len(classes) == len(prob):
                    names = [_to_label(cls) for cls in classes]
            except (TypeError, ValueError, IndexError):
                names = label_names
        confidence = float(max(prob))
        probabilities = {label: float(p) for label, p in zip(names, prob)}

    return {
        'prediction': _to_label(pred),
        'confidence': confidence,
        'probabilities': probabilities
    }

# ---------------------------
# 6) Main
# ---------------------------
def main():
    """Download/prepare the three splits and run the EDA on the training split.

    Stops early with a message when fewer than three splits could be prepared.
    """
    print("=== Vietnamese Student Feedback Sentiment Analysis (Oversampling) ===")

    datasets = prepare_dataset()
    if len(datasets) < 3:
        print("✗ Not enough datasets. Aborting.")
        return

    train_df, val_df, test_df = (
        datasets[split] for split in ('train', 'validation', 'test')
    )
    print(f"✓ Loaded datasets - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

    train_df = perform_eda(train_df, "Training")
    
# Script entry point: run main(), report Ctrl-C quietly and print a full
# traceback for any other failure.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n⚠ Interrupted by user")
    except Exception as e:
        print("\n✗ Error during execution:", e)
        import traceback
        traceback.print_exc()


# TRAIN

In [None]:
# from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# ==============================
# 1. Load the dataset splits from CSV
# ==============================
df_train = pd.read_csv("dataset_feedback_students/csv/train.csv")
df_validation = pd.read_csv("dataset_feedback_students/csv/validation.csv")
df_test = pd.read_csv("dataset_feedback_students/csv/test.csv")

# Report (rows, columns) for each split.
for caption, frame in (
    ("Kích thước tập train:", df_train),
    ("Kích thước tập validation:", df_validation),
    ("Kích thước tập test:", df_test),
):
    print(caption, frame.shape)

# ==============================
# 2. Normalise the labels
# ==============================
# Encode the sentiment column as integers. The encoder is fitted on the
# training split only and then applied to all three splits.
sentiment_encoder = LabelEncoder().fit(df_train['sentiment'])
for split_frame in (df_train, df_validation, df_test):
    split_frame['sentiment'] = sentiment_encoder.transform(split_frame['sentiment'])

print("Các nhãn sentiment:", list(sentiment_encoder.classes_))

# ==============================
# 3. Prepare the data
# ==============================
def _sentences_as_text(frame):
    """Return the ``sentence`` column as a list of plain strings.

    pd.read_csv parses empty fields (and tokens such as "NA" or "null") as
    NaN, i.e. a float. Those are replaced by "" so that every item handed to
    the sentence encoder is a str.
    """
    return frame['sentence'].fillna("").astype(str).tolist()


# Inputs: raw sentences. Targets: one row per sample with the two outputs
# [sentiment, topic] for the multi-output classifier.
X_train_text = _sentences_as_text(df_train)
y_train = df_train[['sentiment', 'topic']].values

X_validation_text = _sentences_as_text(df_validation)
y_validation = df_validation[['sentiment', 'topic']].values

X_test_text = _sentences_as_text(df_test)
y_test = df_test[['sentiment', 'topic']].values

# ==============================
# 4. SBERT embedding
# ==============================
print("Đang tải model SBERT...")
sbert_model = SentenceTransformer("sentence-transformers/LaBSE")


def _embed(sentences):
    # Encode a list of sentences into a NumPy array, showing a progress bar.
    return sbert_model.encode(sentences, convert_to_numpy=True, show_progress_bar=True)


print("Đang tạo embedding cho dữ liệu train...")
X_train_embeddings = _embed(X_train_text)

print("Đang tạo embedding cho dữ liệu validation...")
X_validation_embeddings = _embed(X_validation_text)

print("Đang tạo embedding cho dữ liệu test...")
X_test_embeddings = _embed(X_test_text)

for caption, embeddings in (
    ("Kích thước embedding train:", X_train_embeddings),
    ("Kích thước embedding validation:", X_validation_embeddings),
    ("Kích thước embedding test:", X_test_embeddings),
):
    print(caption, embeddings.shape)

# ==============================
# 5. Train the BalancedRandomForest model
# ==============================
# Base estimator; MultiOutputClassifier fits one copy of it per target column
# (sentiment and topic).
brf = BalancedRandomForestClassifier(
    sampling_strategy="auto",
    replacement=True,
    n_estimators=200,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)

model = MultiOutputClassifier(brf).fit(X_train_embeddings, y_train)

# ==============================
# 6. Evaluate the model
# ==============================
def _evaluate_split(split_name, heading, X_embeddings, y_true):
    """Predict one split, print per-target reports and the exact-match score.

    Args:
        split_name: Split label used in the printed messages.
        heading: Line printed before predicting.
        X_embeddings: Sentence embeddings of the split.
        y_true: Array with columns [sentiment, topic].

    Returns:
        (y_pred, exact_match) where exact_match is the fraction of rows for
        which BOTH sentiment and topic were predicted correctly.
    """
    print(heading)
    y_pred = model.predict(X_embeddings)

    # Pass the encoded label ids explicitly: without `labels`,
    # classification_report raises when a class is absent from this split
    # because target_names no longer matches the labels it finds.
    sentiment_label_ids = np.arange(len(sentiment_encoder.classes_))

    print(f"\n📌 Kết quả trên tập {split_name} cho nhãn 'sentiment':")
    print(classification_report(
        y_true[:, 0],
        y_pred[:, 0],
        labels=sentiment_label_ids,
        target_names=sentiment_encoder.classes_,
    ))

    print(f"\n📌 Kết quả trên tập {split_name} cho nhãn 'topic':")
    print(classification_report(y_true[:, 1], y_pred[:, 1]))

    exact_match = (y_pred == y_true).all(axis=1).mean()
    print(f"\n🎯 Độ chính xác tổng thể trên {split_name}: {exact_match:.4f}")
    return y_pred, exact_match


y_validation_pred, validation_accuracy = _evaluate_split(
    "validation", "Đánh giá trên tập validation...", X_validation_embeddings, y_validation
)

# Evaluate on the test split
y_test_pred, test_accuracy = _evaluate_split(
    "test", "\nĐánh giá trên tập test...", X_test_embeddings, y_test
)

# ==============================
# 7. Plot the confusion matrices (test split)
# ==============================
plt.figure(figsize=(15, 6))

# Sentiment
plt.subplot(1, 2, 1)
# Fix the label set so the matrix always has one row/column per encoder class,
# matching the tick labels even if a class is missing from the test results.
sentiment_label_ids = np.arange(len(sentiment_encoder.classes_))
cm_sentiment = confusion_matrix(y_test[:, 0], y_test_pred[:, 0], labels=sentiment_label_ids)
sns.heatmap(cm_sentiment, annot=True, fmt='d', cmap='Blues',
            xticklabels=sentiment_encoder.classes_,
            yticklabels=sentiment_encoder.classes_)
plt.title('Ma trận nhầm lẫn - Sentiment')
plt.ylabel('Thực tế')
plt.xlabel('Dự đoán')

# Topic
plt.subplot(1, 2, 2)
# Same label set confusion_matrix would infer (sorted union of true and
# predicted values); naming it lets the axes show topic ids, not positions.
topic_labels = np.unique(np.concatenate([y_test[:, 1], y_test_pred[:, 1]]))
cm_topic = confusion_matrix(y_test[:, 1], y_test_pred[:, 1], labels=topic_labels)
sns.heatmap(cm_topic, annot=True, fmt='d', cmap='Blues',
            xticklabels=topic_labels,
            yticklabels=topic_labels)
plt.title('Ma trận nhầm lẫn - Topic')
plt.ylabel('Thực tế')
plt.xlabel('Dự đoán')

plt.tight_layout()
plt.show()

# ==============================
# 8. Save the model + encoder + SBERT model name
# ==============================
# An earlier revision dumped only {"model", "sentiment_encoder"} to
# "multioutput_brf_model.pkl". The V2 bundle below also stores the name of the
# SBERT checkpoint (just the name, not the weights).
artifact_path = "multioutput_brf_V2_model.pkl"

save_obj = {
    "model": model,                          # trained multi-output classifier
    "sentiment_encoder": sentiment_encoder,  # fitted sentiment LabelEncoder
    "sbert_model_name": "sentence-transformers/LaBSE",
}

joblib.dump(save_obj, artifact_path)
print("✅ Đã lưu mô hình vào multioutput_brf_V2_model.pkl")
