# Download Dataset

In [1]:
import pandas as pd
import requests
import os

# Helper to download a file from Google Drive
def download_from_drive(drive_url, local_path, timeout=60, chunk_size=1 << 20):
    """Download ``drive_url`` and store the response body at ``local_path``.

    The body is streamed in chunks to a temporary ``<local_path>.part`` file
    and only moved to ``local_path`` once the whole transfer succeeded. An
    interrupted or failed download therefore never leaves a truncated file at
    ``local_path`` that a later "skip if the file exists" check would accept.

    Args:
        drive_url: Direct-download URL of the file.
        local_path: Destination path of the downloaded file.
        timeout: Seconds to wait for the server before giving up.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    tmp_path = f"{local_path}.part"
    try:
        with requests.get(drive_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        # Atomic on the same filesystem: the final path appears only when complete.
        os.replace(tmp_path, local_path)
    finally:
        # Still present only if the download failed before the rename.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# Google Drive file IDs of the raw text files, grouped by split and by kind.
_DRIVE_FILE_IDS = {
    "train": {
        "sentences": "1nzak5OkrheRV1ltOGCXkT671bmjODLhP",
        "sentiments": "1ye-gOZIBqXdKOoi_YxvpT6FeRNmViPPv",
        "topics": "14MuDtwMnNOcr4z_8KdpxprjbwaQ7lJ_C",
    },
    "validation": {
        "sentences": "1sMJSR3oRfPc3fe1gK-V3W5F24tov_517",
        "sentiments": "1GiY1AOp41dLXIIkgES4422AuDwmbUseL",
        "topics": "1DwLgDEaFWQe8mOd7EpF-xqMEbDLfdT-W",
    },
    "test": {
        "sentences": "1aNMOeZZbNwSRkjyCWAGtNCMa3YrshR-n",
        "sentiments": "1vkQS5gI0is4ACU58-AbWusnemw7KZNfO",
        "topics": "1_ArMpDguVsbUGl-xSMkTF_p5KpZrmpSB",
    },
}

# split name -> {kind -> direct-download URL}
urls = {
    split: {
        kind: f"https://drive.google.com/uc?id={file_id}&export=download"
        for kind, file_id in file_ids.items()
    }
    for split, file_ids in _DRIVE_FILE_IDS.items()
}

def prepare_split(split_name, urls_for_split, output_dir="data"):
    """Download the raw files of one split and merge them into a single CSV.

    Each raw file holds one record per line, and line ``i`` of every file
    describes the same example. The files are read with one plain line reader
    (no CSV parsing), so a record is never reinterpreted: text such as "NA" or
    "null" stays a string, and blank lines inside a file are kept so the three
    files remain aligned row by row.

    Args:
        split_name: Name of the split; used as prefix of the raw files and as
            the name of the resulting ``<split_name>.csv``.
        urls_for_split: Mapping ``kind -> url``. Must contain the kinds
            ``"sentences"``, ``"sentiments"`` and ``"topics"``.
        output_dir: Directory for the raw files and the merged CSV; created
            when missing.

    Returns:
        DataFrame with the string columns ``sentence``, ``sentiment`` and
        ``topic`` (one row per line of the raw files).

    Raises:
        KeyError: If one of the three required kinds is missing.
        AssertionError: If the three files do not have the same number of records.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Fetch every raw file unless an earlier run already stored it.
    paths = {}
    for kind, url in urls_for_split.items():
        local_path = os.path.join(output_dir, f"{split_name}_{kind}.txt")
        if not os.path.exists(local_path):
            print(f"Downloading {split_name} {kind} ‚Ä¶")
            download_from_drive(url, local_path)
        else:
            print(f"File {local_path} ƒë√£ t·ªìn t·∫°i, b·ªè qua download.")
        paths[kind] = local_path

    def read_records(path):
        """Return the lines of ``path`` as a list of whitespace-stripped strings."""
        # "utf-8-sig" decodes plain UTF-8 and silently drops a leading BOM.
        with open(path, 'r', encoding='utf-8-sig') as f:
            records = [line.strip() for line in f]
        # Empty lines at the end of the file are padding, not records; empty
        # lines in the middle are kept to preserve the row alignment.
        while records and records[-1] == "":
            records.pop()
        return records

    sentences = read_records(paths["sentences"])
    sentiments = read_records(paths["sentiments"])
    topics = read_records(paths["topics"])

    # The files are only meaningful together if they describe the same rows.
    assert len(sentences) == len(sentiments) == len(topics), \
        f"S·ªë d√≤ng kh√¥ng kh·ªõp ·ªü split {split_name}: sentences {len(sentences)}, sentiments {len(sentiments)}, topics {len(topics)}"

    df = pd.DataFrame({"sentence": sentences, "sentiment": sentiments, "topic": topics})

    output_csv = os.path.join(output_dir, f"{split_name}.csv")
    print(f"L∆∞u file CSV: {output_csv}")
    df.to_csv(output_csv, index=False, encoding="utf-8")

    return df

def download_and_prepare_all(output_dir="data"):
    """Prepare every split listed in ``urls``.

    Args:
        output_dir: Directory handed to ``prepare_split`` for each split.

    Returns:
        Dict mapping each split name to the DataFrame returned by
        ``prepare_split`` for that split.
    """
    return {
        split: prepare_split(split, split_urls, output_dir=output_dir)
        for split, split_urls in urls.items()
    }

if __name__ == "__main__":
    # Download every split and build the merged CSVs under uit_vsf_feedback_data/.
    datasets = download_and_prepare_all(output_dir="uit_vsf_feedback_data")
    # Result: train.csv, validation.csv and test.csv exist in that directory


Downloading train sentences ‚Ä¶
Downloading train sentiments ‚Ä¶
Downloading train topics ‚Ä¶
L∆∞u file CSV: uit_vsf_feedback_data\train.csv
Downloading validation sentences ‚Ä¶
Downloading validation sentiments ‚Ä¶
Downloading validation topics ‚Ä¶
L∆∞u file CSV: uit_vsf_feedback_data\validation.csv
Downloading test sentences ‚Ä¶
Downloading test sentiments ‚Ä¶
Downloading test topics ‚Ä¶
L∆∞u file CSV: uit_vsf_feedback_data\test.csv


# TRAIN

In [4]:
# from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import joblib

# ==============================
# 1. Load dataset from CSV
# ==============================
# Single source of truth for the data location (the three paths used to be
# spelled inconsistently).
DATA_DIR = "uit_vsf_feedback_data"

# keep_default_na=False + dtype=str on the text column: a sentence that is
# literally "NA", "null", "None", ... must stay a string. With the default NA
# parsing it becomes a float NaN in the list that is later passed to the
# sentence encoder. The numeric label columns are still inferred as integers.
df_train, df_validation, df_test = (
    pd.read_csv(f"{DATA_DIR}/{split}.csv", dtype={"sentence": str}, keep_default_na=False)
    for split in ("train", "validation", "test")
)

for split, frame in (("train", df_train), ("validation", df_validation), ("test", df_test)):
    print(f"K√≠ch th∆∞·ªõc t·∫≠p {split}:", frame.shape)

# ==============================
# 2. Normalize labels
# ==============================
# Learn the sentiment -> integer mapping from the training labels only, then
# apply that same mapping to every split (in place).
sentiment_encoder = LabelEncoder().fit(df_train['sentiment'])
for split_frame in (df_train, df_validation, df_test):
    split_frame['sentiment'] = sentiment_encoder.transform(split_frame['sentiment'])

print("C√°c nh√£n sentiment:", list(sentiment_encoder.classes_))

# ==============================
# 3. Prepare the data
# ==============================
def _texts_and_labels(frame):
    """Return (list of sentences, 2-column label array [sentiment, topic]) of one split."""
    return frame['sentence'].tolist(), frame[['sentiment', 'topic']].values


X_train_text, y_train = _texts_and_labels(df_train)
X_validation_text, y_validation = _texts_and_labels(df_validation)
X_test_text, y_test = _texts_and_labels(df_test)

# ==============================
# 4. SBERT embedding
# ==============================
print("ƒêang t·∫£i model SBERT...")
sbert_model = SentenceTransformer("sentence-transformers/LaBSE")


def _embed(split_label, texts):
    """Announce the split being processed, then encode its sentences to a NumPy array."""
    print(f"ƒêang t·∫°o embedding cho d·ªØ li·ªáu {split_label}...")
    return sbert_model.encode(texts, convert_to_numpy=True, show_progress_bar=True)


X_train_embeddings = _embed("train", X_train_text)
X_validation_embeddings = _embed("validation", X_validation_text)
X_test_embeddings = _embed("test", X_test_text)

# Report the embedding matrix shape of every split.
for split_label, embeddings in (
    ("train", X_train_embeddings),
    ("validation", X_validation_embeddings),
    ("test", X_test_embeddings),
):
    print(f"K√≠ch th∆∞·ªõc embedding {split_label}:", embeddings.shape)

# ==============================
# 5. Train the BalancedRandomForest model
# ==============================
# Hyper-parameters of the base estimator.
brf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "random_state": 42,
    "n_jobs": -1,
    "sampling_strategy": "auto",
    "replacement": True,
}
brf = BalancedRandomForestClassifier(**brf_params)

# MultiOutputClassifier fits one copy of the forest per target column
# (sentiment, topic); fit() returns the fitted wrapper itself.
model = MultiOutputClassifier(brf).fit(X_train_embeddings, y_train)

# ==============================
# 6. Evaluate the model
# ==============================
def _evaluate_split(split_label, features, y_true):
    """Predict one split, print both per-target reports and the joint accuracy.

    Args:
        split_label: Name of the split as it appears in the printed headings.
        features: Embedding matrix of the split.
        y_true: Label array with the columns [sentiment, topic].

    Returns:
        Tuple ``(y_pred, accuracy)`` where ``accuracy`` is the share of rows
        whose sentiment AND topic are both predicted correctly.
    """
    y_pred = model.predict(features)

    # The encoder classes are integers here (the labels in the CSV are already
    # numeric), but classification_report expects string display names.
    sentiment_names = [str(name) for name in sentiment_encoder.classes_]
    # Passing the encoded label ids explicitly keeps names and rows aligned
    # even when a class does not occur in this split.
    sentiment_ids = np.arange(len(sentiment_names))

    print(f"\nüìå K·∫øt qu·∫£ tr√™n t·∫≠p {split_label} cho nh√£n 'sentiment':")
    print(classification_report(y_true[:, 0], y_pred[:, 0],
                                labels=sentiment_ids, target_names=sentiment_names))

    print(f"\nüìå K·∫øt qu·∫£ tr√™n t·∫≠p {split_label} cho nh√£n 'topic':")
    print(classification_report(y_true[:, 1], y_pred[:, 1]))

    accuracy = (y_pred == y_true).all(axis=1).mean()
    print(f"\nüéØ ƒê·ªô ch√≠nh x√°c t·ªïng th·ªÉ tr√™n {split_label}: {accuracy:.4f}")
    return y_pred, accuracy


print("ƒê√°nh gi√° tr√™n t·∫≠p validation...")
y_validation_pred, validation_accuracy = _evaluate_split(
    "validation", X_validation_embeddings, y_validation)

# Evaluate on the test set
print("\nƒê√°nh gi√° tr√™n t·∫≠p test...")
y_test_pred, test_accuracy = _evaluate_split("test", X_test_embeddings, y_test)

# ==============================
# 7. Plot the confusion matrices
# ==============================
fig, (ax_sentiment, ax_topic) = plt.subplots(1, 2, figsize=(15, 6))

# Sentiment (left panel), ticks labelled with the encoder classes
cm_sentiment = confusion_matrix(y_test[:, 0], y_test_pred[:, 0])
sns.heatmap(
    cm_sentiment,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_encoder.classes_,
    yticklabels=sentiment_encoder.classes_,
    ax=ax_sentiment,
)
ax_sentiment.set_title('Ma tr·∫≠n nh·∫ßm l·∫´n - Sentiment')
ax_sentiment.set_ylabel('Th·ª±c t·∫ø')
ax_sentiment.set_xlabel('D·ª± ƒëo√°n')

# Topic (right panel)
cm_topic = confusion_matrix(y_test[:, 1], y_test_pred[:, 1])
sns.heatmap(cm_topic, annot=True, fmt='d', cmap='Blues', ax=ax_topic)
ax_topic.set_title('Ma tr·∫≠n nh·∫ßm l·∫´n - Topic')
ax_topic.set_ylabel('Th·ª±c t·∫ø')
ax_topic.set_xlabel('D·ª± ƒëo√°n')

plt.tight_layout()
plt.show()

# # ==============================
# # 8. L∆∞u m√¥ h√¨nh + encoder
# # ==============================
# joblib.dump({"model": model, "sentiment_encoder": sentiment_encoder}, "multioutput_brf_model.pkl")
# print("‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh v√†o multioutput_brf_model.pkl")


# ==============================
# 8. Save model + encoder + SBERT model name
# ==============================
model_artifact_path = "multioutput_brf_V2_model.pkl"

# Only the name of the SBERT model is stored, not the SBERT model object.
save_obj = dict(
    model=model,                                     # trained multi-output classifier
    sentiment_encoder=sentiment_encoder,             # label encoder for sentiment
    sbert_model_name="sentence-transformers/LaBSE",  # name of the embedding model
)

joblib.dump(save_obj, model_artifact_path)
print(f"‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh v√†o {model_artifact_path}")


K√≠ch th∆∞·ªõc t·∫≠p train: (11426, 3)
K√≠ch th∆∞·ªõc t·∫≠p validation: (1583, 3)
K√≠ch th∆∞·ªõc t·∫≠p test: (3166, 3)
C√°c nh√£n sentiment: [np.int64(0), np.int64(1), np.int64(2)]
ƒêang t·∫£i model SBERT...
ƒêang t·∫°o embedding cho d·ªØ li·ªáu train...


Batches:   0%|          | 1/358 [00:06<35:42,  6.00s/it]


KeyboardInterrupt: 