In [None]:
# 10.1 Classification Demo: Naive Bayes & SVM (Dùng file CSV)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# --- Load the data from a CSV file ---
# 👉 Replace the path below with the location of your copy of the dataset.
df = pd.read_csv("/content/IMDB Dataset.csv")

# Show the first few rows as a quick sanity check.
print(df.head())

# --- Preprocessing ---
# Convert the text labels to integers. Case and surrounding whitespace are
# normalised first so that variants such as "Positive " still map correctly.
label_map = {'positive': 1, 'negative': 0}
df['label'] = (
    df['sentiment'].astype(str).str.strip().str.lower().map(label_map)
)

# Fail fast with a readable message instead of letting NaN labels reach
# the classifiers, where the resulting error is much harder to interpret.
unmapped = df['label'].isna()
if unmapped.any():
    examples = df.loc[unmapped, 'sentiment'].unique()[:5].tolist()
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a sentiment other than "
        f"{sorted(label_map)}; examples: {examples}"
    )
df['label'] = df['label'].astype(int)

# Build the parallel lists of documents and labels.
# The reviews contain literal HTML line breaks ("<br />"); replace them with
# a space so the markup does not end up in the TF-IDF vocabulary as "br".
texts = (
    df['review']
    .astype(str)
    .str.replace(r"(?i)<br\s*/?>", " ", regex=True)
    .tolist()
)
labels = df['label'].tolist()

print("Số lượng mẫu:", len(texts))

# --- Train / test split ---
# 70% train, 30% test; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42
)

# --- TF-IDF feature representation ---
# The vocabulary is fitted on the training split only and then reused to
# transform the test split, so no test-set information leaks into training.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# --- Naive Bayes ---
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)
nb_pred = nb_model.predict(X_test_vec)

print("\n=== Naive Bayes ===")
print(f"Accuracy: {accuracy_score(y_test, nb_pred):.2f}")
print(classification_report(y_test, nb_pred, zero_division=0))

# --- Support Vector Machine ---
svm_model = LinearSVC()
svm_model.fit(X_train_vec, y_train)
svm_pred = svm_model.predict(X_test_vec)

print("\n=== Support Vector Machine (SVM) ===")
print(f"Accuracy: {accuracy_score(y_test, svm_pred):.2f}")
print(classification_report(y_test, svm_pred, zero_division=0))


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Số lượng mẫu: 50000

=== Naive Bayes ===
Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.85      0.86      0.85      7411
           1       0.86      0.85      0.86      7589

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000


=== Support Vector Machine (SVM) ===
Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      7411
           1       0.88      0.89      0.89      7589

    accuracy          