**IMPORT THƯ VIỆC CẦN THIẾT**

In [29]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

**DỮ LIỆU** 

**Đọc dữ liệu**

In [30]:
data = pd.read_csv('data/train_data.csv')
data.head()

Unnamed: 0,id,clean_content,class
0,19906519,phản_ánh tác_giả sách_trùng sách nước_ngoài nh...,positive
1,19968834,tạm_thời ổn đọc review hj yhanks,positive
2,19943636,quyển sách tựa_đề lầm khách_hàng viết không đủ...,negative
3,19919040,giao hàng đóng_gói cẩn_thận,positive
4,19963835,tạm đọc sách tác_giả nội_sưu_tầm trùng_lặp,negative


In [31]:
data.shape

(14823, 3)

**Xóa null và duplicate data**

In [32]:
data.dropna(subset='clean_content', inplace=True)
data.drop_duplicates(subset='clean_content', inplace=True)
data['class'] = data['class'].str.strip()
data.shape

(14823, 3)

In [33]:
data['class'].value_counts()

class
positive    11353
negative     3470
Name: count, dtype: int64

**FEATURE ENGINEERING**

**Mã hóa biến mục tiêu**

In [34]:
data['class'] = data['class'].map({'positive': 1, 'negative': 0})

In [35]:
data['class'].value_counts()

class
1    11353
0     3470
Name: count, dtype: int64

**TFIDF VECTORIZER**

In [36]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,5), max_df=0.5, min_df=5, sublinear_tf=True, norm='l2')

In [37]:
texts = tfidf_vectorizer.fit_transform(data['clean_content'].values.astype('U'))

In [38]:
df = pd.DataFrame(texts.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [39]:
df.head()

Unnamed: 0,a_z,achilles,afghanistan,agatha,agatha christie,ah,all,amazon,anh_chị,anne,...,ủng_hộ shop,ủng_hộ sách,ủng_hộ tiki,ủng_hộ tiếp,ủng_hộ tác_giả,ủng_hộ tương,ức_chế,ứng_dụng,ứng_dụng sách,ứng_xử
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
texts.shape

(14823, 9940)

In [41]:
labels = data['class']

In [42]:
print(texts.shape, labels.shape)

(14823, 9940) (14823,)


**XỬ LÝ DỮ LIỆU MẤT CÂN BẰNG BẰNG SMOTE**


In [43]:
smt = SMOTE(random_state=42)
texts_smt, labels_smt = smt.fit_resample(texts,labels)

In [44]:
texts_smt.shape

(22706, 9940)

In [45]:
labels_smt.shape

(22706,)

**CHIA DATA THÀNH TẬP TRAIN VÀ TẬP TEST**

In [46]:
X_train, X_test, y_train, y_test = train_test_split(texts_smt, labels_smt, test_size=0.3, random_state=42)

**HUẤN LUYỆN MÔ HÌNH VỚI HỌC MÁY**

**Chọn model**

In [47]:
LogReg = LogisticRegression()
SVCModel = SVC()
RForest = RandomForestClassifier()

models = [LogReg, SVCModel, RForest]
models_dict = {0: "Logistic Regression", 1: "SVC", 2: "Random Forest"}

In [48]:
results = []

for i, model in enumerate(models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report1 = metrics.classification_report(y_test, y_pred, labels=[1,0], digits=3)
    print(f"Classification Report for model {models_dict[i]}:\n", report1)

Classification Report for model Logistic Regression:
               precision    recall  f1-score   support

           1      0.921     0.886     0.903      3380
           0      0.891     0.925     0.908      3432

    accuracy                          0.905      6812
   macro avg      0.906     0.905     0.905      6812
weighted avg      0.906     0.905     0.905      6812

Classification Report for model SVC:
               precision    recall  f1-score   support

           1      0.930     0.969     0.949      3380
           0      0.968     0.928     0.948      3432

    accuracy                          0.948      6812
   macro avg      0.949     0.948     0.948      6812
weighted avg      0.949     0.948     0.948      6812

Classification Report for model Random Forest:
               precision    recall  f1-score   support

           1      0.943     0.899     0.920      3380
           0      0.905     0.947     0.925      3432

    accuracy                          0.92

**Hyperparameter Tuning**

In [49]:
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [0.125, 0.25, 0.5, 1, 2, 4],
    'gamma': [0.125, 0.25, 0.5, 1, 2, 4]
}

grid = GridSearchCV(SVCModel, param_grid=parameters, scoring='accuracy', cv=5)  
grid_search = grid.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best estimator:", grid_search.best_estimator_)
print('Best parameters:', grid.best_params_)

Best score: 0.946
Best estimator: SVC(C=4, gamma=2)
Best parameters: {'C': 4, 'gamma': 2, 'kernel': 'rbf'}


**Xây dựng mô hình**

In [50]:
model = SVC(C=4, gamma=2, kernel='rbf')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

report1 = metrics.classification_report(y_test, y_pred, labels=[1, 0], digits=3)
print("Classification Report on Test Set:\n", report1)

Classification Report on Test Set:
               precision    recall  f1-score   support

           1      0.941     0.975     0.957      3380
           0      0.974     0.940     0.957      3432

    accuracy                          0.957      6812
   macro avg      0.957     0.957     0.957      6812
weighted avg      0.958     0.957     0.957      6812



**Kiểm tra overfitting để phát hiện ra lỗi**

In [51]:
model = SVC(C=4, gamma=2, kernel='rbf')
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
report2 = metrics.classification_report(y_train, y_train_pred, labels=[1,0], digits=3)
print("Classification Report on Train Set:\n", report2)

Classification Report on Train Set:
               precision    recall  f1-score   support

           1      1.000     0.998     0.999      7973
           0      0.998     1.000     0.999      7921

    accuracy                          0.999     15894
   macro avg      0.999     0.999     0.999     15894
weighted avg      0.999     0.999     0.999     15894



**Phân tích lỗi để phát hiện ra các data đã bị dán nhãn sai**

In [52]:
mismatches = []

for id, review, true_label, predicted_label in zip(data['id'], X_train, y_train, y_train_pred):
    if true_label != predicted_label:
        if true_label != 1:
            mismatches.append(f"{id}, True Label: {true_label}, Predicted Label: {predicted_label}")
            print(f"{id}, True Label: {true_label}, Predicted Label: {predicted_label}")

result_length = len(mismatches)
print(f"Số lượng bản ghi không khớp: {result_length}")

Số lượng bản ghi không khớp: 0


**Huấn luyện lại mô hình với phương pháp Crossvalidation 5 folds để đánh giá công bằng hơn sau khi gán lại nhãn**

In [53]:
cv_results = cross_val_score(SVC(C=4, gamma=2, kernel='rbf'), X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)

print(f"Cross-validation Accuracy Scores: {cv_results}")
print(f"Mean Accuracy: {cv_results.mean()}")
print(f"Standard Deviation: {cv_results.std()}")

Cross-validation Accuracy Scores: [0.94054734 0.94966971 0.94904058 0.94400755 0.94745123]
Mean Accuracy: 0.946143280983151
Standard Deviation: 0.0034180878928392316
