In [1]:
!pip install scikit-learn imbalanced-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
category-encoders 2.7.0 requires scikit-learn<1.6.0,>=1.0.0, but you have scikit-learn 1.6.1 which is incompatible.

In [5]:
# Import the necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

In [6]:
# Read the pre-cleaned comment dataset from the Kaggle input directory
df = pd.read_csv('/kaggle/input/chongpha-ver2/Dataset_ChongPha.csv')

# The model input is the single cleaned-comment column, exposed as 'text'
df['text'] = df['comment_clean']

# Map the string labels to integer codes; label_encoder.classes_ keeps the
# code -> label order needed to read the numeric reports further down
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

# Quick sanity summary: size, label vocabulary and class balance
print(f"Data shape: {df.shape}")
print(f"Labels: {label_encoder.classes_}")
print(f"Label distribution:\n{df['label'].value_counts()}")

Data shape: (18912, 6)
Labels: ['KHONG_LIEN_QUAN' 'KHONG_PHAN_DONG' 'PHAN_DONG']
Label distribution:
label
KHONG_LIEN_QUAN    10004
KHONG_PHAN_DONG     6744
PHAN_DONG           2164
Name: count, dtype: int64


In [16]:
# Model input (cleaned comment text) and target (integer-encoded label)
X = df["text"]
Y = df["encoded_label"]

# Single stratified hold-out split: 10% of the rows go to the test set and
# the class proportions of Y are preserved in both parts
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2025,
    stratify=Y,
)

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training split only, then applied unchanged to both splits
Tfid_vectorizer = TfidfVectorizer()
Tfid_vectorizer.fit(X_train)
X_train_Tfid = Tfid_vectorizer.transform(X_train)
X_test_Tfid = Tfid_vectorizer.transform(X_test)

In [11]:
# Count how many training rows carry each encoded label before any resampling
train_class_counts = Counter(Y_train)
print("Original class distribution in training data:")
print(train_class_counts)

Original class distribution in training data:
Counter({0: 9003, 1: 6069, 2: 1948})


In [12]:
# Method 1: SMOTE oversampling
# Resample the training split only; the test split must stay untouched.
# The input is the TF-IDF training matrix. The name used here before
# (X_train_count) is not defined by any cell of this notebook and only
# resolved through stale kernel state, so the cell failed on a fresh kernel.
# NOTE(review): the classifiers fitted later in this notebook train on
# X_train_Tfid, not on X_smote / y_smote - confirm whether that is intended.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_Tfid, Y_train)

print("\nClass distribution after SMOTE:")
print(Counter(y_smote))


Class distribution after SMOTE:
Counter({0: 9003, 1: 9003, 2: 9003})


In [13]:
# Show the shape changes introduced by oversampling.
# Uses the TF-IDF matrices: the X_train_count / X_test_count names referenced
# here before are not defined by any cell of this notebook.
print(f"\nOriginal training data shape: {X_train_Tfid.shape}")
print(f"Oversampled training data shape: {X_smote.shape}")

# We keep the test data unchanged
print(f"\nTest data shape: {X_test_Tfid.shape}")


Original training data shape: (17020, 7523)
Oversampled training data shape: (27009, 7523)

Test data shape: (1892, 7523)


In [20]:
# Random forest settings: 600 trees, fixed seed for repeatable results
rf_params = {"n_estimators": 600, "random_state": 2025}
random_forest_model = RandomForestClassifier(**rf_params)

# Train on the TF-IDF features of the training split
random_forest_model.fit(X_train_Tfid, Y_train)

In [22]:
from sklearn.metrics import classification_report

# Make predictions on the test data.
# The forest was fitted on X_train_Tfid, so it must be scored on the matching
# TF-IDF test matrix; X_test_count is not defined by any cell of this notebook.
y_pred = random_forest_model.predict(X_test_Tfid)

# Calculate the accuracy of the model
accuracy = metrics.accuracy_score(Y_test, y_pred)

# Print the accuracy
print(f"Accuracy of Random Forest Classifier: {accuracy:.2f}")

# Per-class precision / recall / F1; rows are the integer codes produced by
# label_encoder (see label_encoder.classes_ for the code -> label order)
print("\nClassification Report:")
print(classification_report(Y_test, y_pred))

Accuracy of Random Forest Classifier: 0.70

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.88      0.79      1001
           1       0.67      0.58      0.62       675
           2       0.75      0.21      0.33       216

    accuracy                           0.70      1892
   macro avg       0.71      0.56      0.58      1892
weighted avg       0.70      0.70      0.67      1892



In [24]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings
clf = MultinomialNB()

# Train on the TF-IDF features of the training split
clf.fit(X=X_train_Tfid, y=Y_train)

In [25]:
# Make predictions on the test data with the Multinomial Naive Bayes model
y_pred = clf.predict(X_test_Tfid)

# Calculate the accuracy of the model
accuracy = metrics.accuracy_score(Y_test, y_pred)

# Print the accuracy (the label previously said "Random Forest Classifier",
# a copy-paste leftover that mislabelled this model's score)
print(f"Accuracy of Multinomial Naive Bayes: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(Y_test, y_pred))

Accuracy of Random Forest Classifier: 0.66

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.91      0.77      1001
           1       0.64      0.50      0.56       675
           2       1.00      0.04      0.07       216

    accuracy                           0.66      1892
   macro avg       0.77      0.48      0.47      1892
weighted avg       0.70      0.66      0.62      1892



In [26]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, trained on the TF-IDF training
# features; fit() returns the estimator itself, which is bound to clf
log_reg = LogisticRegression(random_state=0)
clf = log_reg.fit(X_train_Tfid, Y_train)

In [27]:
# Make predictions on the test data with the Logistic Regression model
y_pred = clf.predict(X_test_Tfid)

# Calculate the accuracy of the model
accuracy = metrics.accuracy_score(Y_test, y_pred)

# Print the accuracy (the label previously said "Random Forest Classifier",
# a copy-paste leftover that mislabelled this model's score)
print(f"Accuracy of Logistic Regression: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(Y_test, y_pred))

Accuracy of Random Forest Classifier: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.88      0.80      1001
           1       0.69      0.61      0.65       675
           2       0.72      0.33      0.45       216

    accuracy                           0.72      1892
   macro avg       0.72      0.61      0.63      1892
weighted avg       0.72      0.72      0.71      1892

