In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
path = "/content/drive/MyDrive/Colab Notebooks/MachineLearning/SentimentAnalysis"
test_path = os.path.join(path, "data", "test.csv")
train_path = os.path.join(path, "data", "train.csv")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
train.shape

(25000, 3)

In [None]:
test.shape

(25000, 3)

### Preprocessing and EDA

In [None]:
# Xóa cột dir

In [None]:
train = train.drop("dir", axis=1)

Unnamed: 0,text,score
0,Story of a man who has unnatural feelings for ...,3
1,Airport '77 starts as a brand new luxury 747 p...,4
2,This film lacked something I couldn't put my f...,4
3,"Sorry everyone,,, I know this is supposed to b...",1
4,When I was little my parents took me along to ...,1


In [None]:
test = test.drop("dir", axis=1)

Unnamed: 0,text,score
0,Once again Mr. Costner has dragged out a movie...,2
1,This is an example of why the majority of acti...,4
2,"First of all I hate those moronic rappers, who...",1
3,Not even the Beatles could write songs everyon...,3
4,Brass pictures (movies is not a fitting word f...,3


In [None]:
# Shuffle train.csv
RANDOM_STATE = 42
train = train.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

Unnamed: 0,text,score
0,"Silent Night, Deadly Night 5 is the very last ...",4
1,The idea ia a very short film with a lot of in...,10
2,"For me, this movie just seemed to fall on its ...",4
3,Was this based on a comic-book? A video-game? ...,7
4,Caution: May contain spoilers...<br /><br />I'...,10


In [None]:
# score <= 4: neg, score >= 7: pos
train["score"] = train["score"].apply(
    lambda x: 0 if x <= 4 else (1 if x >= 7 else None)
)
test["score"] = test["score"].apply(
    lambda x: 0 if x <= 4 else (1 if x >= 7 else None)
)

In [None]:
train["score"].unique()

array([0, 1])

In [None]:
test["score"].unique()

array([0, 1])

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   score   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   score   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [None]:
train.duplicated().sum()

np.int64(96)

In [None]:
test.duplicated().sum()

np.int64(199)

In [None]:
# drop duplicates
train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)

In [None]:
train.duplicated().sum()

np.int64(0)

In [None]:
test.duplicated().sum()

np.int64(0)

In [None]:
# lower text
train["text"] = train["text"].str.lower()
test["text"] = test["text"].str.lower()

In [None]:
train[train["text"].apply(lambda x: x.endswith(" ") or x.startswith(" "))]

Unnamed: 0,text,score


In [None]:
import re
# Mẫu regex để nhận diện URL
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Lọc các comment có chứa URL
comments_with_urls = train[train['text'].str.contains(url_pattern, na=False)]

# Hiển thị vài dòng đầu tiên
print(comments_with_urls.head())

                                                   text  score
1564  the secret of kells is one of the most unique,...      1
1624  i am watching the series back to back as fast ...      1
1837  i run a group to stop comedian exploitation an...      0
3834  the mere fact that i still think of the movie ...      1
4004  a super comedy series from the 1990s (two seri...      1


In [None]:
train = train[~(train['text'].str.contains(url_pattern, na=False))]

In [None]:
import re
# Mẫu regex để nhận diện URL
url_pattern = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

# Lọc các comment có chứa URL
comments_with_urls = test[test['text'].str.contains(url_pattern, na=False)]

# Hiển thị vài dòng đầu tiên
print(comments_with_urls.head())

                                                   text  score
912   i think that most people would agree with me i...      0
1009  being a huge fan of conte d'ete ( http://www.i...      0
1040  did uwe boll seriously just rip off the basic ...      0
1545  this is a horrific re-make of the french movie...      0
1748  "scientists at a remote lab experiment on (ins...      0


In [None]:
test = test[~(test['text'].str.contains(url_pattern, na=False))]

In [None]:
comments_with_newline = train[train["text"].str.contains('\n')]
comments_with_newline.head()

Unnamed: 0,text,score


In [None]:
comments_with_newline = test[test["text"].str.contains('\n')]
comments_with_newline.head()

Unnamed: 0,text,score


In [None]:
train[train["text"].str.contains("<", na=False)].head()

Unnamed: 0,text,score
0,"silent night, deadly night 5 is the very last ...",0
3,was this based on a comic-book? a video-game? ...,1
4,caution: may contain spoilers...<br /><br />i'...,1
8,"skippy from ""family ties"" plays eddie, a wussy...",0
9,mr perlman gives a standout performance (as us...,0


In [None]:
test[test["text"].str.contains("<", na=False)].head()

Unnamed: 0,text,score
1,this is an example of why the majority of acti...,0
2,"first of all i hate those moronic rappers, who...",0
4,brass pictures (movies is not a fitting word f...,0
6,this german horror film has to be one of the w...,0
7,"being a long-time fan of japanese film, i expe...",0


In [None]:
import html
def clear_html(text: str) -> str:
    if not isinstance(text, str):
        return ""
    # Bỏ nội dung trong <script> và <style>
    text = re.sub(r"(?is)<(script|style).*?>.*?</\1>", " ", text)
    # Bỏ tất cả thẻ HTML còn lại
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    # Chuẩn hoá &entity; → ký tự thật (ví dụ &amp; → &)
    text = html.unescape(text)
    # Xoá các khoảng trắng thừa
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Ví dụ áp dụng cho cột 'clean_comment' của DataFrame df
train["text"] = train["text"].apply(clear_html)
test["text"] = test["text"].apply(clear_html)

In [None]:
train[train["text"].str.contains("<", na=False)].head(15)

Unnamed: 0,text,score
7763,yaaaaaaaaaaaaaawwwwwwwwwwwwwwwwwnnnnnnnnnnnnn!...,0
7846,if it smells like garbage and if it looks like...,0
12903,"this movie may seem scary on commercials, but ...",0
13022,>>>>>>>>>>>>> >>>>>>>>>>>>>>>>> >>>>>>>>>>>>>>...,0
15296,"*** this contains many, many spoilers, not tha...",0
22737,rented and watched this short (< 90 minutes) w...,1
23693,this can't be mandy schaffer's last film. some...,0


In [None]:
test[test["text"].str.contains("<", na=False)].head()

Unnamed: 0,text,score
742,one of the most awaited movie!i thought himesh...,0
3251,this movie over does it on the cgi i mean sci-...,0
4368,"okay wait let me get this street, there are ac...",0
10210,assault on precinct 13 is the absolute dumbest...,0
11965,since this show was changed from tss (the scre...,0


In [None]:
train["text"][7763]

'yaaaaaaaaaaaaaawwwwwwwwwwwwwwwwwnnnnnnnnnnnnn! :=8o zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz........... oh, um excuse me, sorry, fell asleep there for a mooment. now where was i? oh yes, "the projected man", yes... zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz........... ooops, sorry. yes, "the projected man". well, it\'s a british sci-fi yawnfest about nothing. some orange-headed guy projects himself on a laser, gets the touch of death. at last he vanishes, the end. actually, the film\'s not even that interesting. dull, droning, starchy, stiff, and back-breakingly boring, "the projected man" is 77 solid minutes of nothing, starring nobody. dull as dishwater. dull as doorknob dust. dull as ethan hawke - we\'re talking really dull here, people! but wait, in respect to our dull cousins from across the puddle, the moocow will now do a proper review for "the projected man": zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz.............. <=8.'

In [None]:
train["word_count"] = train["text"].apply(lambda x: len(x.split()))
test["word_count"] = test["text"].apply(lambda x: len(x.split()))

In [None]:
import nltk
from nltk.corpus import stopwords

# Tải stopwords (nếu chưa có)
nltk.download('stopwords')

# Lấy danh sách stopwords tiếng Anh
stop_words = set(stopwords.words('english'))

# Tạo cột mới 'num_stop_words' = số lượng stopwords trong mỗi comment
def count_stopwords(text):
    if not isinstance(text, str):
        return 0
    words = text.split()
    return sum(1 for word in words if word.lower() in stop_words)

train["num_stop_words"] = train["text"].apply(count_stopwords)
test["num_stop_words"] = test["text"].apply(count_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
train["num_chars"] = train["text"].apply(len)
test["num_chars"] = test["text"].apply(len)

In [None]:
import string
train["num_punctuation_chars"] = train["text"].apply(lambda x: len([c for c in x if c in string.punctuation]))
test["num_punctuation_chars"] = test["text"].apply(lambda x: len([c for c in x if c in string.punctuation]))

In [None]:
train["text"] = train["text"].apply(lambda x: re.sub(r'[^A-Za-z0-9\s!?.,]', '', str(x)))
test["text"] = test["text"].apply(lambda x: re.sub(r'[^A-Za-z0-9\s!?.,]', '', str(x)))

In [None]:
from nltk.corpus import stopwords

# Defining stop words but keeping essential ones for sentiment analysis
stop_words = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}

# Remove stop words from 'clean_comment' column, retaining essential ones
train['text'] = train['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
)
test['text'] = test['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
)

In [None]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# Define the lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to the 'text' column
train['text'] = train['text'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)
test['text'] = test['text'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
train.head()

In [None]:
test.head()

### Chia train, test thành X, y

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [None]:
ngram_range = (1, 3)
max_features = 5000

In [None]:
X_train = train.drop(["score"], axis=1)
y_train = train["score"]
X_test = test.drop(["score"], axis=1)
y_test = test["score"]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((24843, 5), (24843,), (24741, 5), (24741,))

In [None]:
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

In [None]:
# Chuyển đổi dữ liệu text thành TF-IDF vectors
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_test_tfidf = vectorizer.transform(X_test['text'])

In [None]:
# Sử dụng SMOTE để cân bằng dữ liệu nếu cần
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [None]:
X_train.columns

Index(['text', 'word_count', 'num_stop_words', 'num_chars',
       'num_punctuation_chars'],
      dtype='object')

In [None]:
### Nếu muốn kết hợp với các đặc trưng trước đó

In [None]:
# Chuyển các cột khác thành numpy array
X_train_other = X_train[['word_count', 'num_stop_words', 'num_chars', 'num_punctuation_chars']].values
X_test_other = X_test[['word_count', 'num_stop_words', 'num_chars', 'num_punctuation_chars']].values

In [None]:
from scipy.sparse import hstack

# Kết hợp TF-IDF và các cột khác cho X_train và X_test
X_train_combined = hstack([X_train_tfidf, X_train_other])
X_test_combined = hstack([X_test_tfidf, X_test_other])


In [None]:
### ======================================

### XGBoost

In [None]:
import xgboost as xgb

In [151]:
# Chuyển về dạng DMatrix (tối ưu cho XGBoost)
dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test_tfidf)

In [152]:
params = {
    'objective': 'binary:logistic',  # Bài toán phân loại nhị phân
    'eval_metric': 'logloss',        # Sử dụng log loss làm chỉ số đánh giá
    'tree_method': 'gpu_hist',       # Sử dụng GPU để huấn luyện
    'predictor': 'gpu_predictor',    # Sử dụng GPU cho dự đoán
    'learning_rate': 0.2,            # Tốc độ học
    'max_depth': 6,                  # Độ sâu tối đa của cây
    'n_estimators': 100            # Số lượng cây
}

In [153]:
# Huấn luyện mô hình
model = xgb.train(params, dtrain, num_boost_round=200)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "n_estimators", "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [154]:
# Dự đoán trên X_test
y_pred = model.predict(dtest)
y_pred = np.round(y_pred)  # Chuyển giá trị dự đoán thành 0 hoặc 1


    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


In [156]:
# Kết quả của mô hình trên
# Accuracy: 0.8683
# Precision: 0.8590
# Recall: 0.8822
# F1 Score: 0.8704
# Classification Report:
#               precision    recall  f1-score   support

#            0       0.88      0.85      0.87     12332
#            1       0.86      0.88      0.87     12409

#     accuracy                           0.87     24741
#    macro avg       0.87      0.87      0.87     24741
# weighted avg       0.87      0.87      0.87     24741

### LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Tạo DMatrix cho LGBM
train_data = lgb.Dataset(X_train_resampled, label=y_train_resampled)
test_data = lgb.Dataset(X_test_tfidf, label=y_test, reference=train_data)

In [129]:
# Tham số LightGBM với GPU và early stopping
params = {
    'objective': 'binary',  # Phân loại nhị phân
    'metric': 'binary_logloss',  # Đánh giá theo log loss
    'boosting_type': 'gbdt',  # Sử dụng boosting truyền thống
    'num_leaves': 31,  # Số lá của cây
    'learning_rate': 0.1,  # Tốc độ học
    'feature_fraction': 0.9,  # Tỉ lệ chọn đặc trưng
    'bagging_fraction': 0.8,  # Tỉ lệ chọn mẫu
    'bagging_freq': 5,  # Thực hiện bagging mỗi 5 lần
    'lambda_l1': 0.1,  # Regularization l1
    'lambda_l2': 0.1,  # Regularization l2
    'tree_method': 'gpu_hist',  # Sử dụng GPU cho histogram-based tree building
    'predictor': 'gpu_predictor',  # Dự đoán trên GPU
    'early_stopping_rounds': 50  # Dừng huấn luyện nếu không cải thiện trong 50 vòng
}

In [130]:
# Huấn luyện mô hình LGBM
model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=1000)

[LightGBM] [Info] Number of positive: 12437, number of negative: 12437
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.151054 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 487485
[LightGBM] [Info] Number of data points in the train set: 24874, number of used features: 5000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[427]	valid_0's binary_logloss: 0.270804


In [137]:
# Dự đoán trên tập test
print("Đang dự đoán")
y_pred = model.predict(X_test_tfidf, num_iteration=model.best_iteration)
y_pred = np.round(y_pred)  # Chuyển giá trị dự đoán thành 0 hoặc 1
print("Dự đoán xong")

Đang dự đoán
Dự đoán xong


In [136]:
# Kết quả của mô hình trên
# Accuracy: 0.8855
# Precision: 0.8832
# Recall: 0.8893
# F1 Score: 0.8862
# Classification Report:
#               precision    recall  f1-score   support

#            0       0.89      0.88      0.88     12332
#            1       0.88      0.89      0.89     12409

#     accuracy                           0.89     24741
#    macro avg       0.89      0.89      0.89     24741
# weighted avg       0.89      0.89      0.89     24741

### Random Forest

In [144]:
# Các thư viện này cho phép sử dụng GPU cho các thuật toán học máy
import cuml
import cupy as cp
import pandas as pd
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.feature_extraction.text import TfidfVectorizer as cuTfidfVectorizer

In [146]:
# Chuyển sparse matrix thành dense array trước khi huấn luyện
X_train_dense = X_train_resampled.toarray()
X_test_dense = X_test_combined.toarray()

In [147]:
# Huấn luyện mô hình cuML Random Forest với dense data
rf_model = cuRF(n_estimators=100, max_depth=6)
rf_model.fit(X_train_dense, y_train_resampled)

In [148]:
# Dự đoán trên tập test
y_pred = rf_model.predict(X_test_dense)

In [150]:
# Kết quả của mô hình trên
# Accuracy: 0.5047
# Precision: 0.5036
# Recall: 0.8624
# F1 Score: 0.6359
# Classification Report:
#               precision    recall  f1-score   support

#            0       0.51      0.14      0.23     12332
#            1       0.50      0.86      0.64     12409

#     accuracy                           0.50     24741
#    macro avg       0.51      0.50      0.43     24741
# weighted avg       0.51      0.50      0.43     24741

### KNN

In [138]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [139]:
# Chuẩn hóa dữ liệu (bắt buộc đối với KNN)
scaler = StandardScaler(with_mean=False)  # Đảm bảo không trừ trung bình vì X_train_combined là sparse matrix
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test_tfidf )

In [140]:
# Huấn luyện mô hình KNN
knn = KNeighborsClassifier(n_neighbors=5)  # Bạn có thể thay đổi n_neighbors
knn.fit(X_train_scaled, y_train_resampled)

In [141]:
# Dự đoán trên tập test
y_pred = knn.predict(X_test_scaled)

In [None]:
# Kết quả của mô hình trên
# Accuracy: 0.5988
# Precision: 0.6201
# Recall: 0.5165
# F1 Score: 0.5636
# Classification Report:
#               precision    recall  f1-score   support

#            0       0.58      0.68      0.63     12332
#            1       0.62      0.52      0.56     12409

#     accuracy                           0.60     24741
#    macro avg       0.60      0.60      0.60     24741
# weighted avg       0.60      0.60      0.60     24741

### Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [155]:
# Đánh giá mô hình
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# In kết quả
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Classification Report:\n{report}")

Accuracy: 0.8683
Precision: 0.8590
Recall: 0.8822
F1 Score: 0.8704
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.87     12332
           1       0.86      0.88      0.87     12409

    accuracy                           0.87     24741
   macro avg       0.87      0.87      0.87     24741
weighted avg       0.87      0.87      0.87     24741

