In [1780]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [1781]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1782]:
df = pd.read_csv('/kaggle/input/100rvamz/reviews.csv')
print(df.head())

                                             comment  star
0  This is a self-published book, and if you want...     1
1  A complete waste of time. Typographical errors...     1
2  I guess you have to be a romance novel lover f...     1
3  I feel I have to write to keep others from was...     1
4  Rather than scratches and insect droppings, th...     1


In [1783]:
df = df[['comment', 'star']]
df.dropna(inplace=True) 

In [1784]:
def clean_text(text):
    text = text.lower() 
    text = re.sub(r'[^a-z\s]', '', text) 
    text = re.sub(r'\s+', ' ', text).strip() 
    return text

In [1785]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return ' '.join([word for word in words if word not in stop_words])

In [1786]:
def preprocess_text(text):
    text = clean_text(text)
    text = remove_stopwords(text)
    return text

In [1787]:
df['clean_comment'] = df['comment'].apply(preprocess_text)
print("\nDữ liệu sau tiền xử lý:")
print(df[['comment', 'clean_comment', 'star']].head())


Dữ liệu sau tiền xử lý:
                                             comment  \
0  This is a self-published book, and if you want...   
1  A complete waste of time. Typographical errors...   
2  I guess you have to be a romance novel lover f...   
3  I feel I have to write to keep others from was...   
4  Rather than scratches and insect droppings, th...   

                                       clean_comment  star  
0  selfpublished book want know whyread paragraph...     1  
1  complete waste time typographical errors poor ...     1  
2  guess romance novel lover one discerning one o...     1  
3  feel write keep others wasting money book seem...     1  
4  rather scratches insect droppings one random p...     1  


In [1788]:
x_dims = 48
vectorizer = TfidfVectorizer()
pca = PCA(n_components=x_dims)

In [1789]:
X = df.clean_comment
X = vectorizer.fit_transform(X).toarray()
X = pca.fit_transform(X)
y = df['star'].astype(int)
print(X)

[[-0.09783284 -0.25367539 -0.10478953 ... -0.0776866   0.03182103
   0.01402933]
 [-0.06568705 -0.11847945  0.00503855 ... -0.04112222  0.12114929
   0.00837941]
 [-0.05535454 -0.06560471 -0.03775411 ...  0.006112   -0.01775409
  -0.02043033]
 ...
 [ 0.17313499  0.01412178  0.04487918 ...  0.00782353  0.05448936
  -0.07682357]
 [ 0.21398105 -0.00325152 -0.02879084 ... -0.00472259  0.0971109
   0.0293673 ]
 [ 0.13078206  0.02873855  0.06661815 ... -0.05468799 -0.01586919
   0.10258325]]


In [1790]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1791]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [1792]:
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [1793]:
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Đánh giá mô hình:")
print(f"Accuracy (Train): {train_accuracy:}")
print(f"Accuracy (Test): {test_accuracy:}")
print("\nClassification Report (Test):\n", classification_report(y_test, y_test_pred))
print("\nConfusion Matrix (Test):\n", confusion_matrix(y_test, y_test_pred))

Đánh giá mô hình:
Accuracy (Train): 1.0
Accuracy (Test): 0.67

Classification Report (Test):
               precision    recall  f1-score   support

           1       0.77      0.61      0.68        28
           2       0.53      0.71      0.61        14
           3       0.53      0.80      0.64        10
           4       0.77      0.71      0.74        24
           5       0.68      0.62      0.65        24

    accuracy                           0.67       100
   macro avg       0.66      0.69      0.66       100
weighted avg       0.69      0.67      0.67       100


Confusion Matrix (Test):
 [[17  2  2  3  4]
 [ 1 10  3  0  0]
 [ 1  1  8  0  0]
 [ 2  2  0 17  3]
 [ 1  4  2  2 15]]


In [1794]:
def check_review(comment, given_star):
    
    comment_cleaned = preprocess_text(comment)
    comment_vector = vectorizer.transform([comment_cleaned]).toarray()
    comment_vector = pca.transform(comment_vector)
    predicted_star = rf_model.predict(comment_vector)[0]
    print(f"\nComment: {comment}")
    print(f"Given Star: {given_star}, Predicted Star: {predicted_star}")
    if given_star == predicted_star:
        print("=> Số sao phù hợp với nội dung review.")
    else:
        print("=> Số sao KHÔNG phù hợp với nội dung review.")

In [1811]:
check_review("I love this, i will buy it again", 5)
check_review("I don't like this, i will never recommend for anyone.", 1)


Comment: I love this, i will buy it again
Given Star: 5, Predicted Star: 5
=> Số sao phù hợp với nội dung review.

Comment: I don't like this, i will never recommend for anyone.
Given Star: 1, Predicted Star: 2
=> Số sao KHÔNG phù hợp với nội dung review.


In [1796]:
import pickle

with open('random_forest_model_with_accuracy.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

In [1797]:
import pickle

with open('/kaggle/working/random_forest_model_with_accuracy.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('/kaggle/working/tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)


In [1798]:
# # Văn bản mới cần dự đoán
# new_comments = [
#     "The product quality is amazing and worth the price.",
#     "Terrible experience. It broke after two days.",
#     "Good product but could be better."
# ]

In [1799]:
# # Nếu bạn có hàm tiền xử lý
# processed_comments = [preprocess_text(comment) for comment in new_comments]

In [1800]:
# # Chuyển đổi văn bản thành vector
# X_new = vectorizer.transform(processed_comments)