In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read the labelled comments from the CSV file (relative path, resolved
# against the current working directory).
vietnamese_comments = pd.read_csv('vietnamese_comments.csv')
# Preview the data; head() returns the first five rows by default.
print(vietnamese_comments.head())

                       comment label  rate Unnamed: 3
0               Áo bao đẹp ạ!!   POS     5        NaN
1                  Tuyệt vời !   POS     5        NaN
2   2day ao khong giong trong.   NEG     1        NaN
3  Mùi thơm,bôi lên da mềm da.   POS     5        NaN
4            Vải đẹp, dày dặn.   POS     5        NaN


In [5]:
# Features: the contents of the 'comment' column as a plain Python list.
X = vietnamese_comments['comment'].to_numpy().tolist()
# Targets: the contents of the 'label' column, in the same row order as X.
y = vietnamese_comments['label'].to_numpy().tolist()

In [6]:
# Use CountVectorizer to convert each review in X to a vector of number
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(X)
vocabulary = vectorizer.get_feature_names_out()

In [7]:
# Display the learned vocabulary (the unique tokens found in the dataset).
print(f"Vocabulary: {vocabulary}")

Vocabulary: ['000' '000đ' '01' ... 'ửinkled' 'ửng' '쓰레기']


In [8]:
# Report how many distinct tokens the vocabulary contains.
print(f"Number of unique words: {len(vocabulary)}")

Number of unique words: 5861


In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_counts,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Multinomial Naive Bayes with default hyper-parameters, fitted on the
# training portion of the count matrix.
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

In [11]:
# Predict the label of a single new review.
# NOTE(review): this sample is written in English, while the vocabulary printed
# above was learned from the Vietnamese comment file. CountVectorizer.transform
# ignores tokens that are not in the fitted vocabulary, so most of this text is
# presumably dropped before classification — confirm this is the intended check.
new_review = ["'Doctor Strange' is not a perfect film. If you expect a movie to have depth and the point, you should skip, not only this one, but whole Marvel production."]
# Reuse the already-fitted vectorizer so the columns line up with the training matrix.
new_review_vectorized = vectorizer.transform(new_review)
# MultinomialNB accepts sparse matrices directly, so the row is passed as-is
# instead of being densified with .toarray().
print(clf.predict(new_review_vectorized))
# Class probabilities, ordered like clf.classes_.
print(clf.predict_proba(new_review_vectorized))

['NEU']
[[2.06004393e-03 9.97686455e-01 2.53500663e-04]]


In [12]:
# Mean accuracy of the fitted classifier on the held-out test rows.
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

0.721233312142403
