In [2]:
# B22DCAT240-Phạm Xuân Quý
import csv, random
from datetime import datetime, timedelta

# Synthetic review dataset: every user writes REVIEWS_PER_USER reviews,
# each about an item drawn at random (with replacement) from `items`.
NUM_USERS = 1000
NUM_ITEMS = 500
REVIEWS_PER_USER = 50
users = [f"user_{i:04d}" for i in range(1, NUM_USERS+1)]
items = [f"item_{i:04d}" for i in range(1, NUM_ITEMS+1)]

templates_pos = ["S·∫£n ph·∫©m t·ªët, r·∫•t h√†i l√≤ng", "Ch·∫•t l∆∞·ª£ng v∆∞·ª£t mong ƒë·ª£i", "ƒê√°ng ti·ªÅn, mua l·∫°i"]
templates_neg = ["Kh√¥ng nh∆∞ m√¥ t·∫£, th·∫•t v·ªçng", "Giao h√†ng h·ªèng", "Ch·∫•t l∆∞·ª£ng k√©m"]

# A dedicated, seeded generator makes the file identical on every
# "Restart & Run All" without touching the global `random` state.
review_rng = random.Random(42)

outfile = "itemReview_pxquy.csv"
with open(outfile, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id","item_id","rating","review_text","date"])
    start = datetime(2023,1,1)
    for u in users:
        for _ in range(REVIEWS_PER_USER):
            item = review_rng.choice(items)
            # Cumulative thresholds: 5% -> 1, 7% -> 2, 23% -> 3, 40% -> 4, 25% -> 5 stars.
            p = review_rng.random()
            if p < 0.05: rating = 1
            elif p < 0.12: rating = 2
            elif p < 0.35: rating = 3
            elif p < 0.75: rating = 4
            else: rating = 5
            # Review text follows the rating: positive for >= 4, negative for <= 2, neutral for 3.
            review = review_rng.choice(templates_pos if rating>=4 else templates_neg if rating<=2 else ["B√¨nh th∆∞·ªùng, ·ªïn"])
            # Review date: `start` plus 0..1000 days (both ends included).
            date = (start + timedelta(days=review_rng.randint(0,1000))).strftime("%Y-%m-%d")
            writer.writerow([u, item, rating, review, date])
print("Saved", outfile)


Saved itemReview_pxquy.csv


In [3]:
import pandas as pd, re
from pyvi import ViTokenizer

# Tokens dropped from every review after word segmentation.
vn_stopwords = {"v√†","l√†","c·ªßa","c√≥","cho","nh·ªØng","ƒë√£","r·∫•t","r·ªìi","v·ªõi","m·ªôt","c√°c","tr√™n","t·ª´"}

def preprocess_text(text):
    """Normalise one review and return its cleaned tokens joined by spaces.

    Steps: convert to `str`, compose to Unicode NFC, lower-case, replace every
    character that is not a letter, digit or whitespace with a space, collapse
    whitespace, segment with `ViTokenizer.tokenize`, drop `vn_stopwords`.

    The character filter relies on the Unicode-aware `\\w` class instead of a
    hand-written list of Vietnamese letters: a hand-written list that misses
    a letter splits every word containing it into fragments.
    """
    import unicodedata  # local import: this cell's import line only brings in pandas and re

    # NFC first, so a base letter followed by combining marks becomes one
    # precomposed letter that `\w` recognises.
    text = unicodedata.normalize("NFC", str(text)).lower()
    # `\w` also matches "_", which the filter must still remove before segmentation.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokenized = ViTokenizer.tokenize(text)
    tokens = [t for t in tokenized.split() if t not in vn_stopwords]
    return " ".join(tokens)

# Load the raw reviews, derive the cleaned-text column, persist the result.
df = pd.read_csv("itemReview_pxquy.csv")
df["clean_text"] = [preprocess_text(raw_review) for raw_review in df["review_text"]]
df.to_csv("itemReview_pxquy_preprocessed.csv", index=False)


In [4]:
from gensim.models import Word2Vec
import pandas as pd, numpy as np

df = pd.read_csv("itemReview_pxquy_preprocessed.csv")
# An empty clean_text cell is read back from CSV as NaN; without fillna("")
# astype(str) would turn it into the literal token "nan" and train on it.
sentences = [s.split() for s in df["clean_text"].fillna("").astype(str).tolist()]
# One token list per review is the training corpus.
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, epochs=10)
w2v.save("w2v_pxquy.model")

def doc_vec(s):
    """Return the mean Word2Vec vector of the whitespace-separated tokens of `s`.

    Tokens that are not in the vocabulary of the module-level `w2v` model are
    skipped. If no token is known, a zero vector is returned whose length is
    read from the model, so it stays correct when `vector_size` is changed.
    """
    known_vectors = [w2v.wv[t] for t in s.split() if t in w2v.wv]
    if not known_vectors:
        return np.zeros(w2v.wv.vector_size)
    return np.mean(known_vectors, axis=0)


In [5]:
from sklearn.preprocessing import MinMaxScaler
# Fit a default MinMaxScaler on the rating column and store the rescaled values.
scaler = MinMaxScaler()
df["rating_norm"] = scaler.fit(df[["rating"]]).transform(df[["rating"]])


In [6]:
import pandas as pd
import numpy as np

# Load the raw reviews and number users / items in order of first appearance.
df = pd.read_csv("itemReview_pxquy.csv")
users = {user_id: pos for pos, user_id in enumerate(df['user_id'].unique())}
items = {item_id: pos for pos, item_id in enumerate(df['item_id'].unique())}

# Dense user x item matrix; 0 stands for "no rating".
# Filled row by row, so a repeated (user, item) pair keeps its last rating.
R = np.zeros((len(users), len(items)))
for row in df.itertuples():
    R[users[row.user_id], items[row.item_id]] = row.rating

# Only cells that hold a rating count as observed.
mask = R > 0
# Replace each unobserved cell with that user's mean observed rating to reduce bias.
mean_user = R.sum(axis=1) / mask.sum(axis=1)
R = np.where(mask, R, mean_user[:, np.newaxis])

# Truncated SVD: keep the first k singular values / vectors.
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
k = 50
U_k, sigma_k, Vt_k = U[:, :k], np.diag(sigma[:k]), Vt[:k, :]
R_hat = (U_k @ sigma_k) @ Vt_k  # matrix of predicted ratings

# Predicted rating of user_0001 for item_0001.
u_idx, i_idx = users['user_0001'], items['item_0001']
predicted_rating = R_hat[u_idx, i_idx]
print("Predicted rating:", predicted_rating)


Predicted rating: 3.4881294559238647


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Sizes handed to the Embedding layer below.
MAX_WORDS = 20000
MAX_LEN = 100
EMBED_DIM = 128

# Embedding -> Conv1D -> global max pooling -> dense head with dropout,
# ending in a 3-unit softmax layer.
model = Sequential([
    Embedding(MAX_WORDS, EMBED_DIM, input_length=MAX_LEN),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])




In [8]:
%matplotlib inline

def plot_history(h, name):
    """Plot training and validation loss / accuracy side by side.

    Parameters
    ----------
    h : object whose `history` mapping holds the keys 'loss', 'val_loss',
        'accuracy' and 'val_accuracy' (presumably the object returned by
        `model.fit` -- confirm against the caller).
    name : str
        Figure title; the figure is also saved as `<name>_history.png`.
    """
    # Imported here because neither this cell nor any cell above it brings
    # `plt` into scope; without it the function raises NameError on a fresh kernel.
    import matplotlib.pyplot as plt

    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
    for ax, metric, title in ((ax_loss, 'loss', 'Loss'), (ax_acc, 'accuracy', 'Accuracy')):
        # Label both curves so training and validation can be told apart.
        ax.plot(h.history[metric], label='train')
        ax.plot(h.history['val_' + metric], label='validation')
        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.legend()
    fig.suptitle(name)
    fig.savefig(name+"_history.png")
    plt.show()
    # Release the figure so repeated calls (one per model) do not pile up in memory.
    plt.close(fig)



In [4]:
# ==========================
# 6.2 - Full Pipeline for pxquy (a ‚Üí h)
# ==========================

# --- IMPORTS & CONFIG ---
import os
import random
import csv
from datetime import datetime, timedelta
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# One shared seed for both the stdlib and the NumPy global generators.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# --------------------------
# a) Problem/Task Statement
# --------------------------
# Goal: build an AI pipeline for an e-commerce system:
# - Generate a user / product / review dataset
# - Preprocess Vietnamese text and represent it with BERT or Word2Vec
# - Normalize ratings
# - Predict product trends with Collaborative Filtering (SVD)
# - Train CNN / RNN / LSTM models to analyse reviews (sentiment)
# - Plot loss/accuracy curves; fight overfitting with Dropout & EarlyStopping
# - Compare the models and pick the best one
# - (i) Web deployment will be done later

# --------------------------
# b) Generate dataset
# --------------------------
OUTFILE = "itemReview_pxquy.csv"
if os.path.exists(OUTFILE):
    # Reuse the file from an earlier run instead of regenerating it.
    print("Dataset ƒë√£ t·ªìn t·∫°i:", OUTFILE)
else:
    print("T·∫°o dataset synthetic:", OUTFILE)
    NUM_USERS = 1000
    NUM_ITEMS = 500
    REVIEWS_PER_USER = 50  # 50,000 rows in total
    users = ["user_{:04d}".format(n) for n in range(1, NUM_USERS+1)]
    items = ["item_{:04d}".format(n) for n in range(1, NUM_ITEMS+1)]
    templates_pos = [
        "S·∫£n ph·∫©m t·ªët, r·∫•t h√†i l√≤ng",
        "Ch·∫•t l∆∞·ª£ng v∆∞·ª£t mong ƒë·ª£i",
        "Mua l·∫°i l·∫ßn n·ªØa, ƒë√°ng ti·ªÅn",
        "Giao h√†ng nhanh, ƒë√≥ng g√≥i c·∫©n th·∫≠n",
        "Thi·∫øt k·∫ø ƒë·∫πp, d√πng ·ªïn"
    ]
    templates_neg = [
        "Ch·∫•t l∆∞·ª£ng k√©m, kh√¥ng nh∆∞ m√¥ t·∫£",
        "Giao h√†ng ch·∫≠m, h·ªèng h√≥c",
        "Kh√¥ng ƒë√°ng ti·ªÅn, th·∫•t v·ªçng",
        "S·∫£n ph·∫©m b·ªã l·ªói, c·∫ßn ƒë·ªïi tr·∫£",
        "D·ªãch v·ª• chƒÉm s√≥c kh√°ch h√†ng t·ªá"
    ]
    templates_neutral = ["B√¨nh th∆∞·ªùng, kh√¥ng qu√° t·ªá"]
    # (exclusive upper bound on p, stars); any p at or above the last bound gets 5 stars.
    rating_cutoffs = ((0.05, 1), (0.12, 2), (0.35, 3), (0.75, 4))
    start = datetime(2023,1,1)
    with open(OUTFILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["user_id","item_id","rating","review_text","date"])
        for u in users:
            for _ in range(REVIEWS_PER_USER):
                # The order of the four random draws below is part of the
                # dataset definition for a given seed; keep it unchanged.
                item = random.choice(items)
                p = random.random()
                rating = next((stars for bound, stars in rating_cutoffs if p < bound), 5)
                if rating >= 4:
                    template_pool = templates_pos
                elif rating <= 2:
                    template_pool = templates_neg
                else:
                    template_pool = templates_neutral
                review = random.choice(template_pool)
                day_offset = random.randint(0,1000)
                date = (start + timedelta(days=day_offset)).strftime("%Y-%m-%d")
                writer.writerow([u, item, rating, review, date])
    print("Saved:", OUTFILE)

# --------------------------
# c) Preprocessing stopwords + tokenize
# --------------------------
# Load the review table and report its (rows, columns) size.
df = pd.read_csv(OUTFILE)
print("K√≠ch th∆∞·ªõc dataset:", (len(df.index), len(df.columns)))

import unicodedata

# Tokens removed from every review.
# NOTE(review): the list contains what look like negation / contrast words;
# dropping them may flip the polarity of a review used for sentiment
# training -- confirm this is intended.
vn_stopwords = set([
    "v√†","l√†","c·ªßa","c√≥","cho","nh·ªØng","ƒë√£","r·∫•t","r·ªìi","v·ªõi","m·ªôt","c√°c","tr√™n","t·ª´",
    "kh√¥ng","nh∆∞ng","n√™n","n·∫øu","v√¨","c≈©ng","c·∫£"
])

def preprocess_text(text):
    """Normalise one review and return its cleaned tokens joined by spaces.

    Steps: convert to `str`, compose to Unicode NFC, lower-case, replace every
    character that is not a letter, digit or whitespace with a space, collapse
    whitespace, split on whitespace and drop `vn_stopwords`.

    The character filter relies on the Unicode-aware `\\w` class instead of a
    hand-written list of Vietnamese letters: a hand-written list that misses
    a letter splits every word containing it into fragments.
    """
    # NFC first, so a base letter followed by combining marks becomes one
    # precomposed letter that `\w` recognises.
    text = unicodedata.normalize("NFC", str(text)).lower()
    # `\w` also matches "_", which must still be removed.
    text = re.sub(r"[^\w\s]|_", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokens = [t for t in text.split() if t not in vn_stopwords]
    return " ".join(tokens)

# Clean every review, then persist the enriched table next to the raw file.
preprocessed_path = "itemReview_pxquy_preprocessed.csv"
df['clean_text'] = df['review_text'].map(preprocess_text)
df.to_csv(preprocessed_path, index=False)
print("ƒê√£ l∆∞u file ti·ªÅn x·ª≠ l√Ω:", preprocessed_path)

# --------------------------
# c) BERT ho·∫∑c Word2Vec
# --------------------------
# Try to load a multilingual BERT model; USE_BERT stays False if any step fails.
USE_BERT = False
try:
    from transformers import AutoTokenizer, AutoModel
    import torch
    model_name = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.eval()
    USE_BERT = True
    print("BERT s·∫µn s√†ng.")
except Exception as bert_error:
    # Deliberately best-effort, but `Exception` (not a bare except) lets
    # KeyboardInterrupt / SystemExit propagate, and the reason is reported
    # instead of being silently discarded.
    print("Kh√¥ng d√πng BERT. S·∫Ω fallback sang Word2Vec.",
          f"({type(bert_error).__name__}: {bert_error})")

if USE_BERT:
    # Cap the sample at the dataset size: sampling more rows than exist
    # (without replacement) raises ValueError.
    sample_texts = df['clean_text'].sample(min(2000, len(df)), random_state=SEED).tolist()
    def bert_encode(texts, batch_size=16):
        """Encode `texts` in batches and return one embedding row per text.

        For each batch, the vector at sequence position 0 of
        `last_hidden_state` is kept. An empty `texts` list yields an array
        with zero rows instead of failing inside `np.vstack`.
        """
        embs = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=128)
            with torch.no_grad():
                out = bert_model(**inputs)
            cls = out.last_hidden_state[:,0,:].cpu().numpy()
            embs.append(cls)
        if not embs:
            return np.empty((0, bert_model.config.hidden_size))
        return np.vstack(embs)
    bert_embs = bert_encode(sample_texts)
    np.save("bert_pxquy_sample.npy", bert_embs)
else:
    from gensim.models import Word2Vec
    sentences = [s.split() for s in df['clean_text'].tolist()]
    w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4, epochs=10)
    w2v.save("w2v_pxquy.model")
    def doc_vector(s):
        """Return the mean Word2Vec vector of the tokens of `s`.

        Tokens outside the vocabulary are skipped. If none is known, a zero
        vector is returned whose length is read from the model.
        """
        toks = s.split()
        vecs = [w2v.wv[t] for t in toks if t in w2v.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.wv.vector_size)
    df['w2v_vec'] = df['clean_text'].apply(lambda s: doc_vector(s).tolist())
    df.to_pickle("itemReview_pxquy_w2v.pkl")
    print("ƒê√£ l∆∞u Word2Vec embeddings.")

# --------------------------
# d) Normalize ratings
# --------------------------
# Fit the scaler on the rating column, then store the rescaled values.
scaler = MinMaxScaler()
scaler.fit(df[['rating']])
df['rating_norm'] = scaler.transform(df[['rating']])

# --------------------------
# e) Collaborative Filtering (SVD - NumPy)
# --------------------------
# Number users / items in order of first appearance.
users = df['user_id'].unique().tolist()
items = df['item_id'].unique().tolist()
u2i = {u:i for i,u in enumerate(users)}
p2i = {p:i for i,p in enumerate(items)}
n_u, n_i = len(users), len(items)

# Dense user x item matrix; 0 stands for "no rating".
# Filled row by row, so a repeated (user, item) pair keeps its last rating.
R = np.zeros((n_u, n_i))
for row in df.itertuples():
    R[u2i[row.user_id], p2i[row.item_id]] = row.rating

# Train/test split over the rated cells: the first 20% after shuffling are held out.
positions = np.argwhere(R > 0)
np.random.shuffle(positions)
test_size = int(len(positions)*0.2)
test_pos = positions[:test_size]
train_pos = positions[test_size:]

# Hide the held-out cells from training.
R_train = R.copy()
R_train[test_pos[:, 0], test_pos[:, 1]] = 0

# Fill missing cells with each user's mean training rating.
observed = R_train > 0
# Fallback for a user with no training rating left: the mean over training
# cells only, so held-out ratings never leak into the model input.
global_mean = R_train[observed].mean()
n_observed = observed.sum(axis=1)
user_mean = np.where(n_observed > 0, R_train.sum(axis=1) / np.maximum(n_observed, 1), global_mean)
R_filled = np.where(R_train == 0, user_mean[:, np.newaxis], R_train)

# Truncated SVD: reconstruct from the first k singular values / vectors.
U, s, Vt = np.linalg.svd(R_filled, full_matrices=False)
k = 50
R_hat = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

from math import sqrt

def rmse(R_true, R_pred, positions):
    """Root-mean-squared error between two matrices at the given cells.

    Parameters
    ----------
    R_true, R_pred : 2-D array-likes indexed as [row, col].
    positions : iterable of (row, col) pairs (a list of tuples or an
        (n, 2) integer array).

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If `positions` is empty, since the error is undefined without samples.
    """
    cells = np.asarray(list(positions))
    if cells.size == 0:
        raise ValueError("rmse() needs at least one (row, col) position")
    cells = cells.reshape(-1, 2).astype(int)
    rows, cols = cells[:, 0], cells[:, 1]
    # Fancy indexing gathers all requested cells at once instead of looping in Python.
    diff = np.asarray(R_true)[rows, cols] - np.asarray(R_pred)[rows, cols]
    return float(np.sqrt(np.mean(diff ** 2)))


# Score the reconstruction on the held-out cells only.
rmse_test = rmse(R, R_hat, test_pos)
print("CF (SVD) RMSE (test): {:.4f}".format(rmse_test))

# --------------------------
# f) CNN / RNN / LSTM tr√™n review
# --------------------------
def label_from_rating(r):
    """Map a rating to a class id: 2 when r >= 4, 1 when r == 3, otherwise 0."""
    if r >= 4:
        return 2
    if r == 3:
        return 1
    return 0

# Attach the class id, then draw a reproducible subsample for training.
df['label'] = df['rating'].map(label_from_rating)
SAMPLE_SIZE = 15000
n_sampled = min(SAMPLE_SIZE, len(df))
df_small = df.sample(n=n_sampled, random_state=SEED).reset_index(drop=True)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Vocabulary cap and padded sequence length.
MAX_WORDS = 20000
MAX_LEN = 100

# Build the vocabulary from the sampled reviews and encode them as padded id sequences.
clean_texts = df_small['clean_text']
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<UNK>")
tokenizer.fit_on_texts(clean_texts)
seq = tokenizer.texts_to_sequences(clean_texts)
X = pad_sequences(seq, maxlen=MAX_LEN)
# One-hot targets over the three classes.
y = to_categorical(df_small['label'], num_classes=3)

# 80/20 train/validation split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=SEED)
X_train, X_val, y_train, y_val = split_parts

# H√†m v·∫Ω history
def plot_history(h, name):
    """Plot training and validation loss / accuracy side by side.

    Parameters
    ----------
    h : object whose `history` mapping holds the keys 'loss', 'val_loss',
        'accuracy' and 'val_accuracy' (presumably the object returned by
        `model.fit` -- confirm against the caller).
    name : str
        Figure title; the figure is also saved as `<name>_pxquy_history.png`.
    """
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
    for ax, metric, title in ((ax_loss, 'loss', 'Loss'), (ax_acc, 'accuracy', 'Accuracy')):
        # Label both curves so training and validation can be told apart.
        ax.plot(h.history[metric], label='train')
        ax.plot(h.history['val_' + metric], label='validation')
        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.legend()
    fig.suptitle(name)
    fig.savefig(f"{name}_pxquy_history.png")
    plt.show()
    # Release the figure so repeated calls (one per model) do not pile up in memory.
    plt.close(fig)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, SimpleRNN, LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Shared settings: embedding width, epoch budget and batch size.
EMBED_DIM, EPOCHS, BATCH = 128, 6, 256
# Stop once val_loss has not improved for 2 epochs and restore the best weights.
es = EarlyStopping(patience=2, monitor='val_loss', restore_best_weights=True)

def build_cnn():
    """Build and compile the 1-D convolutional classifier.

    Layers: Embedding -> Conv1D(128, 5) -> GlobalMaxPooling1D -> Dropout(0.5)
    -> Dense(64) -> Dropout(0.5) -> Dense(3, softmax). Returns the compiled model.
    """
    m = Sequential()
    m.add(Embedding(MAX_WORDS, EMBED_DIM, input_length=MAX_LEN))
    m.add(Conv1D(128, 5, activation='relu'))
    m.add(GlobalMaxPooling1D())
    m.add(Dropout(0.5))
    m.add(Dense(64, activation='relu'))
    m.add(Dropout(0.5))
    m.add(Dense(3, activation='softmax'))
    m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return m

def build_rnn():
    """Build and compile the simple recurrent classifier.

    Layers: Embedding -> SimpleRNN(64) -> Dropout(0.5) -> Dense(3, softmax).
    Returns the compiled model.
    """
    m = Sequential()
    m.add(Embedding(MAX_WORDS, EMBED_DIM, input_length=MAX_LEN))
    m.add(SimpleRNN(64))
    m.add(Dropout(0.5))
    m.add(Dense(3, activation='softmax'))
    m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return m

def build_lstm():
    """Build and compile the bidirectional LSTM classifier.

    Layers: Embedding -> Bidirectional(LSTM(64)) -> Dropout(0.5) -> Dense(64)
    -> Dense(3, softmax). Returns the compiled model.
    """
    m = Sequential()
    m.add(Embedding(MAX_WORDS, EMBED_DIM, input_length=MAX_LEN))
    m.add(Bidirectional(LSTM(64)))
    m.add(Dropout(0.5))
    m.add(Dense(64, activation='relu'))
    m.add(Dense(3, activation='softmax'))
    m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return m



Dataset ƒë√£ t·ªìn t·∫°i: itemReview_pxquy.csv
K√≠ch th∆∞·ªõc dataset: (50000, 5)
ƒê√£ l∆∞u file ti·ªÅn x·ª≠ l√Ω: itemReview_pxquy_preprocessed.csv
BERT s·∫µn s√†ng.
CF (SVD) RMSE (test): 1.0883
