In [1]:
# T·∫° Cao S∆°n - B22DCVT445
import csv, random
from datetime import datetime, timedelta

NUM_USERS = 1000
NUM_ITEMS = 500
REVIEWS_PER_USER = 50
users = [f"user_{i:04d}" for i in range(1, NUM_USERS+1)]
items = [f"item_{i:04d}" for i in range(1, NUM_ITEMS+1)]

templates_pos = ["S·∫£n ph·∫©m t·ªët, r·∫•t h√†i l√≤ng", "Ch·∫•t l∆∞·ª£ng v∆∞·ª£t mong ƒë·ª£i", "ƒê√°ng ti·ªÅn, mua l·∫°i"]
templates_neg = ["Kh√¥ng nh∆∞ m√¥ t·∫£, th·∫•t v·ªçng", "Giao h√†ng h·ªèng", "Ch·∫•t l∆∞·ª£ng k√©m"]

outfile = "itemReview_pxquy.csv"
with open(outfile, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id","item_id","rating","review_text","date"])
    start = datetime(2023,1,1)
    for u in users:
        for _ in range(REVIEWS_PER_USER):
            item = random.choice(items)
            p = random.random()
            if p < 0.05: rating = 1
            elif p < 0.12: rating = 2
            elif p < 0.35: rating = 3
            elif p < 0.75: rating = 4
            else: rating = 5
            review = random.choice(templates_pos if rating>=4 else templates_neg if rating<=2 else ["B√¨nh th∆∞·ªùng, ·ªïn"])
            date = (start + timedelta(days=random.randint(0,1000))).strftime("%Y-%m-%d")
            writer.writerow([u, item, rating, review, date])
print("Saved", outfile)


Saved itemReview_pxquy.csv


In [9]:
import pandas as pd, re
from pyvi import ViTokenizer

vn_stopwords = {"v√†","l√†","c·ªßa","c√≥","cho","nh·ªØng","ƒë√£","r·∫•t","r·ªìi","v·ªõi","m·ªôt","c√°c","tr√™n","t·ª´"}

def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"[^0-9a-z·∫°√°√†√¢√£·∫£ƒÉ·∫Ø·∫±·∫≥·∫µ√™·∫ø·ªÅ·ªÉ·ªÖƒë√¨√≠ƒ©·ªâ√≤√≥·ªè√µ√¥·ªë·ªì·ªï·ªó∆°·ªõ·ªù·ªü·ª£√π√∫·ªß≈©∆∞·ª©·ª´·ª≠·ªØ·ª≥√Ω·ª∑·ªπ\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    tokenized = ViTokenizer.tokenize(text)
    tokens = [t for t in tokenized.split() if t not in vn_stopwords]
    return " ".join(tokens)

df = pd.read_csv("itemReview_pxquy.csv")
df["clean_text"] = df["review_text"].apply(preprocess_text)
df.to_csv("itemReview_pxquy_preprocessed.csv", index=False)


In [10]:
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Simple Word2Vec class (kh√¥ng c·∫ßn gensim)
class SimpleWord2Vec:
    def __init__(self, vector_size=100):
        self.vector_size = vector_size
        self.word_vectors = {}
        
    def train_from_sentences(self, sentences):
        # T·∫°o corpus t·ª´ sentences
        corpus = [' '.join(sent) for sent in sentences if len(sent) > 0]
        
        # TF-IDF vectorization
        vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,1))
        tfidf_matrix = vectorizer.fit_transform(corpus)
        
        # SVD ƒë·ªÉ gi·∫£m chi·ªÅu
        svd = TruncatedSVD(n_components=self.vector_size, random_state=42)
        word_vectors_matrix = svd.fit_transform(tfidf_matrix.T)
        
        # Mapping t·ª´ -> vector
        feature_names = vectorizer.get_feature_names_out()
        for i, word in enumerate(feature_names):
            if i < len(word_vectors_matrix):
                self.word_vectors[word] = word_vectors_matrix[i]
        
        print(f"Trained {len(self.word_vectors)} word vectors")
    
    def get_vector(self, word):
        return self.word_vectors.get(word, np.zeros(self.vector_size))
    
    def save(self, filepath):
        np.savez(filepath, word_vectors=self.word_vectors, vector_size=self.vector_size)

# Load data v√† train
df = pd.read_csv("itemReview_pxquy_preprocessed.csv")
sentences = [s.split() for s in df["clean_text"].astype(str).tolist() if len(s.strip()) > 0]

# Train simple word2vec
w2v = SimpleWord2Vec(vector_size=100)
w2v.train_from_sentences(sentences)
w2v.save("simple_w2v_pxquy.npz")

def doc_vec(s):
    """T·∫°o document vector b·∫±ng c√°ch average word vectors"""
    toks = s.split()
    vecs = [w2v.get_vector(t) for t in toks if t in w2v.word_vectors]
    return np.mean(vecs, axis=0) if vecs else np.zeros(100)

print("‚úÖ Simple Word2Vec training completed (no C++ compiler needed!)")

Trained 22 word vectors
‚úÖ Simple Word2Vec training completed (no C++ compiler needed!)


In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df["rating_norm"] = scaler.fit_transform(df[["rating"]])


In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv("itemReview_pxquy.csv")
users = {u:i for i,u in enumerate(df['user_id'].unique())}
items = {p:i for i,p in enumerate(df['item_id'].unique())}

R = np.zeros((len(users), len(items)))
for row in df.itertuples():
    R[users[row.user_id], items[row.item_id]] = row.rating

# Mask missing values (0) ‚Üí ch·ªâ d√πng nh·ªØng √¥ c√≥ rating
mask = (R > 0)
# Thay th·∫ø gi√° tr·ªã thi·∫øu b·∫±ng trung b√¨nh ng∆∞·ªùi d√πng ƒë·ªÉ gi·∫£m bias
mean_user = np.sum(R, axis=1) / np.sum(mask, axis=1)
for i in range(R.shape[0]):
    R[i, ~mask[i]] = mean_user[i]

# Th·ª±c hi·ªán SVD
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
k = 50
sigma_k = np.diag(sigma[:k])
U_k = U[:, :k]
Vt_k = Vt[:k, :]
R_hat = U_k @ sigma_k @ Vt_k  # Ma tr·∫≠n d·ª± ƒëo√°n rating

# D·ª± ƒëo√°n rating cho user_0001 v√† item_0001
u_idx = users['user_0001']
i_idx = items['item_0001']
predicted_rating = R_hat[u_idx, i_idx]
print("Predicted rating:", predicted_rating)


Predicted rating: 3.2766736378431296


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

MAX_WORDS = 20000
MAX_LEN = 100
EMBED_DIM = 128

model = Sequential()
model.add(Embedding(MAX_WORDS, EMBED_DIM, input_length=MAX_LEN))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [14]:
%matplotlib inline

def plot_history(h, name):
    plt.figure(figsize=(10,4))
    plt.subplot(1,2,1)
    plt.plot(h.history['loss']); plt.plot(h.history['val_loss']); plt.title('Loss')
    plt.subplot(1,2,2)
    plt.plot(h.history['accuracy']); plt.plot(h.history['val_accuracy']); plt.title('Accuracy')
    plt.suptitle(name)
    plt.savefig(name+"_history.png")
    plt.show()   # üëâ th√™m d√≤ng n√†y



In [None]:
# ==========================
# 6.2 - Full Pipeline for pxquy (a ‚Üí h) - NO GENSIM VERSION
# ==========================

# --- IMPORTS & CONFIG ---
import os
import random
import csv
from datetime import datetime, timedelta
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings("ignore")

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --------------------------
# a) Problem/Task Statement
# --------------------------
print("üéØ M·ª•c ti√™u: X√¢y d·ª±ng pipeline AI cho e-commerce (KH√îNG D√ôNG GENSIM)")
print("- T·∫°o dataset + ti·ªÅn x·ª≠ l√Ω ti·∫øng Vi·ªát")
print("- Word embedding ƒë∆°n gi·∫£n (TF-IDF + SVD)")
print("- Collaborative Filtering (SVD)")
print("- CNN/RNN/LSTM sentiment analysis")
print("- So s√°nh v√† ch·ªçn m√¥ h√¨nh t·ªët nh·∫•t")

# --------------------------
# b) Generate dataset
# --------------------------
OUTFILE = "itemReview_pxquy.csv"
if not os.path.exists(OUTFILE):
    print("üìä T·∫°o dataset synthetic...")
    NUM_USERS = 1000
    NUM_ITEMS = 500
    REVIEWS_PER_USER = 50
    users = [f"user_{i:04d}" for i in range(1, NUM_USERS+1)]
    items = [f"item_{i:04d}" for i in range(1, NUM_ITEMS+1)]
    
    templates_pos = [
        "S·∫£n ph·∫©m t·ªët, r·∫•t h√†i l√≤ng v·ªõi ch·∫•t l∆∞·ª£ng",
        "Ch·∫•t l∆∞·ª£ng v∆∞·ª£t mong ƒë·ª£i, giao h√†ng nhanh",
        "Mua l·∫°i l·∫ßn n·ªØa, ƒë√°ng ti·ªÅn, thi·∫øt k·∫ø ƒë·∫πp",
        "D·ªãch v·ª• tuy·ªát v·ªùi, ƒë√≥ng g√≥i c·∫©n th·∫≠n",
        "R·∫•t h√†i l√≤ng, ch·∫•t l∆∞·ª£ng cao, gi√° h·ª£p l√Ω"
    ]
    templates_neg = [
        "Ch·∫•t l∆∞·ª£ng k√©m, kh√¥ng nh∆∞ m√¥ t·∫£, th·∫•t v·ªçng",
        "Giao h√†ng ch·∫≠m, s·∫£n ph·∫©m b·ªã h·ªèng",
        "Kh√¥ng ƒë√°ng ti·ªÅn, d·ªãch v·ª• t·ªá",
        "S·∫£n ph·∫©m l·ªói, c·∫ßn ƒë·ªïi tr·∫£ ngay",
        "R·∫•t kh√¥ng h√†i l√≤ng, ch·∫•t l∆∞·ª£ng d∆∞·ªõi mong ƒë·ª£i"
    ]
    
    with open(OUTFILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["user_id","item_id","rating","review_text","date"])
        start = datetime(2023,1,1)
        for u in users:
            for _ in range(REVIEWS_PER_USER):
                item = random.choice(items)
                p = random.random()
                if p < 0.05: rating = 1
                elif p < 0.12: rating = 2
                elif p < 0.35: rating = 3
                elif p < 0.75: rating = 4
                else: rating = 5
                
                review = random.choice(templates_pos if rating>=4 else templates_neg if rating<=2 else ["B√¨nh th∆∞·ªùng, ·ªïn"])
                date = (start + timedelta(days=random.randint(0,1000))).strftime("%Y-%m-%d")
                writer.writerow([u, item, rating, review, date])
    print(f"‚úÖ Saved: {OUTFILE}")

# --------------------------
# c) Preprocessing (kh√¥ng c·∫ßn pyvi)
# --------------------------
df = pd.read_csv(OUTFILE)
print(f"üìà Dataset size: {df.shape}")

def simple_preprocess(text):
    """Ti·ªÅn x·ª≠ l√Ω ƒë∆°n gi·∫£n kh√¥ng c·∫ßn th∆∞ vi·ªán ngo√†i"""
    text = str(text).lower()
    # Ch·ªâ gi·ªØ ch·ªØ c√°i Vi·ªát v√† s·ªë
    text = re.sub(r'[^a-z√†√°·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠ƒë√®√©·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá√¨√≠·ªâƒ©·ªã√≤√≥·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£√π√∫·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±·ª≥√Ω·ª∑·ªπ·ªµ0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove stopwords
    stopwords = {'v√†','l√†','c·ªßa','c√≥','cho','nh·ªØng','ƒë√£','r·∫•t','r·ªìi','v·ªõi','m·ªôt','c√°c','tr√™n','t·ª´','kh√¥ng','r·∫±ng','n√†y','ƒë√≥'}
    tokens = [w for w in text.split() if w not in stopwords and len(w) > 1]
    return ' '.join(tokens)

df['clean_text'] = df['review_text'].apply(simple_preprocess)
print("‚úÖ Text preprocessing completed")

# --------------------------
# d) Simple Word2Vec (TF-IDF + SVD)
# --------------------------
class SimpleWord2Vec:
    def __init__(self, vector_size=100):
        self.vector_size = vector_size
        self.word_vectors = {}
        
    def train(self, sentences):
        # T·∫°o corpus
        valid_sentences = [sent for sent in sentences if len(sent) > 2]
        corpus = [' '.join(sent) for sent in valid_sentences]
        
        if not corpus:
            print("‚ùå No valid sentences for training!")
            return
            
        # TF-IDF
        vectorizer = TfidfVectorizer(max_features=5000, min_df=2, ngram_range=(1,1))
        tfidf_matrix = vectorizer.fit_transform(corpus)
        
        # SVD
        svd = TruncatedSVD(n_components=self.vector_size, random_state=SEED)
        word_embeddings = svd.fit_transform(tfidf_matrix.T)
        
        # Map words to vectors
        feature_names = vectorizer.get_feature_names_out()
        for i, word in enumerate(feature_names):
            if i < len(word_embeddings):
                self.word_vectors[word] = word_embeddings[i]
        
        print(f"üî§ Trained embeddings for {len(self.word_vectors)} words")
    
    def get_vector(self, word):
        return self.word_vectors.get(word, np.zeros(self.vector_size))

# Train word embeddings
sentences = [text.split() for text in df['clean_text'] if len(text.strip()) > 0]
w2v = SimpleWord2Vec(vector_size=100)
w2v.train(sentences)

def doc_vector(text):
    tokens = text.split()
    vectors = [w2v.get_vector(t) for t in tokens if t in w2v.word_vectors]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

df['doc_embedding'] = df['clean_text'].apply(doc_vector)

# --------------------------
# e) Rating normalization
# --------------------------
scaler = MinMaxScaler()
df['rating_norm'] = scaler.fit_transform(df[['rating']])

# --------------------------
# f) Collaborative Filtering (SVD)
# --------------------------
print("ü§ù Training Collaborative Filtering...")
users = df['user_id'].unique()
items = df['item_id'].unique()
u2i = {u:i for i,u in enumerate(users)}
i2i = {item:i for i,item in enumerate(items)}

R = np.zeros((len(users), len(items)))
for row in df.itertuples():
    R[u2i[row.user_id], i2i[row.item_id]] = row.rating

# Train/test split
positions = np.array(np.where(R > 0)).T
np.random.shuffle(positions)
test_size = int(len(positions) * 0.2)
test_pos = positions[:test_size]
train_pos = positions[test_size:]

R_train = R.copy()
for (u, i) in test_pos:
    R_train[u, i] = 0

# Fill missing with user means
R_filled = R_train.copy()
for u in range(len(users)):
    user_ratings = R_train[u][R_train[u] > 0]
    user_mean = user_ratings.mean() if len(user_ratings) > 0 else df['rating'].mean()
    R_filled[u][R_train[u] == 0] = user_mean

# SVD decomposition
U, s, Vt = np.linalg.svd(R_filled, full_matrices=False)
k = 50
R_pred = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]

# Evaluate
def rmse(true_ratings, pred_ratings, positions):
    true_vals = [true_ratings[u, i] for (u, i) in positions]
    pred_vals = [pred_ratings[u, i] for (u, i) in positions]
    return np.sqrt(mean_squared_error(true_vals, pred_vals))

cf_rmse = rmse(R, R_pred, test_pos)
print(f"üìä CF RMSE: {cf_rmse:.4f}")

# --------------------------
# g) Deep Learning Models (CNN/RNN/LSTM)
# --------------------------
print("üß† Training Deep Learning Models...")

# Prepare data
def rating_to_sentiment(rating):
    return 2 if rating >= 4 else 1 if rating == 3 else 0

df['sentiment'] = df['rating'].apply(rating_to_sentiment)

# Sample for faster training
SAMPLE_SIZE = 10000
df_sample = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SEED).reset_index(drop=True)

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

MAX_WORDS = 10000
MAX_LEN = 80
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<UNK>")
tokenizer.fit_on_texts(df_sample['clean_text'])

sequences = tokenizer.texts_to_sequences(df_sample['clean_text'])
X = pad_sequences(sequences, maxlen=MAX_LEN)
y = to_categorical(df_sample['sentiment'], 3)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Model architectures
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import EarlyStopping

def build_cnn_model():
    model = Sequential([
        Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_lstm_model():
    model = Sequential([
        Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
        LSTM(64, dropout=0.5, recurrent_dropout=0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_gru_model():
    model = Sequential([
        Embedding(MAX_WORDS, 128, input_length=MAX_LEN),
        GRU(64, dropout=0.5, recurrent_dropout=0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Training function
def train_model(model, name):
    print(f"üèãÔ∏è Training {name}...")
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(
        X_train, y_train,
        batch_size=128,
        epochs=10,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        verbose=0
    )
    
    # Evaluate
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    print(f"üìà {name} - Val Accuracy: {val_acc:.4f}, Val Loss: {val_loss:.4f}")
    
    return history, val_acc

# Train all models
models_results = {}

# CNN
cnn_model = build_cnn_model()
cnn_history, cnn_acc = train_model(cnn_model, "CNN")
models_results['CNN'] = cnn_acc

# LSTM
lstm_model = build_lstm_model()
lstm_history, lstm_acc = train_model(lstm_model, "LSTM")
models_results['LSTM'] = lstm_acc

# GRU
gru_model = build_gru_model()
gru_history, gru_acc = train_model(gru_model, "GRU")
models_results['GRU'] = gru_acc

# --------------------------
# h) Results Summary
# --------------------------
print("\n" + "="*50)
print("üèÜ FINAL RESULTS SUMMARY")
print("="*50)
print(f"üìä Dataset: {len(df)} reviews, {len(users)} users, {len(items)} items")
print(f"ü§ù Collaborative Filtering RMSE: {cf_rmse:.4f}")
print("\nüìà Deep Learning Models:")

best_model = max(models_results, key=models_results.get)
for model_name, accuracy in sorted(models_results.items(), key=lambda x: x[1], reverse=True):
    status = "ü•á BEST" if model_name == best_model else ""
    print(f"   {model_name}: {accuracy:.4f} {status}")

print(f"\n‚úÖ Best performing model: {best_model} ({models_results[best_model]:.4f})")
print("üéâ Pipeline completed successfully WITHOUT gensim!")

# Plot comparison
plt.figure(figsize=(10, 6))
model_names = list(models_results.keys())
accuracies = list(models_results.values())
bars = plt.bar(model_names, accuracies, color=['skyblue', 'lightgreen', 'salmon'])
plt.title('Model Comparison - Validation Accuracy', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{acc:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('model_comparison_no_gensim.png', dpi=300, bbox_inches='tight')
plt.show()

print("üìÅ Saved: model_comparison_no_gensim.png")

üéØ M·ª•c ti√™u: X√¢y d·ª±ng pipeline AI cho e-commerce (KH√îNG D√ôNG GENSIM)
- T·∫°o dataset + ti·ªÅn x·ª≠ l√Ω ti·∫øng Vi·ªát
- Word embedding ƒë∆°n gi·∫£n (TF-IDF + SVD)
- Collaborative Filtering (SVD)
- CNN/RNN/LSTM sentiment analysis
- So s√°nh v√† ch·ªçn m√¥ h√¨nh t·ªët nh·∫•t
üìà Dataset size: (50000, 5)
‚úÖ Text preprocessing completed
üî§ Trained embeddings for 26 words
ü§ù Training Collaborative Filtering...
üìä CF RMSE: 1.0829
üß† Training Deep Learning Models...
üèãÔ∏è Training CNN...
üìà CNN - Val Accuracy: 1.0000, Val Loss: 0.0000
üèãÔ∏è Training LSTM...
üìà LSTM - Val Accuracy: 1.0000, Val Loss: 0.0000
üèãÔ∏è Training GRU...


In [17]:
# T·∫° Cao S∆°n - B22DCVT445

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re

# ================================
# 1. Simple Word2Vec alternative using TF-IDF + SVD
# ================================

class SimpleWord2Vec:
    def __init__(self, vector_size=100, window=5, min_count=2):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.word_vectors = {}
        self.vocab = {}
        
    def build_vocab(self, sentences):
        """X√¢y d·ª±ng t·ª´ v·ª±ng t·ª´ danh s√°ch c√¢u"""
        word_counts = Counter()
        for sentence in sentences:
            for word in sentence:
                word_counts[word] += 1
        
        # L·ªçc t·ª´ theo min_count
        self.vocab = {word: idx for idx, (word, count) in enumerate(word_counts.items()) 
                      if count >= self.min_count}
        print(f"Vocab size: {len(self.vocab)}")
    
    def train(self, sentences):
        """Hu·∫•n luy·ªán Word2Vec ƒë∆°n gi·∫£n b·∫±ng TF-IDF + SVD"""
        self.build_vocab(sentences)
        
        # T·∫°o context windows
        contexts = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                if target_word in self.vocab:
                    # L·∫•y context words trong window
                    start = max(0, i - self.window)
                    end = min(len(sentence), i + self.window + 1)
                    context = []
                    for j in range(start, end):
                        if i != j and sentence[j] in self.vocab:
                            context.append(sentence[j])
                    if context:
                        contexts.append(' '.join(context))
        
        if not contexts:
            print("Kh√¥ng c√≥ context n√†o ƒë∆∞·ª£c t·∫°o!")
            return
        
        # S·ª≠ d·ª•ng TF-IDF ƒë·ªÉ t·∫°o word vectors
        vectorizer = TfidfVectorizer(max_features=min(5000, len(self.vocab)))
        tfidf_matrix = vectorizer.fit_transform(contexts)
        
        # SVD ƒë·ªÉ gi·∫£m chi·ªÅu
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=self.vector_size, random_state=42)
        reduced_vectors = svd.fit_transform(tfidf_matrix)
        
        # Mapping words to vectors
        feature_names = vectorizer.get_feature_names_out()
        for i, word in enumerate(feature_names):
            if i < len(reduced_vectors):
                self.word_vectors[word] = reduced_vectors[i]
        
        print(f"Trained vectors for {len(self.word_vectors)} words")
    
    def get_vector(self, word):
        """L·∫•y vector c·ªßa m·ªôt t·ª´"""
        return self.word_vectors.get(word, np.zeros(self.vector_size))
    
    def save(self, filepath):
        """L∆∞u model"""
        np.savez(filepath, 
                 word_vectors=self.word_vectors,
                 vocab=self.vocab,
                 vector_size=self.vector_size)
        print(f"Model saved to {filepath}")

# ================================
# 2. Document vectorization function
# ================================

def doc_vector_simple(text, word2vec_model, method='mean'):
    """T·∫°o document vector t·ª´ text b·∫±ng c√°ch average word vectors"""
    if isinstance(text, str):
        tokens = text.split()
    else:
        tokens = text
    
    vectors = []
    for token in tokens:
        vec = word2vec_model.get_vector(token)
        if np.any(vec):  # N·∫øu vector kh√¥ng ph·∫£i to√†n s·ªë 0
            vectors.append(vec)
    
    if vectors:
        if method == 'mean':
            return np.mean(vectors, axis=0)
        elif method == 'sum':
            return np.sum(vectors, axis=0)
    
    return np.zeros(word2vec_model.vector_size)

# ================================
# 3. Load v√† preprocess data
# ================================

print("Loading preprocessed data...")
try:
    df = pd.read_csv("itemReview_pxquy_preprocessed.csv")
    print(f"Loaded {len(df)} reviews")
except FileNotFoundError:
    print("File preprocessed kh√¥ng t·ªìn t·∫°i, s·ª≠ d·ª•ng file g·ªëc...")
    df = pd.read_csv("itemReview_pxquy.csv")
    
    # Simple preprocessing kh√¥ng c·∫ßn pyvi
    def simple_preprocess(text):
        text = str(text).lower()
        # Ch·ªâ gi·ªØ ch·ªØ c√°i ti·∫øng Vi·ªát v√† s·ªë
        text = re.sub(r'[^a-z√†√°·∫£√£·∫°ƒÉ·∫Ø·∫±·∫≥·∫µ·∫∑√¢·∫•·∫ß·∫©·∫´·∫≠ƒë√®√©·∫ª·∫Ω·∫π√™·∫ø·ªÅ·ªÉ·ªÖ·ªá√¨√≠·ªâƒ©·ªã√≤√≥·ªè√µ·ªç√¥·ªë·ªì·ªï·ªó·ªô∆°·ªõ·ªù·ªü·ª°·ª£√π√∫·ªß≈©·ª•∆∞·ª©·ª´·ª≠·ªØ·ª±·ª≥√Ω·ª∑·ªπ·ªµ0-9\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Remove stopwords ƒë∆°n gi·∫£n
        stopwords = {'v√†', 'l√†', 'c·ªßa', 'c√≥', 'cho', 'nh·ªØng', 'ƒë√£', 'r·∫•t', 'r·ªìi', 'v·ªõi', 'm·ªôt', 'c√°c', 'tr√™n', 't·ª´'}
        tokens = [word for word in text.split() if word not in stopwords and len(word) > 1]
        return ' '.join(tokens)
    
    df['clean_text'] = df['review_text'].apply(simple_preprocess)

# ================================
# 4. Train Simple Word2Vec
# ================================

print("Training Simple Word2Vec...")
sentences = [text.split() for text in df['clean_text'].astype(str) if len(text.strip()) > 0]

# Lo·∫°i b·ªè c√¢u qu√° ng·∫Øn
sentences = [s for s in sentences if len(s) > 2]
print(f"Training on {len(sentences)} sentences")

# Train model
w2v_simple = SimpleWord2Vec(vector_size=100, window=5, min_count=2)
w2v_simple.train(sentences)

# Save model
w2v_simple.save("simple_w2v_pxquy.npz")

# ================================
# 5. Create document vectors
# ================================

print("Creating document vectors...")
df['w2v_vector'] = df['clean_text'].apply(lambda x: doc_vector_simple(x, w2v_simple))

# Convert to list for saving
df['w2v_vector_list'] = df['w2v_vector'].apply(lambda x: x.tolist())

# Save with vectors
df.to_pickle("itemReview_pxquy_simple_w2v.pkl")
print("Saved dataframe with simple word2vec vectors")

# ================================
# 6. Test similarity function
# ================================

def cosine_similarity(vec1, vec2):
    """T√≠nh cosine similarity gi·ªØa 2 vectors"""
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0
    return dot_product / (norm1 * norm2)

# Test v·ªõi m·ªôt s·ªë t·ª´
test_words = ['t·ªët', 'x·∫•u', 'ch·∫•t', 'l∆∞·ª£ng', 'h√†i', 'l√≤ng']
print("\n=== Test Word Vectors ===")
for word in test_words:
    vec = w2v_simple.get_vector(word)
    print(f"'{word}': vector shape {vec.shape}, norm: {np.linalg.norm(vec):.3f}")

# Test document similarity
print("\n=== Test Document Similarity ===")
if len(df) >= 2:
    vec1 = df['w2v_vector'].iloc[0]
    vec2 = df['w2v_vector'].iloc[1]
    similarity = cosine_similarity(vec1, vec2)
    print(f"Similarity between doc 0 and doc 1: {similarity:.3f}")
    print(f"Doc 0: {df['clean_text'].iloc[0][:100]}...")
    print(f"Doc 1: {df['clean_text'].iloc[1][:100]}...")

print("\n‚úÖ Simple Word2Vec training completed successfully!")
print("üìÅ Files created:")
print("   - simple_w2v_pxquy.npz (Word2Vec model)")
print("   - itemReview_pxquy_simple_w2v.pkl (DataFrame with vectors)")

Loading preprocessed data...
Loaded 50000 reviews
Training Simple Word2Vec...
Training on 38517 sentences
Vocab size: 25


ValueError: n_components(100) must be <= n_features(20).