In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

# Read the two input tables from CSV files; the paths are relative, so they
# are resolved against the notebook's current working directory.
interactions = pd.read_csv("interactions.csv")
product = pd.read_csv("product.csv")

customer_id    0
product_id     0
is_positive    0
dtype: int64

In [12]:
# Encode customer_id, product_id and seller_id as contiguous integer indices.
customer_ids = interactions['customer_id'].unique()
product_ids = product['id'].unique()
seller_ids = product['seller_id'].unique()

# Every raw id is mapped to its position in the matching unique-value array.
customer_id_map = dict(zip(customer_ids, range(len(customer_ids))))
product_id_map = dict(zip(product_ids, range(len(product_ids))))
seller_id_map = dict(zip(seller_ids, range(len(seller_ids))))

# Attach the integer indices to both tables.
interactions['customer_idx'] = interactions['customer_id'].map(customer_id_map)
interactions['product_idx'] = interactions['product_id'].map(product_id_map)
product['product_idx'] = product['id'].map(product_id_map)
product['seller_idx'] = product['seller_id'].map(seller_id_map)

# Interactions whose product_id has no entry in the product table end up with
# a NaN product_idx; discard those rows, then restore an integer dtype.
interactions = (
    interactions
    .loc[interactions['product_idx'].notna()]
    .reset_index(drop=True)
)
interactions['product_idx'] = interactions['product_idx'].astype(int)

In [13]:
# Standardise the numeric product columns (price, rating_average).
scaler = StandardScaler()
product_features = scaler.fit_transform(product[['price', 'rating_average']])

# Lookup table: product_idx -> scaled [price, rating_average] vector.
product_features_dict = dict(zip(product['product_idx'], product_features))

# Per-interaction arrays used to build the training samples.
customer_indices = interactions['customer_idx'].values
product_indices = interactions['product_idx'].values
labels = interactions['is_positive'].values

# Scaled feature vector of the product involved in each interaction.
product_features_train = np.array([product_features_dict[pid] for pid in product_indices])

# Lookup table product_idx -> seller_idx, then the seller of each interaction's product.
seller_indices = dict(zip(product['product_idx'], product['seller_idx']))
seller_indices_train = np.array([seller_indices[pid] for pid in product_indices])

In [15]:
# Split the samples into training and test sets (80/20, fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(
    list(zip(customer_indices, product_indices, seller_indices_train, product_features_train)),
    labels,
    test_size=0.2,
    random_state=42
)


def _samples_to_tensors(samples):
    """Convert (customer_idx, product_idx, seller_idx, features) tuples to tensors.

    Each field is first gathered into a single NumPy array: building a tensor
    straight from a Python list of small ndarrays is a slow path that PyTorch
    warns about, and the original code also walked every list four times.

    Args:
        samples: non-empty sequence of 4-tuples as produced by the split above.

    Returns:
        Tuple ``(customer, product, seller, features)``: three int64 tensors
        with one entry per sample and one float32 tensor with one feature row
        per sample.
    """
    customers, products, sellers, features = zip(*samples)
    return (
        torch.as_tensor(np.asarray(customers), dtype=torch.long),
        torch.as_tensor(np.asarray(products), dtype=torch.long),
        torch.as_tensor(np.asarray(sellers), dtype=torch.long),
        torch.as_tensor(np.asarray(features), dtype=torch.float32),
    )


# Convert both splits to tensors.
customer_train, product_train, seller_train, features_train = _samples_to_tensors(X_train)
y_train = torch.FloatTensor(y_train)

customer_test, product_test, seller_test, features_test = _samples_to_tensors(X_test)
y_test = torch.FloatTensor(y_test)

In [16]:
# NeuMF model definition
class NeuMF(nn.Module):
    """Recommender that fuses a GMF branch with an MLP branch.

    The GMF branch multiplies customer and product embeddings element-wise.
    The MLP branch concatenates customer, product and seller embeddings with
    the dense product features and passes them through a stack of
    Linear + ReLU layers. Both branch outputs are concatenated and mapped to a
    single sigmoid probability per sample.

    Args:
        num_customers: size of the customer embedding tables.
        num_products: size of the product embedding tables.
        num_sellers: size of the seller embedding table.
        num_features: width of the dense feature vector given to ``forward``.
        embed_dim: dimension of every embedding.
        mlp_layers: output widths of the successive MLP layers. May be empty,
            in which case the MLP branch passes its input through unchanged.
    """

    def __init__(self, num_customers, num_products, num_sellers, num_features=2, embed_dim=32, mlp_layers=(64, 32, 16)):
        super().__init__()
        # Embeddings for the GMF branch
        self.customer_embed_gmf = nn.Embedding(num_customers, embed_dim)
        self.product_embed_gmf = nn.Embedding(num_products, embed_dim)

        # Embeddings for the MLP branch
        self.customer_embed_mlp = nn.Embedding(num_customers, embed_dim)
        self.product_embed_mlp = nn.Embedding(num_products, embed_dim)
        self.seller_embed_mlp = nn.Embedding(num_sellers, embed_dim)

        # MLP layers; input = customer + product + seller embeddings + dense features
        mlp_input_dim = embed_dim * 3 + num_features
        mlp_layers_list = []
        in_dim = mlp_input_dim
        for out_dim in mlp_layers:
            mlp_layers_list.append(nn.Linear(in_dim, out_dim))
            mlp_layers_list.append(nn.ReLU())
            in_dim = out_dim
        self.mlp = nn.Sequential(*mlp_layers_list)

        # Fusion layer over [GMF output, MLP output]. `in_dim` is the width of
        # the last MLP layer, or of the MLP input when `mlp_layers` is empty.
        self.final_layer = nn.Linear(embed_dim + in_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, customer, product, seller, features):
        """Return one probability per sample as a 1-D tensor of length ``batch``.

        Args:
            customer, product, seller: integer index tensors of shape (batch,).
            features: float tensor of shape (batch, num_features).
        """
        # GMF branch: element-wise product of the two embeddings
        customer_gmf = self.customer_embed_gmf(customer)
        product_gmf = self.product_embed_gmf(product)
        gmf_output = customer_gmf * product_gmf

        # MLP branch
        customer_mlp = self.customer_embed_mlp(customer)
        product_mlp = self.product_embed_mlp(product)
        seller_mlp = self.seller_embed_mlp(seller)
        mlp_input = torch.cat([customer_mlp, product_mlp, seller_mlp, features], dim=1)
        mlp_output = self.mlp(mlp_input)

        # Fuse the two branches
        combined = torch.cat([gmf_output, mlp_output], dim=1)
        output = self.final_layer(combined)
        # Drop only the trailing size-1 dimension. A bare squeeze() would also
        # remove the batch dimension when batch == 1 and yield a 0-d tensor,
        # which BCELoss rejects against a (1,)-shaped target.
        return self.sigmoid(output).squeeze(-1)

In [None]:
# Seed PyTorch's RNG so that weight initialisation is repeatable across runs.
torch.manual_seed(42)

# Instantiate the model
num_customers = len(customer_ids)
num_products = len(product_ids)
num_sellers = len(seller_ids)
model = NeuMF(num_customers, num_products, num_sellers)

# Loss function and optimiser
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model with fixed-order mini-batches
num_epochs = 10
batch_size = 256
num_train = len(customer_train)
model.train()
for epoch in range(num_epochs):
    epoch_loss_sum = 0.0  # sum of per-sample losses over the whole epoch
    for i in range(0, num_train, batch_size):
        batch_customer = customer_train[i:i+batch_size]
        batch_product = product_train[i:i+batch_size]
        batch_seller = seller_train[i:i+batch_size]
        batch_features = features_train[i:i+batch_size]
        batch_labels = y_train[i:i+batch_size]

        # Forward. reshape(-1) keeps the predictions 1-D even if the model
        # returns a 0-d tensor for a final batch that holds a single sample,
        # so the shapes always match the labels for BCELoss.
        outputs = model(batch_customer, batch_product, batch_seller, batch_features).reshape(-1)
        loss = criterion(outputs, batch_labels)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # BCELoss returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average.
        epoch_loss_sum += loss.item() * batch_labels.size(0)

    # Report the mean loss over the epoch rather than only the last mini-batch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss_sum / max(num_train, 1):.4f}")

# Evaluate on the held-out test set
model.eval()
with torch.no_grad():
    test_outputs = model(customer_test, product_test, seller_test, features_test).reshape(-1)
    test_loss = criterion(test_outputs, y_test)
    # Threshold the probabilities at 0.5 to obtain hard predictions
    test_predictions = (test_outputs >= 0.5).float()
    accuracy = (test_predictions == y_test).float().mean()
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {accuracy.item():.4f}")

# Save the trained weights
torch.save(model.state_dict(), "neumf_model.pth")
print("Mô hình đã được lưu vào 'neumf_model.pth'")