In [7]:
import pandas as pd

# Paths to the three raw data files.
movies_file = "Dataset/movies.dat"
ratings_file = "Dataset/ratings.dat"
users_file = "Dataset/users.dat"

# Parser options shared by all three files:
#   * the "::" separator is longer than one character, so the python engine is required;
#   * the files carry no header row;
#   * latin-1 is used to avoid UnicodeDecodeError while decoding.
dat_read_options = dict(sep="::", header=None, engine="python", encoding="latin-1")

# Load each table with explicit column names.
movies = pd.read_csv(movies_file, names=["movie_id", "title", "genres"], **dat_read_options)
ratings = pd.read_csv(ratings_file, names=["user_id", "movie_id", "rating", "timestamp"],
                      **dat_read_options)
users = pd.read_csv(users_file, names=["user_id", "gender", "age", "occupation", "zip"],
                    **dat_read_options)

# Merge the tables into one complete dataset (one row per rating).
# validate="many_to_one" makes pandas raise MergeError if a user_id / movie_id is
# duplicated on the right-hand side, instead of silently multiplying rating rows.
df_out = (
    ratings
    .merge(users, on="user_id", validate="many_to_one")
    .merge(movies, on="movie_id", validate="many_to_one")
)

# Preview the first few rows.
print(df_out.head())
print("\nThông tin dataframe:")
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df_out.info()


   user_id  movie_id  rating  timestamp gender  age  occupation    zip  \
0        1      1193       5  978300760      F    1          10  48067   
1        1       661       3  978302109      F    1          10  48067   
2        1       914       3  978301968      F    1          10  48067   
3        1      3408       4  978300275      F    1          10  48067   
4        1      2355       5  978824291      F    1          10  48067   

                                    title                        genres  
0  One Flew Over the Cuckoo's Nest (1975)                         Drama  
1        James and the Giant Peach (1996)  Animation|Children's|Musical  
2                     My Fair Lady (1964)               Musical|Romance  
3                  Erin Brockovich (2000)                         Drama  
4                    Bug's Life, A (1998)   Animation|Children's|Comedy  

Thông tin dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data colu

In [8]:
# Report, for every feature, how many distinct values it has and which dtype it uses.
print("Feature summary:\n")
summary = [
    [name, df_out[name].nunique(), df_out[name].dtype]
    for name in df_out.columns
]

# Present the collected rows as a tidy DataFrame.
summary_df = pd.DataFrame(summary, columns=["Feature", "Unique_Count", "Dtype"])
print(summary_df)


Feature summary:

      Feature  Unique_Count   Dtype
0     user_id          6040   int64
1    movie_id          3706   int64
2      rating             5   int64
3   timestamp        458455   int64
4      gender             2  object
5         age             7   int64
6  occupation            21   int64
7         zip          3439  object
8       title          3706  object
9      genres           301  object


In [9]:
import pandas as pd

# Split the columns into those that get one-hot encoded and those kept as plain numbers.
categorical_features = [
    "user_id", "movie_id", "gender", "age",
    "occupation", "zip", "title", "genres",
]
numeric_features = ["rating", "timestamp"]

# One-hot encode every categorical column; sparse=True asks pandas for
# sparse-backed dummy columns.
df_onehot = pd.get_dummies(
    df_out[categorical_features],
    columns=categorical_features,
    sparse=True,
)

# Append the untouched numeric columns to the right of the dummy columns.
df_encoded = pd.concat(
    [df_onehot, df_out[numeric_features]],
    axis=1,
)

print("Shape sau khi One-Hot Encoding:", df_encoded.shape)

Shape sau khi One-Hot Encoding: (1000209, 17224)


In [10]:
import pandas as pd
import numpy as np

# Columns to one-hot encode vs. columns kept as plain numbers.
categorical_features = [
    "user_id", "movie_id", "gender", "age",
    "occupation", "zip", "title", "genres",
]
numeric_features = ["rating", "timestamp"]

# Build the encoded frame with sparse-backed dummy columns, then attach the numeric columns.
df_onehot_sparse = pd.get_dummies(
    df_out[categorical_features],
    columns=categorical_features,
    sparse=True,
)
numeric_part = df_out[numeric_features]
df_encoded_sparse = pd.concat([df_onehot_sparse, numeric_part], axis=1)

shape = df_encoded_sparse.shape
print("Sparse shape:", shape)

# Estimate the footprint of the same matrix stored densely as float32
# (4 bytes per cell), expressed in units of 1024**3 bytes.
n_rows, n_cols = shape
approx_mem = n_rows * n_cols * 4 / (1024**3)
print("Approx dense memory size: %.2f GB" % approx_mem)


Sparse shape: (1000209, 17224)
Approx dense memory size: 64.18 GB


In [14]:
import numpy as np

# Demonstration: try to allocate the dense float32 matrix that a full one-hot
# encoding of df_out would need, and show the failure instead of crashing the run.
categorical_features = ["user_id","movie_id","gender","age","occupation","zip","title","genres"]
rows = len(df_out)
# One dummy column per distinct value of every categorical feature.
onehot_cols = sum(df_out[c].nunique() for c in categorical_features)
total_dense_cols = onehot_cols + 2  # + rating, timestamp

print(f"Will try to allocate array of shape ({rows:,}, {total_dense_cols:,}) float32...")
approx_gb = rows * total_dense_cols * 4 / (1024**3)  # 4 bytes per float32 cell
print(f"Expected allocation: ~{approx_gb:.2f} GB")

# The allocation is expected to fail on ordinary hardware. Catch the error and
# display it so the notebook still runs top-to-bottom (Restart & Run All).
try:
    X = np.empty((rows, total_dense_cols), dtype=np.float32)
except MemoryError as exc:
    X = None  # make the failed state explicit for any later cell
    print(f"MemoryError: {exc}")


Will try to allocate array of shape (1,000,209, 17,224) float32...
Expected allocation: ~64.18 GB


MemoryError: Unable to allocate 64.2 GiB for an array with shape (1000209, 17224) and data type float32

In [16]:
import torch
import torch.nn as nn

# 1. Import (or redefine) the DeepFM class exactly as it was declared at training time.
#    NOTE(review): the ImportError recorded for this cell reports "unknown location";
#    presumably `deepfm_model` resolves to a directory/namespace package rather than
#    a deepfm_model.py that defines DeepFM — confirm what is on sys.path.
from deepfm_model import DeepFM   # e.g. if you have a deepfm_model.py file

# 2. Build the model with the same feature_size, embedding_dim, hidden_dims, ... as in training.
feature_size = ...      # TODO: number of encoded features (from feature_index.pkl)
embedding_dim = 8       # or the value used during training
hidden_dims = [128, 64] # or the value used during training
dropout = 0.5           # or the value used during training

# Fail early with a clear message while the placeholder above has not been filled in,
# rather than passing Ellipsis into the model constructor.
if feature_size is Ellipsis:
    raise ValueError(
        "feature_size is still a placeholder; set it to the number of encoded "
        "features used at training time before loading the checkpoint."
    )

model = DeepFM(feature_size, embedding_dim, hidden_dims, dropout)

# 3. Restore the trained weights.
#    weights_only=True restricts unpickling to tensors and plain containers, which is
#    all a state_dict needs and avoids executing arbitrary pickled code from the file.
state_dict = torch.load(
    "artifacts/deepfm_pytorch.pth",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)

# 4. Switch to evaluation mode.
model.eval()
print("Model loaded thành công!")


ImportError: cannot import name 'DeepFM' from 'deepfm_model' (unknown location)