In [1]:
import pickle

import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
# Read the raw ratings export. Despite the .csv extension, the file is parsed
# as tab-separated text.
ratings_path = 'Products_ThoiTrangNam_rating_raw.csv'
df = pd.read_csv(ratings_path, sep='\t')
df

Unnamed: 0,product_id,user_id,user,rating
0,190,1,karmakyun2nd,5
1,190,2,tranquangvinh_vv,5
2,190,3,nguyenquoctoan2005,5
3,190,4,nguyenthuyhavi,5
4,190,5,luonganh5595,5
...,...,...,...,...
1024477,171107,378659,phantom061212,4
1024478,171107,225341,donggiang_94,4
1024479,171107,650634,ethanhunt94,3
1024480,171107,650635,nguyendiu.3.3.95,2


In [3]:
# Flag rows that repeat an earlier row across every column, then report how
# many there are.
dup_all = df.duplicated(keep='first')
n_dup_all = dup_all.sum()
print(f"Số dòng duplicate toàn bộ: {n_dup_all}")

# Flag rows whose (user_id, product_id) pair already appeared earlier in the
# frame, regardless of the other columns, and report the count.
pair_columns = ['user_id', 'product_id']
dup_user_product = df.duplicated(subset=pair_columns)
n_dup_user_product = dup_user_product.sum()
print(f"Số dòng duplicate theo user_id + product_id: {n_dup_user_product}")

Số dòng duplicate toàn bộ: 24667
Số dòng duplicate theo user_id + product_id: 34340


In [4]:
# Remove rows that are exact copies of an earlier row (all columns compared),
# keeping the first occurrence. The surviving rows keep their index labels.
df = df.drop_duplicates(subset=None, keep='first')

In [5]:
# Keep a single row per (user_id, product_id) pair: the first occurrence in
# frame order (drop_duplicates defaults to keep='first').
pair_key = ['user_id', 'product_id']
df = df.drop_duplicates(subset=pair_key)

In [6]:
# Display the frame after de-duplication (pandas truncates the rendering).
df

Unnamed: 0,product_id,user_id,user,rating
0,190,1,karmakyun2nd,5
1,190,2,tranquangvinh_vv,5
2,190,3,nguyenquoctoan2005,5
3,190,4,nguyenthuyhavi,5
4,190,5,luonganh5595,5
...,...,...,...,...
1024477,171107,378659,phantom061212,4
1024478,171107,225341,donggiang_94,4
1024479,171107,650634,ethanhunt94,3
1024480,171107,650635,nguyendiu.3.3.95,2


In [7]:
# Draw a reproducible random subset of the de-duplicated ratings and renumber
# its index from 0.
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the request is capped at the frame length.
# NOTE(review): the displayed frame contains user_id values above 650,000, so
# a 10,000-row sample likely holds very few users with more than one rating;
# confirm this sample size is intended for the exported model.
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [8]:
# Describe the rating range to Surprise. The bounds are taken from the full
# de-duplicated frame rather than from the random training sample, so a sample
# that happens to miss the lowest or highest rating cannot narrow the scale
# (Surprise clips its predictions to this range).
rating_scale = (df['rating'].min(), df['rating'].max())
reader = Reader(rating_scale=rating_scale)

# load_from_df expects exactly three columns, in the order user, item, rating.
data = Dataset.load_from_df(df_sample[['user_id', 'product_id', 'rating']], reader)

In [9]:
# 5. Train-test split: hold out 20% of the sampled ratings.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 6. Train model. SVD initialises its factors randomly, so the seed is fixed
# to make the exported model reproducible from run to run.
algo = SVD(random_state=42)
algo.fit(trainset)

# Score the held-out ratings so the split is actually used; accuracy.rmse
# prints the RMSE and also returns it.
predictions = algo.test(testset)
test_rmse = accuracy.rmse(predictions)

# 7. Export the model together with the data the app needs.
# NOTE: loading a pickle can execute arbitrary code; the consuming app should
# only load this file from a trusted location.
model_data = {
    'model': algo,
    'df_sample': df_sample
}

with open('surprise_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print('✅ Đã lưu model và data vào file surprise_model.pkl')


✅ Đã lưu model và data vào file surprise_model.pkl
