In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

In [26]:
from surprise import Dataset
from surprise.model_selection import train_test_split

from surprise import Reader
from surprise import accuracy

# ml-100k: 10만 개 평점 데이터
# data = Dataset.load_builtin('ml-100k')

# # surprise의 train_test_split() 사용
# trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

In [5]:
reader = Reader(line_format='user item rating timestamp', sep=',',
               rating_scale=(1, 58))

In [6]:
customers_df = pd.read_csv('D:/etc_code/custormer_df.csv')
# customers_df = customers_df.rename(columns={'user': 'n_users'})
# customers_df = customers_df.astype(int)
customers_df

Unnamed: 0,user,apt_id,score
0,73.0,23609.0,9.923910
1,93.0,3155.0,3.839354
2,94.0,22302.0,16.391832
3,30.0,36379.0,29.020753
4,22.0,52759.0,18.010127
...,...,...,...
149995,33.0,51019.0,25.381025
149996,54.0,24961.0,35.742562
149997,21.0,6782.0,14.853286
149998,87.0,54226.0,1.385820


In [48]:
customers_df.describe()

Unnamed: 0,user,apt_id,score
count,150000.0,150000.0,150000.0
mean,50.09312,30020.09718,27.104009
std,28.586712,17325.133352,14.921979
min,1.0,1.0,1.300002
25%,25.0,15026.75,14.15373
50%,50.0,30006.0,27.175542
75%,75.0,45091.0,40.063885
max,99.0,59999.0,52.899725


In [14]:
customers_df.user.value_counts().reset_index().sort_values(by=['user'], ascending = False)

Unnamed: 0,index,user
0,13.0,1600
1,55.0,1597
2,59.0,1592
3,52.0,1587
4,99.0,1577
...,...,...
94,92.0,1441
95,26.0,1438
96,70.0,1434
97,43.0,1423


In [7]:
data = Dataset.load_from_df(customers_df,reader=reader)

trainset, testset = train_test_split(data, test_size=.25, random_state=0)

In [33]:
from surprise import SVD

# SVD를 이용한 잠재 요인 협업 필터링
algo = SVD(n_factors=30)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1db2822bf70>

In [60]:
# 사용자 아이디(uid), 아이템 아이디(iid)는 문자열로 입력
uid = str(13)
iid = str(1)

# 추천 예측 평점 (.predict)
pred = algo.predict(uid, iid)
pd.DataFrame(pred).T

Unnamed: 0,0,1,2,3,4
0,13,1,,27.102874,{'was_impossible': False}


In [61]:
prediction_result.columns

Index(['uid', 'iid', 'r_ui', 'est', 'details'], dtype='object')

In [80]:
uid = str(13)
pred_values_all = []

for i in customers_df['apt_id'].unique():
    iid = i
    pred_values = algo.predict(uid, iid)
    pred_values = pd.DataFrame(pred_values).T
    pred_values.columns = ['uid', 'iid', 'r_ui', 'est', 'details']
    pred_values_all.append(pred_values)

pred_values_all = pd.concat(pred_values_all)
pred_values_all

Unnamed: 0,uid,iid,r_ui,est,details
0,13,23609.0,,27.508012,{'was_impossible': False}
0,13,3155.0,,27.486731,{'was_impossible': False}
0,13,22302.0,,27.506482,{'was_impossible': False}
0,13,36379.0,,26.570086,{'was_impossible': False}
0,13,52759.0,,26.705523,{'was_impossible': False}
...,...,...,...,...,...
0,13,52902.0,,27.221519,{'was_impossible': False}
0,13,55104.0,,27.056669,{'was_impossible': False}
0,13,43789.0,,27.102874,{'was_impossible': False}
0,13,29633.0,,27.501821,{'was_impossible': False}


In [81]:
pred_values_all[pred_values_all['r_ui'].isnull() == False]

Unnamed: 0,uid,iid,r_ui,est,details


In [74]:
pred_values_all = pred_values_all.rename(columns={'uid' : 'user_id', 'iid' : 'apt_id'})

In [75]:
pred_values_all.to_csv('./user13_output.csv', encoding='CP949', index=False)

In [68]:
pred_values_all.sort_values(by=['est'],ascending = False)[:5]

Unnamed: 0,uid,iid,r_ui,est,details
0,13,5803.0,,28.699817,{'was_impossible': False}
0,13,54911.0,,28.677532,{'was_impossible': False}
0,13,4801.0,,28.588761,{'was_impossible': False}
0,13,2636.0,,28.553117,{'was_impossible': False}
0,13,38041.0,,28.541588,{'was_impossible': False}


In [79]:
pred_values_all.sort_values(by=['est'],ascending = True)[:5]

Unnamed: 0,user_id,apt_id,r_ui,est,details
0,13,12128.0,,25.351435,{'was_impossible': False}
0,13,59403.0,,25.576141,{'was_impossible': False}
0,13,55742.0,,25.586484,{'was_impossible': False}
0,13,38925.0,,25.595889,{'was_impossible': False}
0,13,2887.0,,25.60799,{'was_impossible': False}


In [35]:
# Prediction(uid='196', iid='302', r_ui=None, est=4.04440275028659, details={'was_impossible': False})

In [36]:
predictions = algo.test( testset )

print('prediction type :',type(predictions), ' size:',len(predictions))
# print('prediction 결과의 최초 5개 추출')
accuracy.rmse(predictions)


prediction type : <class 'list'>  size: 37500
RMSE: 15.4888


15.488762289794932

In [24]:
predictions[100:]

[Prediction(uid=43.0, iid=25497.0, r_ui=8.24212466, est=28.86799969813404, details={'was_impossible': False}),
 Prediction(uid=85.0, iid=6601.0, r_ui=11.78006538, est=27.338504883641388, details={'was_impossible': False}),
 Prediction(uid=63.0, iid=45735.0, r_ui=40.02210839, est=30.610045531389222, details={'was_impossible': False}),
 Prediction(uid=90.0, iid=16575.0, r_ui=14.72166188, est=29.16291512539292, details={'was_impossible': False}),
 Prediction(uid=27.0, iid=6711.0, r_ui=29.48922077, est=27.551759194903543, details={'was_impossible': False}),
 Prediction(uid=11.0, iid=20658.0, r_ui=3.01265394, est=23.181684861799035, details={'was_impossible': False}),
 Prediction(uid=51.0, iid=21103.0, r_ui=51.41006851, est=30.270942845738034, details={'was_impossible': False}),
 Prediction(uid=3.0, iid=36757.0, r_ui=40.6290945, est=26.88006652654053, details={'was_impossible': False}),
 Prediction(uid=7.0, iid=30983.0, r_ui=48.10982115, est=27.79438555423112, details={'was_impossible': Fal

In [19]:
prediction_result = pd.DataFrame(predictions)
prediction_result

Unnamed: 0,uid,iid,r_ui,est,details
0,5.0,13679.0,22.414640,27.092422,{'was_impossible': False}
1,51.0,21310.0,5.537697,27.876553,{'was_impossible': False}
2,54.0,53788.0,2.458223,29.858181,{'was_impossible': False}
3,53.0,25906.0,6.937100,28.140406,{'was_impossible': False}
4,70.0,44374.0,37.725593,27.878519,{'was_impossible': False}
...,...,...,...,...,...
37495,61.0,10001.0,8.600527,27.050274,{'was_impossible': False}
37496,24.0,4715.0,45.339202,22.280882,{'was_impossible': False}
37497,88.0,15653.0,28.124077,27.188064,{'was_impossible': False}
37498,27.0,24878.0,32.615208,26.947794,{'was_impossible': False}


In [41]:
prediction_result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37500 entries, 0 to 37499
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   uid      37500 non-null  float64
 1   iid      37500 non-null  float64
 2   r_ui     37500 non-null  float64
 3   est      37500 non-null  float64
 4   details  37500 non-null  object 
dtypes: float64(4), object(1)
memory usage: 1.4+ MB


In [42]:
prediction_result[prediction_result['uid'] == 13]

Unnamed: 0,uid,iid,r_ui,est,details
6,13.0,2409.0,38.496329,28.416704,{'was_impossible': False}
115,13.0,11055.0,45.327808,30.864853,{'was_impossible': False}
242,13.0,1530.0,23.605516,24.430054,{'was_impossible': False}
563,13.0,41405.0,44.978298,27.795319,{'was_impossible': False}
635,13.0,13036.0,21.009403,28.527370,{'was_impossible': False}
...,...,...,...,...,...
36842,13.0,7464.0,16.151149,26.640758,{'was_impossible': False}
36920,13.0,20231.0,5.820816,25.106522,{'was_impossible': False}
36931,13.0,40849.0,22.934797,21.453411,{'was_impossible': False}
37106,13.0,39066.0,28.060260,23.464275,{'was_impossible': False}


In [45]:
prediction_result[(prediction_result['uid'] == 13) & (prediction_result['iid'] == 554.0)]

Unnamed: 0,uid,iid,r_ui,est,details
29190,13.0,554.0,20.285336,31.222634,{'was_impossible': False}


In [22]:
prediction_result[prediction_result['uid'] == 13]['est'].describe()

count    410.000000
mean      26.535216
std        2.616229
min       14.304825
25%       25.326618
50%       26.640758
75%       27.582038
max       49.979034
Name: est, dtype: float64

In [23]:
prediction_result.describe()

Unnamed: 0,uid,iid,r_ui,est
count,37500.0,37500.0,37500.0,37500.0
mean,50.091173,30107.880213,27.107414,27.106671
std,28.642028,17365.220646,14.920074,2.905817
min,1.0,2.0,1.300815,1.16698
25%,25.0,15139.75,14.162597,25.942413
50%,50.0,30134.5,27.262653,27.086087
75%,75.0,45214.5,40.070381,28.246799
max,99.0,59996.0,52.899609,52.671375
