In [1]:
import os
import numpy as np
import pandas as pd
rating_file_path=os.getenv('HOME') + '/mini_projects/_E-08_recommend/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
len(ratings) - ratings.count()

user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

In [3]:
ratings[ratings.duplicated()]

Unnamed: 0,user_id,movie_id,rating,timestamp


In [4]:
# 3점 이상만 남깁니다.
#ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')  # 왜 timestamp를 없애면 안줄어드는 거지?
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 1000209
Ratio of Remaining Data is 100.00%


In [5]:
# rating 컬럼의 이름을 count로 바꿉니다.                                    # 왜 바꾸는 거지? 평점이 아니라 본 횟수로 간주하려고?
ratings.rename(columns={'rating':'count'}, inplace=True)

In [6]:
ratings['count']

0          5
1          3
2          3
3          4
4          5
          ..
1000204    1
1000205    5
1000206    5
1000207    4
1000208    4
Name: count, Length: 1000209, dtype: int64

In [7]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/mini_projects/_E-08_recommend/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
movies['title'] = movies['title'].str.lower() # 검색을 쉽게하기 위해 소문자로 바꿔줍시다.
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),Animation|Children's|Comedy
1,2,jumanji (1995),Adventure|Children's|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama
4,5,father of the bride part ii (1995),Comedy
5,6,heat (1995),Action|Crime|Thriller
6,7,sabrina (1995),Comedy|Romance
7,8,tom and huck (1995),Adventure|Children's
8,9,sudden death (1995),Action
9,10,goldeneye (1995),Action|Adventure|Thriller


In [9]:
movies['genre'] = movies['genre'].str.lower() # 검색을 쉽게하기 위해 소문자로 바꿔줍시다.
movies.head(10)

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),animation|children's|comedy
1,2,jumanji (1995),adventure|children's|fantasy
2,3,grumpier old men (1995),comedy|romance
3,4,waiting to exhale (1995),comedy|drama
4,5,father of the bride part ii (1995),comedy
5,6,heat (1995),action|crime|thriller
6,7,sabrina (1995),comedy|romance
7,8,tom and huck (1995),adventure|children's
8,9,sudden death (1995),action
9,10,goldeneye (1995),action|adventure|thriller


In [10]:
condition = (ratings['user_id']== ratings.loc[0, 'user_id'])
ratings.loc[condition]

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [11]:
# 유저 수
ratings['user_id'].nunique()

6040

In [12]:
# 유저가 본 영화 수
ratings['movie_id'].nunique()

3706

In [13]:
# 인기 많은 영화
movie_count = ratings.groupby('movie_id')['user_id'].count()        # 영화 이름이 나오게 하려면 어떻게 해야 되지?
movie_count.sort_values(ascending=False).head(30)

movie_id
2858    3428
260     2991
1196    2990
1210    2883
480     2672
2028    2653
589     2649
2571    2590
1270    2583
593     2578
1580    2538
1198    2514
608     2513
2762    2459
110     2443
2396    2369
1197    2318
527     2304
1617    2288
1265    2278
1097    2269
2628    2250
2997    2241
318     2227
858     2223
356     2194
2716    2181
296     2171
1240    2098
1       2077
Name: user_id, dtype: int64

In [14]:
# 유저별 몇 편의 영화를 봤는지에 대한 통계
user_count = ratings.groupby('user_id')['movie_id'].count()
user_count.describe()

count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
Name: movie_id, dtype: float64

In [15]:
# 유저별 평점 중앙값에 대한 통계
user_median = ratings.groupby('user_id')['count'].median()
user_median.describe()

count    6040.000000
mean        3.840811
std         0.577449
min         1.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: count, dtype: float64

In [16]:
mov = [1986, 1013, 1012, 1014, 2021]  # 영화 이름이 나오게 하려면 어떻게 해야 되지?

for i in mov:
    print(movies['title'][i])

hot lead and cold feet (1978)
so dear to my heart (1949)
sword in the stone, the (1963)
robin hood: prince of thieves (1991)
rescuers, the (1977)


In [17]:
# 본인이 좋아하시는 영화 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite = ['3245' , '1234' ,'2367' ,'34' ,'657']

# 'joshua'이라는 user_id가 위 영화를 4번 봤다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': ['lee']*5, 'movie_id': my_favorite, 'count':[4]*5,})
    
if not ratings.isin({'user_id':['lee']})['user_id'].any(): 
    ratings = ratings.append(my_playlist)


ratings.tail(10)       # 잘 추가되었는지 확인해 봅시다.

  mask |= (ar1 == a)


Unnamed: 0,user_id,movie_id,count,timestamp
1000204,6040,1091,1,956716541.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,lee,3245,4,
1,lee,1234,4,
2,lee,2367,4,
3,lee,34,4,
4,lee,657,4,


In [18]:
# 고유한 유저, 영화를 찾아내는 코드
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 영화 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [19]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx['lee'])  


6040


In [20]:

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# movie_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('artist column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('artist column indexing Fail!!')

ratings

user_id column indexing OK!!
artist column indexing OK!!


Unnamed: 0,user_id,movie_id,count,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6040,3706,4,
1,6040,3707,4,
2,6040,3708,4,
3,6040,3709,4,


In [21]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()



In [22]:
csr_data = csr_matrix((ratings['count'], (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6041x3711 sparse matrix of type '<class 'numpy.int64'>'
	with 1000214 stored elements in Compressed Sparse Row format>

In [23]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [24]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [25]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3711x6041 sparse matrix of type '<class 'numpy.int64'>'
	with 1000214 stored elements in Compressed Sparse Column format>

In [26]:
csr_data_transpose.shape

(3711, 6041)

In [27]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [28]:
lee, toy_story = user_to_idx['lee'], movie_to_idx[1]
lee_vector, toystory_vector = als_model.user_factors[lee], als_model.item_factors[toy_story]

print('슝=3')    

슝=3


In [29]:
print(lee_vector)
print(len(lee_vector))
lee_vector.shape

[ 4.62010689e-03 -7.75674416e-04 -1.11164926e-02  7.30716623e-03
  2.51553021e-03  7.81544764e-03 -3.08685889e-03  1.09492172e-03
 -2.09571538e-03  1.10402927e-02  3.32768029e-03  6.20692619e-04
  7.24437938e-04  4.67154384e-03  4.00708476e-03 -5.06245438e-03
  1.17845112e-03  9.60780017e-04 -1.54261105e-02  9.62143857e-03
 -2.49900040e-04 -1.00339446e-02 -2.40610843e-03  4.07298235e-03
 -7.65041634e-03  1.56127242e-02  1.72443483e-02 -1.72247365e-03
 -8.93641030e-04 -3.84712883e-04 -4.43826104e-03  1.25896586e-02
  4.84455237e-03 -3.28851095e-03 -1.11637162e-02 -2.17209663e-03
  1.92675404e-02 -4.28847596e-03  3.77536146e-03  9.05711763e-03
 -2.10304209e-03 -1.25402818e-02 -1.64293405e-02 -4.61262511e-03
 -6.70115463e-03 -7.29309116e-03 -2.11601425e-03 -2.49898853e-03
 -2.28340016e-03  2.09725206e-03  4.80785547e-03 -4.64525307e-04
  8.72782257e-05  1.46109459e-03 -1.93606038e-02 -9.62923281e-04
  4.60819993e-03 -9.17729922e-03 -2.79618078e-03 -1.09054279e-02
 -1.35284470e-04  1.28874

(100,)

In [30]:
print(toystory_vector)
len(toystory_vector)
toystory_vector.shape

[ 1.31176363e-04  9.40336031e-04  7.45215174e-03  2.26733871e-02
 -1.31747453e-02 -1.47103667e-02 -2.11265497e-02 -7.74076162e-03
  3.43256420e-03 -3.04382374e-05 -1.56078655e-02 -6.36721076e-03
 -2.32400484e-02  1.18175214e-02  8.80659465e-03 -6.96026674e-03
  4.80120592e-02  8.54541268e-03  1.89258996e-02  1.00171566e-02
  1.55459922e-02  9.07228049e-03  3.58877108e-02 -1.95313096e-02
  1.69159994e-02 -7.82426796e-04 -2.10746005e-02  2.14772229e-03
 -4.76715947e-03  6.34211348e-03  1.59151703e-02  1.92558859e-02
  2.41491869e-02 -2.52726302e-02  2.99910363e-02  4.98000532e-02
  1.93701349e-02 -1.63369495e-02  1.55370012e-02 -2.38856208e-03
  3.83617170e-02  1.84711209e-03 -7.27095967e-03  1.13450494e-02
  5.63267851e-03 -1.88659932e-02  2.82066334e-02  1.00049591e-02
 -1.88109484e-02  4.08112854e-02  6.08508149e-03  3.47587131e-02
  7.14968983e-03  2.10753866e-02 -6.98518800e-03  2.56181927e-03
 -3.06683425e-02 -8.57965834e-03 -1.46267144e-02 -5.17569191e-04
 -1.88663173e-02  2.13439

(100,)

In [31]:
np.dot(lee_vector, toystory_vector)

5.2466436e-05

In [32]:
user = user_to_idx['lee']
# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended
# 평점 3점 미만 버렸을 때
# factor 가 1000일 땐 0.04 ~ 0.05
# factor 가 500 일 땐 0.02
# factor 가 100일 땐 0.001 ~ 0.002

# 평점제한 없앴을 때
# 1000, 0.03~0.05
# 500, 0.02
# 100, 0.002

# 내 가설이 맞았다. 영화추천에선 평점 제한이 있으나 없으나 별 차이 없다.  그나저나 왜 이렇게 낮은거지?

[(716, 0.0027585418),
 (547, 0.002561129),
 (891, 0.0025558965),
 (2567, 0.0024867505),
 (2439, 0.0024274695),
 (420, 0.0022850558),
 (798, 0.0022527727),
 (60, 0.00221904),
 (3083, 0.0022126245),
 (1523, 0.0022067497),
 (2444, 0.0022007562),
 (2026, 0.002198916),
 (273, 0.0021967497),
 (206, 0.0021645816),
 (2529, 0.0021498245),
 (148, 0.0021326256),
 (912, 0.0021114894),
 (116, 0.0021100994),
 (2825, 0.0020689988),
 (1412, 0.002057631)]

In [33]:
for i in movie_recommended:
    print(movies['title'][i[0]])

great white hype, the (1996)
nightmare before christmas, the (1993)
vertigo (1958)
mummy's ghost, the (1944)
breaks, the (1999)
blue chips (1994)
alaska (1996)
eye for an eye (1996)
last picture show, the (1971)
dream with the fishes (1997)
pet sematary (1989)
shaggy d.a., the (1976)
milk money (1994)
waterworld (1995)
pushing tin (1999)
apollo 13 (1995)
2001: a space odyssey (1968)
if lucy fell (1996)
romance (1999)
cement garden, the (1993)
