In [3]:
#!mkdir -p ~/aiffel/recommendata_iu/data/ml-1m
#!ln -s ~/data/ml-1m/* ~/aiffel/recommendata_iu/data/ml-1m
!ls


aiffel	data


In [54]:
import pandas as pd
import os
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
import numpy as np
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')


orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [5]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count1'}, inplace=True)

In [6]:
ratings['count1']


0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: count1, Length: 836478, dtype: int64

In [7]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()


Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
ratings

Unnamed: 0,user_id,movie_id,count1,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
a = len(ratings['movie_id'].unique()) # ratings에 있는 유니크한 영화 개수
b = len(ratings['user_id'].unique()) # rating에 있는 유니크한 사용자 수 
c = ratings.groupby('movie_id')['count1'].count()
print(f'ratings에 있는 유니크한 영화 개수 : {a} \nratings에 있는 유니크한 사용자 수 : {b}')
print('인기 많은 영화 수 정렬 :\n',c.sort_values(ascending=False).head(30))

ratings에 있는 유니크한 영화 개수 : 3628 
ratings에 있는 유니크한 사용자 수 : 6039
인기 많은 영화 수 정렬 :
 movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: count1, dtype: int64


In [11]:
ratings

Unnamed: 0,user_id,movie_id,count1,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [45]:
my_favorite = [1, 2, 3 , 4, 5]

# 'zimin'이라는 user_id가 위 아티스트의 노래를 30회씩 들었다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': [6042]*5, 'movie_id': my_favorite, 'count1':[5]*5,'timestamp':[978300275]*5})

if not ratings.isin({'user_id':['6042']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)                           # 위에 임의로 만든 my_favorite 데이터를 추가해 줍니다.

In [43]:
ratings = ratings[:-7]

In [46]:
ratings

Unnamed: 0,user_id,movie_id,count1,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
0,6042,1,5,978300275
1,6042,2,5,978300275
2,6042,3,5,978300275
3,6042,4,5,978300275


In [26]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [57]:
movie_data = {v:k for k,v in enumerate(movies['title'].unique())}
user_id_data = {v:k for k,v in enumerate(ratings['user_id'].unique())}


In [47]:
num_user = max(ratings['user_id'])+1
num_movie = max(ratings['movie_id'])+1

csr_data = csr_matrix((ratings.count1, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6043x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [55]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

csr_data_transpose = csr_data.T

als_model.fit(csr_data_transpose)


  0%|          | 0/15 [00:00<?, ?it/s]

In [65]:
#movie_data
user_id_data[1000]

999

In [93]:
myid, movie_name = user_id_data[6040], movie_data['Jumanji (1995)']
myid_vector, toy_story_vector = als_model.user_factors[myid], als_model.item_factors[movie_name]

In [90]:
ratings['user_id'].unique

<bound method Series.unique of 0             1
1             1
2             1
3             1
4             1
           ... 
1000204    6040
1000205    6040
1000206    6040
1000207    6040
1000208    6040
Name: user_id, Length: 1000209, dtype: int64>

In [91]:
np.dot(myid_vector,toy_story_vector)

0.12922956

In [94]:
movie_data

{'Toy Story (1995)': 0,
 'Jumanji (1995)': 1,
 'Grumpier Old Men (1995)': 2,
 'Waiting to Exhale (1995)': 3,
 'Father of the Bride Part II (1995)': 4,
 'Heat (1995)': 5,
 'Sabrina (1995)': 6,
 'Tom and Huck (1995)': 7,
 'Sudden Death (1995)': 8,
 'GoldenEye (1995)': 9,
 'American President, The (1995)': 10,
 'Dracula: Dead and Loving It (1995)': 11,
 'Balto (1995)': 12,
 'Nixon (1995)': 13,
 'Cutthroat Island (1995)': 14,
 'Casino (1995)': 15,
 'Sense and Sensibility (1995)': 16,
 'Four Rooms (1995)': 17,
 'Ace Ventura: When Nature Calls (1995)': 18,
 'Money Train (1995)': 19,
 'Get Shorty (1995)': 20,
 'Copycat (1995)': 21,
 'Assassins (1995)': 22,
 'Powder (1995)': 23,
 'Leaving Las Vegas (1995)': 24,
 'Othello (1995)': 25,
 'Now and Then (1995)': 26,
 'Persuasion (1995)': 27,
 'City of Lost Children, The (1995)': 28,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 29,
 'Dangerous Minds (1995)': 30,
 'Twelve Monkeys (1995)': 31,
 'Wings of Courage (1995)': 32,
 'Babe (1995)'

In [97]:
movie_id = movie_data['Jumanji (1995)']
similar_movie = als_model.similar_items(movie_id,N=15)
idx_to_movie = {v:k for k,v in movie_data.items()}
[idx_to_movie[i[0]] for i in similar_movie]


['Jumanji (1995)',
 'Third Miracle, The (1999)',
 'Carrington (1995)',
 'Batman (1989)',
 'Heathers (1989)',
 "You've Got Mail (1998)",
 'Wild Bunch, The (1969)',
 'Maverick (1994)',
 'Little Voice (1998)',
 "Child's Play 2 (1990)",
 'I Love Trouble (1994)',
 'Deadly Friend (1986)',
 'Friday the 13th Part 3: 3D (1982)',
 'Air Force One (1997)',
 'Alan Smithee Film: Burn Hollywood Burn, An (1997)']

In [99]:
def get_similar_movie(movie_name: str):
    movie_id = movie_data[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

In [100]:
get_similar_movie('Jumanji (1995)')

['Jumanji (1995)',
 'Third Miracle, The (1999)',
 'Carrington (1995)',
 'Batman (1989)',
 'Heathers (1989)',
 "You've Got Mail (1998)",
 'Wild Bunch, The (1969)',
 'Maverick (1994)',
 'Little Voice (1998)',
 "Child's Play 2 (1990)"]