## 프로젝트 - movielens 영화추천실습 

### 1. 데이터 준비와 전처리 
#### 1-1 데이터 불러오기 

In [1]:
import os 
import pandas as pd
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id','rating', 'timestamp']
ratings = pd.read_csv(rating_file_path,sep = '::', names = ratings_cols,engine = 'python')
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### 1-2 3점 이상 데이터만 남기기 

In [2]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size:{filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size/orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size:836478
Ratio of Remaining Data is 83.63%


### 1-3
#### 데이터의 이름 변경 평가 > 점수

In [3]:
ratings.rename(columns={'rating':'count'}, inplace=True)
#원본데이터의 columns의 이름 변경 

In [4]:
movie_file_path = os.getenv("HOME") + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols =['movie_id','title','genre']
movies = pd.read_csv(movie_file_path,sep='::', names=cols, engine='python')
movies.head(300)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
295,298,Pushing Hands (1992),Comedy
296,299,Priest (1994),Drama
297,300,Quiz Show (1994),Drama
298,301,Picture Bride (1995),Drama|Romance


In [5]:
def title_to_id(name):
    t = movies['title'] == name
    tt = movies[t]
    r = tt['movie_id'].values[0].item()
    return r

def id_to_title(_id):
    t = movies['movie_id'] == _id
    tt = movies[t]
    r = tt['title'].item()
    return r

### 2. 분석해봅시다. 
#### ratings에 있는 유니크한 영화 개수 (중복되지 않는 순수한 영화의 수)


In [6]:
movies['title'] = movies['title'].str.lower()
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),Animation|Children's|Comedy
1,2,jumanji (1995),Adventure|Children's|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama
4,5,father of the bride part ii (1995),Comedy


In [7]:
ratings['movie_id'].nunique() #여기서 유니크하다는건 어떤 의미일까? 

3628

### 2-1. rating에 있는 유니크한 사용자 수


In [8]:
ratings['user_id'].nunique()

6039

### 2-3. 가장 인기있는 영화 30개 

In [9]:
movies_count = ratings.groupby('movie_id')['user_id'].count()
movies_count.sort_values(ascending=False).head(30)

movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

### 3. 내가 좋아하는 영화 5개 넣기 

In [10]:
my_favorite_movies = [2858,1094,562,1196,2028]
my_favorite_movies = pd.DataFrame({'user_id':['minje']*5,'movie_id':my_favorite_movies,'count':[5]*5})
if not ratings.isin({'user_id':['minje']})['user_id'].any():
    ratings = ratings.append(my_favorite_movies)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,minje,2858,5,
1,minje,1094,5,
2,minje,562,5,
3,minje,1196,5,
4,minje,2028,5,



4) CSR matrix를 직접 만들어 봅시다.
5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.
6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

In [11]:
user_unique = ratings['user_id'].unique()
movies_unique = ratings['movie_id'].unique()
user_unique

array([1, 2, 3, ..., 6039, 6040, 'minje'], dtype=object)

In [12]:
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movies_to_idx = {v:k for k,v in enumerate(movies_unique)}

In [13]:
print(user_to_idx['minje'])
print(movies_to_idx[2858])

6039
99


In [14]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   
else:
    print('user_id column indexing Fail!!')

temp_movies_data = ratings['movie_id'].map(movies_to_idx.get).dropna()
if len(temp_movies_data) == len(ratings):
    print('movies column indexing OK!!')
    ratings['movie_id'] = temp_movies_data
else:
    print('movies column indexing Fail!!')

ratings

#우와! 무비 아이디로 넣었다고 되는게 신기함

user_id column indexing OK!!
movies column indexing OK!!


Unnamed: 0,user_id,movie_id,count,timestamp
0,0,0,5,978300760.0
1,0,1,3,978302109.0
2,0,2,3,978301968.0
3,0,3,4,978300275.0
4,0,4,5,978824291.0
...,...,...,...,...
0,6039,99,5,
1,6039,986,5,
2,6039,311,5,
3,6039,117,5,


### 4. CSR matrix를 직접 만들어 봅시다. 

In [15]:
only_one = ratings[ratings['count']<3]
one,all_data =len(only_one),len(ratings)
print(f'{one},{all_data}')
print(f'Ratio of only_one over all data is {one/all_data:2%}') 
#3미만은 모두 삭제함

0,836483
Ratio of only_one over all data is 0.000000%


In [16]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movies = ratings['movie_id'].nunique()



csr_data = csr_matrix((ratings['count'], (ratings.user_id,ratings.movie_id)),shape = (num_user,num_movies))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

### 5. als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다. 

In [17]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [18]:
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=100, dtype=np.float32)

In [19]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [20]:
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [22]:
minje, m = user_to_idx['minje'], movies_to_idx[2858]
minje_vector, m = als_model.user_factors[minje], als_model.item_factors[m]

In [23]:
minje_vector

array([-1.40539128e-02,  3.05406507e-02, -2.64236070e-02,  2.12192535e-01,
        8.41539651e-02,  1.44212261e-01, -4.42454331e-02,  8.63520876e-02,
        5.22930585e-02, -2.04888523e-01,  1.36393726e-01,  1.72523275e-01,
       -1.75398722e-01,  4.35960218e-02, -5.00962771e-02, -8.12998340e-02,
       -5.43136857e-02, -8.93302858e-02,  9.60494056e-02, -1.18892513e-01,
       -4.83299829e-02,  4.14493531e-02,  9.44848880e-02, -1.95435569e-01,
       -2.00889066e-01,  7.77408807e-03, -2.17279959e-02,  3.75228561e-02,
        1.20168246e-01, -2.36865617e-02, -1.76379569e-02,  1.32830620e-01,
       -1.24232555e-02,  1.16038367e-01, -9.86489877e-02, -9.40872282e-02,
       -2.92318780e-02,  5.66696450e-02, -2.68462133e-02, -1.83238849e-01,
       -4.12823893e-02,  1.76662728e-01,  9.55488011e-02,  2.11136043e-03,
       -1.56875197e-02, -1.43646598e-02, -3.03084236e-02,  1.52933896e-01,
       -5.16188778e-02,  6.22438304e-02, -4.15352210e-02,  6.10128082e-02,
       -1.11827180e-01,  

In [24]:
m

array([-1.31910145e-02,  1.33689176e-02, -1.10711362e-02,  5.92903979e-03,
        8.54512211e-03,  1.75750330e-02, -5.06115844e-03, -1.23147778e-02,
        2.41983961e-02, -1.27103953e-02,  2.47083269e-02,  2.88766157e-02,
        3.56378104e-03,  1.51056536e-02, -5.50292805e-03,  2.32637990e-02,
        9.55655240e-03, -1.30235804e-02, -1.74554065e-02,  2.87376102e-02,
       -9.96666029e-03, -1.66255329e-02, -1.00464569e-02,  1.79157797e-02,
       -2.44214833e-02,  4.89103375e-03, -1.77869871e-02, -9.31286626e-03,
        2.16166694e-02, -1.22693097e-02,  3.17299291e-02,  1.25633217e-02,
       -1.84680000e-02,  1.93049423e-02, -2.11153049e-02,  3.04074157e-02,
       -1.77690946e-02,  9.33377724e-03,  3.02797370e-02, -3.51517200e-02,
       -3.75973992e-02,  2.70215888e-02,  2.37416234e-02,  2.76226960e-02,
        5.16621536e-03,  1.75554231e-02,  3.30355391e-02, -1.01921801e-02,
       -1.52936147e-03, -9.58501548e-03, -5.98992687e-03,  1.10651888e-02,
       -7.48507492e-03,  

In [25]:
np.dot(minje_vector,m)

0.987526

### 6. 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.


In [26]:
def title_to_id(name):
    t = movies['title'] == name
    tt = movies[t]
    r = tt['movie_id'].values[0].item()
    return r

def id_to_title(_id):
    t = movies['movie_id'] == _id
    tt = movies[t]
    r = tt['title'].item()
    return r

In [27]:
movie = movies_to_idx[2028]
movie_vector = als_model.item_factors[2028]
np.dot(minje_vector,movie_vector)

0.031448122

### 7. 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [28]:
favorite_movie = 2858
movie_id = movies_to_idx[favorite_movie]
similar_movie = als_model.similar_items(movie_id, N = 15)
similar_movie

[(99, 0.6428484),
 (3517, 0.15954457),
 (3466, 0.15657629),
 (3480, 0.15597984),
 (3614, 0.15596683),
 (3534, 0.15568951),
 (3565, 0.15515502),
 (3420, 0.15512161),
 (3523, 0.1535697),
 (3604, 0.15333734),
 (3550, 0.15301484),
 (3374, 0.1527859),
 (3603, 0.15278156),
 (3600, 0.1524709),
 (3414, 0.15217182)]

In [29]:
idx_to_movies = {v:k for k,v in movies_to_idx.items()}
[idx_to_movies[i[0]] for i in similar_movie]

[2858,
 1787,
 989,
 3890,
 2438,
 3779,
 1316,
 2591,
 1886,
 2576,
 3876,
 643,
 1145,
 3748,
 1098]

In [30]:
a = [idx_to_movies[i[0]] for i in similar_movie]
for i in a:
    print(id_to_title(i))

american beauty (1999)
paralyzing fear: the story of polio in america, a (1998)
schlafes bruder (brother of sleep) (1995)
back stage (2000)
outside ozona (1998)
project moon base (1953)
anna (1996)
jeanne and the perfect guy (jeanne et le gar�on formidable) (1998)
i got the hook up (1998)
love, etc. (1996)
jerry & tom (1998)
peanuts - die bank zahlt alles (1996)
snowriders (1996)
match, the (1999)
search for one-eye jimmy, the (1996)


### 8. 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [31]:
user = user_to_idx['minje']
movies_recommended = als_model.recommend(user, csr_data, N = 20, filter_already_liked_items = True)
movies_recommended

[(1480, 0.082495905),
 (450, 0.06760685),
 (207, 0.06286987),
 (990, 0.057500787),
 (1487, 0.056678206),
 (1956, 0.05510632),
 (2037, 0.054605078),
 (1560, 0.05411025),
 (1731, 0.053904645),
 (2569, 0.051092092),
 (901, 0.05106592),
 (539, 0.050467357),
 (1821, 0.05006475),
 (61, 0.049708955),
 (1680, 0.048981413),
 (872, 0.048933893),
 (1391, 0.04891046),
 (1016, 0.048718233),
 (2290, 0.04842683),
 (2027, 0.048350133)]

In [32]:
a = [idx_to_movies[i[0]] for i in movies_recommended]
for i in a:
    print(id_to_title(i))

slums of beverly hills, the (1998)
life is beautiful (la vita � bella) (1997)
citizen ruth (1996)
american werewolf in london, an (1981)
take the money and run (1969)
restoration (1995)
kicking and screaming (1995)
dead men don't wear plaid (1982)
picnic at hanging rock (1975)
bastard out of carolina (1996)
center stage (2000)
mystery science theater 3000: the movie (1996)
my science project (1985)
rebel without a cause (1955)
cemetery man (dellamorte dellamore) (1994)
it came from outer space (1953)
hairspray (1988)
halloween (1978)
ninth gate, the (2000)
croupier (1998)


우와!!!!!!!!!!!먼가 중간중간 우여곡절이 많았지만 그래도 정말 오랜만에 하나를 끝냈다. 


als 모델이 어떻게 사용되는지 암묵적인 추천에 모델과 학습법  

다량의 mf매트릭스일 때 빈값의 존재로 연산이 어려우니 csr로 기존데이터에서 추출해오는법 

for문의 다양성 a = [idx_to_movies[i[0]] for i in movies_recommended] 이렇게 생긴 for문을 보다니..하하하하하하하하하 



아직 어떻게해서 이 모델이 나에게 이런 추천을 해주는지는 잘 모른다. 

하지만 내 수준에서는 index할때 되도록이면 기존 이름으로 꼭꼭 챙겨와야 에러가 안나며 count라는 col을 찾고 싶은데 그 안에 col이 있다면 ''처리로 찾을 수 있다 것 등등..

오늘 또 무언가를 배운 기분이다.근데 넘나 피곤하다..ㅠㅠ

행복하다ㅎㅎㅎ