### MovieLens 웹 사이트에서 축적된 영화 추천 데이터
#### https://grouplens.org/datasets/movielens/

## 1. 추천시스템 맛보기 (A Simple Recommender System)
- 높은 평점순으로 영화를 추천하는 시스템

In [1]:
# pandas와 numpy 모듈 불러오기
import pandas
import numpy

In [2]:
# 컬럼명 정의
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
item_cols = ['movie id', 'movie title', 'release date', 'video release date',
            'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
            'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
            'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
rating_cols = ['user id', 'movie id', 'rating', 'timestamp']

In [3]:
# pandas의 read_csv() 함수를 활용해 각 데이터셋을 불러온다
users = pandas.read_csv('ml-100k/u.user', sep = '|', names = user_cols, encoding = 'latin-1')
items = pandas.read_csv('ml-100k/u.item', sep = '|', names = item_cols, encoding = 'latin-1')
ratings = pandas.read_csv('ml-100k/u.data', sep = '\t', names = rating_cols, encoding = 'latin-1')

In [4]:
# 데이터 형태 출력
print(users.shape) # 943 users in total
print(items.shape) # 1682 items (movies) in total
print(ratings.shape) # 100,000 ratings in total

(943, 5)
(1682, 24)
(100000, 4)


In [5]:
users.head() # users DataFrame의 첫 5행의 데이터

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
items.head() # items DataFrame의 첫 5행의 데이터

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
ratings.head() # ratings DataFrame의 첫 5행의 데이터

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 데이터 전처리(Data Preprocessing) 

In [8]:
# 데이터 병합 
data = pandas.merge(pandas.merge(items, ratings), users)

In [9]:
data.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Thriller,War,Western,user id,rating,timestamp,age,gender,occupation,zip code
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,308,4,887736532,60,M,retired,95076
1,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,308,5,887737890,60,M,retired,95076
2,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,1,0,0,308,4,887739608,60,M,retired,95076
3,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,308,4,887738847,60,M,retired,95076
4,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,308,5,887736696,60,M,retired,95076


In [10]:
# number of people who rated each movie / 영화 제목을 기준으로 grouping
ratings_total = data.groupby('movie title').size()

In [11]:
ratings_total.head()

movie title
'Til There Was You (1997)      9
1-900 (1994)                   5
101 Dalmatians (1996)        109
12 Angry Men (1957)          125
187 (1997)                    41
dtype: int64

In [12]:
# mean rating of each movie / 각 영화에 매겨진 회원들의 평점을 평균해 봄
ratings_mean = (data.groupby('movie title'))['movie title', 'rating'].mean()
ratings_mean.head()

Unnamed: 0_level_0,rating
movie title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [13]:
# 병합(merging)준비
ratings_total = pandas.DataFrame({'movie title': ratings_total.index, \
                                 'total ratings': ratings_total.values})
ratings_total.head()

Unnamed: 0,movie title,total ratings
0,'Til There Was You (1997),9
1,1-900 (1994),5
2,101 Dalmatians (1996),109
3,12 Angry Men (1957),125
4,187 (1997),41


In [14]:
ratings_mean['movie title'] = ratings_mean.index
ratings_mean.head()

Unnamed: 0_level_0,rating,movie title
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,'Til There Was You (1997)
1-900 (1994),2.6,1-900 (1994)
101 Dalmatians (1996),2.908257,101 Dalmatians (1996)
12 Angry Men (1957),4.344,12 Angry Men (1957)
187 (1997),3.02439,187 (1997)


In [15]:
# sort movies by the number of total ratings 
final = pandas.merge(ratings_mean, ratings_total).\
sort_values(by = 'total ratings', ascending = False)

final.head(10)

Unnamed: 0,rating,movie title,total ratings
1398,4.358491,Star Wars (1977),583
333,3.803536,Contact (1997),509
498,4.155512,Fargo (1996),508
1234,4.00789,Return of the Jedi (1983),507
860,3.156701,Liar Liar (1997),485
460,3.656965,"English Patient, The (1996)",481
1284,3.441423,Scream (1996),478
1523,3.878319,Toy Story (1995),452
32,3.63109,Air Force One (1997),431
744,3.438228,Independence Day (ID4) (1996),429


In [16]:
# 평균 평점을 기준으로 내림차순 정렬하여 높은 평점 300개의 영화를 추출하여 이 순서대로 영화를 추천함
recommendation = final[:300].sort_values(by = 'rating', ascending = False)
recommendation.head(10)

Unnamed: 0,rating,movie title,total ratings
1281,4.466443,Schindler's List (1993),298
1652,4.466102,"Wrong Trousers, The (1993)",118
273,4.45679,Casablanca (1942),243
1317,4.44523,"Shawshank Redemption, The (1994)",283
1215,4.38756,Rear Window (1954),209
1572,4.385768,"Usual Suspects, The (1995)",267
1398,4.358491,Star Wars (1977),583
3,4.344,12 Angry Men (1957),125
303,4.292929,Citizen Kane (1941),198
1507,4.292237,To Kill a Mockingbird (1962),219


# 2. 컨텐츠 기반 필터링 추천시스템
### (A Contents-based Filtering Recommender System)

- user가 높이 평가한 영화들과 유사한 특성을 갖는 영화를 추천하는 시스템

In [17]:
# pandas와 numpy 모듈 불러오기
import pandas
import numpy

In [18]:
# 컬럼명 정의
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
item_cols = ['movie id', 'movie title', 'release date', 'video release date',
            'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
            'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
            'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
rating_cols = ['user id', 'movie id', 'rating', 'timestamp']

In [19]:
# pandas의 read_csv() 함수를 활용해 각 데이터셋을 불러온다
users = pandas.read_csv('ml-100k/u.user', sep = '|', names = user_cols, encoding = 'latin-1')
items = pandas.read_csv('ml-100k/u.item', sep = '|', names = item_cols, encoding = 'latin-1')
ratings = pandas.read_csv('ml-100k/u.data', sep = '\t', names = rating_cols, encoding = 'latin-1')

### 데이터 전처리(Data Preprocessing) 

#### Train-Test Split
- 전체 데이터 셋을 학습데이터/검증데이터로 나눔
- 학습데이터 : User 1~942까지의 데이터(1~99832까지의 관측치)
- 검증데이터 : User 943의 데이터(99833~100000까지의 관측치)
- 1~942까지의 user 데이터를 통해 943번째 user의 선호 예측

In [20]:
ratings_train = (ratings.sort_values('user id'))[:99832]
ratings_test = (ratings.sort_values('user id'))[99832:]

In [21]:
print(ratings_train.shape)
print(ratings_test.shape)

(99832, 4)
(168, 4)


In [22]:
ratings_train.head() # 학습 데이터 출력

Unnamed: 0,user id,movie id,rating,timestamp
66567,1,55,5,875072688
62820,1,203,4,878542231
10207,1,183,5,875072262
9971,1,150,5,876892196
22496,1,68,4,875072688


In [23]:
ratings_test.head() # 검증 데이터 출력

Unnamed: 0,user id,movie id,rating,timestamp
70296,943,215,5,888639000
91841,943,132,3,888639093
91810,943,204,3,888639117
77956,943,94,4,888639929
87415,943,53,3,888640067


In [24]:
# User 943이 선호하는 영화 정렬 / rating을 기준으로 내림차순 정렬
ratings_test = ratings_test.sort_values(by=['rating'], ascending=False)
ratings_test.head()

Unnamed: 0,user id,movie id,rating,timestamp
70296,943,215,5,888639000
71726,943,471,5,875502042
70174,943,186,5,888639478
76855,943,943,5,888639614
79678,943,55,5,888639118


In [25]:
# Pandas의 .values를 사용해서 DataFrame을 Numpy의 array로 반환
ratings_train = ratings_train[['user id', 'movie id', 'rating']].values
ratings_test = ratings_test[['user id', 'movie id', 'rating']].values

In [26]:
type(ratings_train), ratings_train.shape # timestamp열을 drop했기에 shape이 하나 줄어듦

(numpy.ndarray, (99832, 3))

In [27]:
type(ratings_test), ratings_test.shape # timestamp열을 drop했기에 shape이 하나 줄어듦

(numpy.ndarray, (168, 3))

In [None]:
# 참고 위 25~27 코드와 동일한 결과
ratings_train = ratings_train.as_matrix(columns = ['user id', 'movie id', 'rating'])
ratings_test = ratings_test.as_matrix(columns = ['user id', 'movie id', 'rating'])

In [28]:
# 결과문 확인 : 선호하는 영화 vs 선호하지 않는 영화
print(ratings_test[0]) # 배열의 첫번째 요소 / movie id 215인 영화에 5점의 평점을 부여
print(ratings_test[-1]) # 배열의 마지막 요소 / movie id 941인 영화에 1점의 평점을 부여

[943 215   5]
[943 941   1]


In [29]:
# 추천하고 싶은 영화의 개수(임의)
num_recommendations = 5 

In [30]:
favorite_items = []
for i in range(num_recommendations):
    favorite_items.append(ratings_test[i][1])

In [31]:
favorite_items # user 943이 선호한 아이템 목록

[215, 471, 186, 943, 55]

### 유사도 (거리) / Similarity(distance)
- item-item similarity calculation (아이템 간 유사도 계산)
- 장르를 기준으로 하여 영화 간 유사도를 계산

In [32]:
# 두 영화 간의 거리를 계산하기 위한 함수를 정의
def EucledianDist(item1, item2):
    import math
    s = 0
    v1 = item1[5:]
    v2 = item2[5:]
    
    for i in range(len(v1)):
        # 각 원소를 서로 뺀 후에 제곱근 해 모두 더한다
        temp = (v1[i] - v2[i])*(v1[i] - v2[i])
        s += temp
    return math.sqrt(s)

In [33]:
# items 데이터프레임을 numPy 배열로 변환
items = items.values
items.shape # 1682:전체 아이템의 개수/ 24:한 아이템 당 특징의 개수

(1682, 24)

In [34]:
items[0]

array([1, 'Toy Story (1995)', '01-Jan-1995', nan,
       'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=object)

In [35]:
# 1682 x 1682 크기의 0으로 이루어진 행렬을 생성
distance_matrix = numpy.zeros((1682,1682))
distance_matrix.shape

(1682, 1682)

In [37]:
# 유사도 행렬의 각 원소 값 계산
for i in range(len(items)):
    for j in range(len(items)):
        # 영(zero)행렬의 각 원소에 각 열번호와 행번호의 영화 간의 거리로 계산해 담는다
        distance_matrix[i][j] = EucledianDist(items[i], items[j])

In [38]:
distance_matrix # 각 영화 간의 거리를 나타내는 2차원 배열(행렬)

array([[ 0.        ,  2.44948974,  2.        , ...,  2.23606798,
         1.41421356,  2.        ],
       [ 2.44948974,  0.        ,  1.41421356, ...,  2.23606798,
         2.        ,  2.        ],
       [ 2.        ,  1.41421356,  0.        , ...,  1.73205081,
         1.41421356,  1.41421356],
       ..., 
       [ 2.23606798,  2.23606798,  1.73205081, ...,  0.        ,
         1.73205081,  1.        ],
       [ 1.41421356,  2.        ,  1.41421356, ...,  1.73205081,
         0.        ,  1.41421356],
       [ 2.        ,  2.        ,  1.41421356, ...,  1.        ,
         1.41421356,  0.        ]])

### 추천하기(Recommendation)

In [39]:
# 추천 영화 후보군 추출 / user 943이 높게 평가한 영화들과 거리가 짧은 영화 목록 생성
recommendation_list = []

for item in favorite_items:
    # 각 영화와 가장 거리가 짧은 영화의 인덱스를 찾는다
    idx = numpy.argmin(distance_matrix[item -1])
    # 인덱스를 통해 그 영화의 이름을 찾아 리스트에 첨부
    recommendation_list.append(items[idx][1])
    print(idx)

5
9
185
2
54


In [40]:
# 영화 추천하기 / 추천 영화를 찾아 출력
for movie in recommendation_list:
    print(movie)

Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)
Richard III (1995)
Blues Brothers, The (1980)
Four Rooms (1995)
Professional, The (1994)


# 3. 협업필터링 추천시스템
### (A Collavorative Filtering Recommender System)

- 참고 
https://acodeforthought.wordpress.com/2016/12/29/building-a-recommender-system-on-user-user-collaborative-filtering-movielens-dataset/

In [41]:
# pandas와 numpy 모듈 불러오기
import pandas
import numpy

In [42]:
# 컬럼명 정의
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']
item_cols = ['movie id', 'movie title', 'release date', 'video release date',
            'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
            'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama',
            'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
            'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
rating_cols = ['user id', 'movie id', 'rating', 'timestamp']

In [43]:
# pandas의 read_csv() 함수를 활용해 각 데이터셋을 불러온다
users = pandas.read_csv('ml-100k/u.user', sep = '|', names = user_cols, encoding = 'latin-1')
items = pandas.read_csv('ml-100k/u.item', sep = '|', names = item_cols, encoding = 'latin-1')
ratings = pandas.read_csv('ml-100k/u.data', sep = '\t', names = rating_cols, encoding = 'latin-1')

In [44]:
ratings_train = (ratings.sort_values('user id'))[:99832]
ratings_test = (ratings.sort_values('user id'))[99832:]

In [45]:
ratings_train.head()

Unnamed: 0,user id,movie id,rating,timestamp
66567,1,55,5,875072688
62820,1,203,4,878542231
10207,1,183,5,875072262
9971,1,150,5,876892196
22496,1,68,4,875072688


In [46]:
ratings_test.head()

Unnamed: 0,user id,movie id,rating,timestamp
70296,943,215,5,888639000
91841,943,132,3,888639093
91810,943,204,3,888639117
77956,943,94,4,888639929
87415,943,53,3,888640067


In [47]:
ratings_train = ratings_train.as_matrix(columns = ['user id', 'movie id', 'rating'])
ratings_test = ratings_test.as_matrix(columns = ['user id', 'movie id', 'rating'])

- 각 user의 rating 정보를 담은 리스트를 생성한다
- 결과물이 user_list 리스트는 1부터 942까지의 각 userrㅏ 어떠한 영화에 대해 rating을 어떻게 했는지에 관한 정보를 담고 있다.

In [50]:
users_list = []
for i in range(1,943):
    temp = []
    for j in range(0, len(ratings_train)): # range(len(ratings_train)):
        if ratings_train[j][0] == i:
            temp.append(ratings_train[j])
        else:
            break
    
    ratings_train = ratings_train[j:]
    users_list.append(temp)
    
print(len(users_list))

942


In [51]:
print(users_list[0][0]) 

[ 1 55  5]


### Similarity calculation
- train user(1-942)와 test user(943) 간의 유사도를 계산한다
- user 943과 높은 유사도를 가진 user를 추려낸다
- 추려낸 user 들이 높게 rating을 매긴 영화(item)들을 추천한다

### EuclideanScore() function
- 두 user 간의 유사도를 계산할 수 있는 EuclideanScore() 함수를 정의한다
- 두 user 간에 4회 이상 동일한 영화를 rating한 경우에만 거리를 계산한다
- 두 user 간에 4회 이하로 동일한 영화를 rating한 경우에는 거리를 1,000,000(임의의 큰 정수)로 정의한다

In [58]:
def EucledianScore(train_user, test_user):
    import math
    s = 0
    count = 0
    for i in test_user:
        score = 0
        for j in train_user:
            if(int(i[1]) == int(j[1])):
                score= ((float(i[2])-float(j[2]))*(float(i[2])-float(j[2])))
                count= count +1
            s = s + score
    if(count<4):
        s = 1000000
    return(math.sqrt(s))

### calculate Euclidean Scores

In [59]:
score_list = []
for i in range(942):
    score_list.append([i+1,EucledianScore(users_list[i], ratings_test)])

In [60]:
print(score_list[0])

[1, 149.80654191322887]


해석: 회원 1은 회원 943과 149.80

In [62]:
score = pandas.DataFrame(score_list, columns = ['user id','Eucledian Score'])
score = score.sort_values(by = 'Eucledian Score')
print(score.shape)

(942, 2)


In [63]:
print(score.head()) # 유사도가 가장 높은 5명을 출력
print()
print(score.tail()) # 유사도가 가장 낮은 5명을 출력

     user id  Eucledian Score
309      310         1.732051
138      139         3.872983
45        46         4.000000
208      209         4.242641
557      558         4.582576

     user id  Eucledian Score
530      531           1000.0
219      220           1000.0
528      529           1000.0
519      520           1000.0
627      628           1000.0


In [64]:
# pandas dataframe을 numpy array로 변환한다
score_matrix = score.as_matrix()

In [65]:
# user 310이 평가한 모든 영화 정보를 담기위한 full_list와
# user 943과 user 310이 공히 평가한 영화 정보를 담기 위한 common_list 생성
user = int(score_matrix[0][0])
common_list = []
full_list = []

In [66]:
print(user)

310


In [67]:
# common_list와 full_list를 채워넣는다
for i in ratings_test:
    for j in users_list[user-1]:
        if int(i[1]) == int(j[1]):
            common_list.append(int(j[1]))
        full_list.append(j[1])

In [68]:
# 각 리스트를 집합으로 변환한다
common_list = set(common_list)
full_list = set(full_list)

In [69]:
# 추천 영화를 추려내기 위해 user 310이 평가한 영화 중에 user 943이 이미 본 영화를 배제한다.
# recommendation = full_list - common_list
recommendation = full_list.difference(common_list)

In [70]:
print(recommendation)

{257, 258, 14, 275, 536, 294, 304, 832, 845, 222, 740, 1386, 748, 116, 1142, 251, 1022}


In [74]:
merged = pandas.merge(items, ratings).sort_values(by = 'movie id')
grouped = merged.groupby('movie id')
item_list = grouped['movie id', 'movie title', 'rating']
item_list = item_list.mean() # 여러 user의 특정 영화 rating의 평균을 계산
item_list = item_list.sort_values(by = ['movie id']) ###삽입
print(item_list.head())

                   movie id    rating
movie title                          
Toy Story (1995)        1.0  3.878319
GoldenEye (1995)        2.0  3.206107
Four Rooms (1995)       3.0  3.033333
Get Shorty (1995)       4.0  3.550239
Copycat (1995)          5.0  3.302326


In [None]:
a =
1682 / title 

In [None]:
impror


In [76]:
item_list['movie title'] = item_list.index
print(item_list.head())

                   movie id    rating        movie title
movie title                                             
Toy Story (1995)        1.0  3.878319   Toy Story (1995)
GoldenEye (1995)        2.0  3.206107   GoldenEye (1995)
Four Rooms (1995)       3.0  3.033333  Four Rooms (1995)
Get Shorty (1995)       4.0  3.550239  Get Shorty (1995)
Copycat (1995)          5.0  3.302326     Copycat (1995)


In [77]:
grouped.head(10)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user id,rating,timestamp
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,308,4,887736532
308,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,303,5,879466966
307,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,886,4,876031433
306,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,789,3,880332089
305,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,295,4,879517580
304,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,138,4,879023031
303,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,216,4,880232615
302,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,921,3,879379601
301,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,883,3,891914583
300,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,438,4,879868096


In [78]:
item_list = item_list.as_matrix()
print(item_list.shape)
print(item_list[0]) # 배열의 첫 번쨰 요소를 출력해 본다

(1664, 3)
[1.0 3.8783185840707963 'Toy Story (1995)']


In [80]:
recommendation_list = []
for i in recommendation:
    item = item_list[i-1]
    print(item)
    recommendation_list.append(item)

[258.0 3.8035363457760316 'Contact (1997)']
[259.0 2.685185185185185 'George of the Jungle (1997)']
[14.0 3.9672131147540983 'Postino, Il (1994)']
[277.0 3.464788732394366 'Restoration (1995)']
[540.0 2.511627906976744 'Money Train (1995)']
[296.0 3.3333333333333335 'Promesse, La (1996)']
[310.0 3.6 'Rainmaker, The (1997)']
[839.0 2.5 'Loch Ness (1995)']
[852.0 1.0 'Bloody Child, The (1996)']
[222.0 3.66027397260274 'Star Trek: First Contact (1996)']
[747.0 3.4607843137254903 'Benny & Joon (1993)']
[1398.0 4.5 'Anna (1996)']
[755.0 3.3125 'Jumanji (1995)']
[116.0 3.824 'Cold Comfort Farm (1995)']
[1152.0 3.4285714285714284 'In Love and War (1996)']
[252.0 2.9430379746835444 'Lost World: Jurassic Park, The (1997)']
[1032.0 2.875 'Little Big League (1994)']


In [81]:
recommendations = pandas.DataFrame(recommendation_list, columns = \
                ['movie id', 'mean rating', 'movie title'])

In [83]:
# 평균 평점(mean rating)이 높은 순으로 출력한다
recommendations = recommendations.sort_values(by = 'mean rating', ascending = False)

In [84]:
print(recommendations[['mean rating', 'movie title']])

    mean rating                            movie title
11     4.500000                            Anna (1996)
2      3.967213                     Postino, Il (1994)
13     3.824000               Cold Comfort Farm (1995)
0      3.803536                         Contact (1997)
9      3.660274        Star Trek: First Contact (1996)
6      3.600000                  Rainmaker, The (1997)
3      3.464789                     Restoration (1995)
10     3.460784                    Benny & Joon (1993)
14     3.428571                 In Love and War (1996)
5      3.333333                    Promesse, La (1996)
12     3.312500                         Jumanji (1995)
15     2.943038  Lost World: Jurassic Park, The (1997)
16     2.875000               Little Big League (1994)
1      2.685185            George of the Jungle (1997)
4      2.511628                     Money Train (1995)
7      2.500000                       Loch Ness (1995)
8      1.000000               Bloody Child, The (1996)
