In [2]:
# page 43

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

mpl.rc('font', family='NanumGothic')
mpl.rc('axes', unicode_minus=False)

sns.set(font='NanumGothic', rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc("figure", figsize=(10,8))

warnings.filterwarnings("ignore")


In [3]:
# page44

movies = pd.read_csv('tmdb_5000_movies.csv')
print(movies.head(1))
print("shape=",movies.shape)



      budget  \
0  237000000   

                                                                            genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                                                          keywords  \
0  [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id...   

  original_language original_title  \
0                en         Avatar   

                                                                          overview  \
0  In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora o...   

   popularity  \
0  150.437577   

                                                              production_companies  \
0  [{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century...   

                                                              production_countri

In [5]:
# page45

col_lst = ['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']
movies_df = movies[col_lst]



In [6]:
# page46


pd.set_option('max_colwidth', 80)
print(movies_df[['genres','keywords']][:1])


                                                                            genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "...   

                                                                          keywords  
0  [{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id...  


In [7]:
# page47

from ast import literal_eval

print("literal_eval=",movies_df['genres'].apply(literal_eval)[0])


literal_eval= [{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 878, 'name': 'Science Fiction'}]


In [8]:
# page48

movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

movies_df['genres'] = movies_df['genres'].apply(lambda x : [dic['name'] for dic in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [dic['name'] for dic in x])

print("lambda=",movies_df[['genres','keywords']][:1])


lambda=                                           genres  \
0  [Action, Adventure, Fantasy, Science Fiction]   

                                                                          keywords  
0  [culture clash, future, space war, space colony, society, space travel, futu...  


In [9]:
# page49

from sklearn.feature_extraction.text import CountVectorizer

movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))

count_vect = CountVectorizer(min_df=0.01, ngram_range=(1,2)) # min_df=0 으로 할수 없습니다.
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])

print(genre_mat.shape)



(4803, 60)


In [10]:
# page50

from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim[0])


[1.         0.63245553 0.53033009 ... 0.         0.         0.        ]


In [11]:
# page51

def find_sim_movie(df, sim_matrix, title_name, top_n=10):
  title_movie = df[df['title'] == title_name]
  title_index = title_movie.index.values

  df['similarity'] = sim_matrix[title_index, :].reshape(-1,1)

  temp = df.sort_values(by="similarity", ascending=False)
  final_index = temp.index.values[:top_n]

  return df.iloc[final_index]


In [12]:
# page52

similar_movies = find_sim_movie(movies_df, genre_sim, 'The Godfather', 10)
print(similar_movies[['title','vote_average','similarity']])

                            title  vote_average  similarity
1149              American Hustle           6.8         1.0
3594              Spring Breakers           5.0         1.0
3337                The Godfather           8.4         1.0
2582   The Place Beyond the Pines           6.8         1.0
1663  Once Upon a Time in America           8.2         1.0
1847                   GoodFellas           8.2         1.0
892                        Casino           7.8         1.0
2731       The Godfather: Part II           8.3         1.0
4065                   Mi America           0.0         1.0
3112      Blood Done Sign My Name           6.0         1.0


In [None]:
# page53

print(movies_df[['title','vote_average','vote_count']].sort_values('vote_average', ascending=False)[:10])

                         title  vote_average  vote_count
4662            Little Big Top          10.0           1
3519          Stiff Upper Lips          10.0           1
4045     Dancer, Texas Pop. 81          10.0           1
4247     Me You and Five Bucks          10.0           2
3992                 Sardaarji           9.5           2
2386            One Man's Hero           9.3           2
1881  The Shawshank Redemption           8.5        8205
2970        There Goes My Baby           8.5           2
3337             The Godfather           8.4        5893
2796     The Prisoner of Zenda           8.4          11


In [13]:
# page54~55

percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)
C = movies_df['vote_average'].mean()

def weighted_vote_average(record):
  v = record['vote_count']
  R = record['vote_average']

  return ((v/(v+m))*R) + ((m/(m+v))*C)

movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis=1)

print(movies_df[['title','vote_average','vote_count','weighted_vote']].sort_values('weighted_vote', ascending=False)[:10])

                         title  vote_average  vote_count  weighted_vote
1881  The Shawshank Redemption           8.5        8205       8.396052
3337             The Godfather           8.4        5893       8.263591
662                 Fight Club           8.3        9413       8.216455
3232              Pulp Fiction           8.3        8428       8.207102
65             The Dark Knight           8.2       12002       8.136930
1818          Schindler's List           8.3        4329       8.126069
3865                  Whiplash           8.3        4254       8.123248
809               Forrest Gump           8.2        7927       8.105954
2294             Spirited Away           8.3        3840       8.105867
2731    The Godfather: Part II           8.3        3338       8.079586


In [14]:
# page56

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

mpl.rc('font', family='NanumGothic')
mpl.rc('axes', unicode_minus=False)

sns.set(font='NanumGothic', rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc("figure", figsize=(10,8))

warnings.filterwarnings("ignore")

In [16]:
# page60

R = np.array([[4, np.nan, np.nan, 2, np.nan],
              [np.nan, 5, np.nan, 3, 1],
              [np.nan, np.nan, 3, 4, 4],
              [5,2,1,2,np.nan]])

print("org R", R.shape)


org R (4, 5)


In [18]:
# page61

K=3
num_users, num_items = R.shape

np.random.seed(1)
P = np.random.normal(scale=1./K, size=(num_users, K))
Q = np.random.normal(scale=1./K, size=(num_items, K))

print("org P", P.shape)
print("org Q", Q.shape)

org P (4, 3)
org Q (5, 3)


In [19]:
# page62

from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, not_nan_index):
  # 예측 행렬 전체를 계산
  full_pred_matrix = P @ Q.T

  # R에서 NaN이 아닌 실제 값들만 추출
  R_not_null = R[not_nan_index]
  # full_pred_matrix에서도 R_not_null과 동일한 위치의 예측 값들만 추출
  full_pred_matrix_not_null = full_pred_matrix[not_nan_index]

  mse = mean_squared_error(R_not_null, full_pred_matrix_not_null)
  rmse = np.sqrt(mse)

  return rmse



In [23]:
# page63

not_nan_index = np.where(np.isnan(R) == False)

steps = 1000
learning_rate = 0.01
r_lambda = 0.01

for step in range(steps):
  for u, i, r in zip(not_nan_index[0], not_nan_index[1], R[not_nan_index]):
    r_hat_ui = P[u,:] @ Q[i,:].T
    e_ui = r - r_hat_ui

    P[u,:] = P[u,:] + learning_rate*(e_ui * Q[i,:] - r_lambda*P[u,:])
    Q[i,:] = Q[i,:] + learning_rate*(e_ui * P[u,:] - r_lambda*Q[i,:])

  rmse = get_rmse(R, P, Q, not_nan_index)

  if ( (step+1) % 50 ) == 0:
      print("### iteration step:", step+1, ", rmse:", np.round(rmse,4))

### iteration step: 50 , rmse: 0.015
### iteration step: 100 , rmse: 0.015
### iteration step: 150 , rmse: 0.015
### iteration step: 200 , rmse: 0.0149
### iteration step: 250 , rmse: 0.0149
### iteration step: 300 , rmse: 0.0149
### iteration step: 350 , rmse: 0.0149
### iteration step: 400 , rmse: 0.0149
### iteration step: 450 , rmse: 0.0148
### iteration step: 500 , rmse: 0.0148
### iteration step: 550 , rmse: 0.0148
### iteration step: 600 , rmse: 0.0148
### iteration step: 650 , rmse: 0.0148
### iteration step: 700 , rmse: 0.0147
### iteration step: 750 , rmse: 0.0147
### iteration step: 800 , rmse: 0.0147
### iteration step: 850 , rmse: 0.0147
### iteration step: 900 , rmse: 0.0147
### iteration step: 950 , rmse: 0.0146
### iteration step: 1000 , rmse: 0.0146


In [25]:
# page65

pred_matrix = P @ Q.T
print("predict matrix:")
print(np.round(pred_matrix, 3))

print("-"*35)

print("real matrix:")
print(R)

predict matrix:
[[3.991 1.54  1.194 1.999 1.623]
 [5.562 4.977 0.858 2.985 1.004]
 [5.93  1.514 2.987 3.979 3.985]
 [4.971 2.004 1.005 2.007 1.418]]
-----------------------------------
real matrix:
[[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]


In [26]:
# page66

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
  num_users, num_items = R.shape

  np.random.seed(1)
  P = np.random.normal(scale=1./K, size=(num_users, K))
  Q = np.random.normal(scale=1./K, size=(num_items, K))

  break_count = 0

  not_nan_index = np.where(np.isnan(R) == False)

  for step in range(steps):
    for u, i, r in zip(not_nan_index[0], not_nan_index[1], R[not_nan_index]):
      r_hat_ui = P[u,:] @ Q[i,:].T
      e_ui = r - r_hat_ui

      P[u,:] = P[u,:] + learning_rate*(e_ui * Q[i,:] - r_lambda*P[u,:])
      Q[i,:] = Q[i,:] + learning_rate*(e_ui * P[u,:] - r_lambda*Q[i,:])

    rmse = get_rmse(R, P, Q, not_nan_index)

    if ( (step+1) % 50 ) == 0:
        print("### iteration step:", step+1, ", rmse:", np.round(rmse,4))

  return P, Q

In [31]:
# page67

movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

#print("movies=\n", movies.head(3))
#print("ratings=\n", ratings.head(3))

rating_movies = pd.merge(ratings, movies, on="movieId")

rating_matrix = rating_movies.pivot_table("rating", "userId", "title")

print(rating_matrix.head(3))

title   "Great Performances" Cats (1998)  $9.99 (2008)  \
userId                                                   
1                                    NaN           NaN   
2                                    NaN           NaN   
3                                    NaN           NaN   

title   'Hellboy': The Seeds of Creation (2004)  \
userId                                            
1                                           NaN   
2                                           NaN   
3                                           NaN   

title   'Neath the Arizona Skies (1934)  'Round Midnight (1986)  \
userId                                                            
1                                   NaN                     NaN   
2                                   NaN                     NaN   
3                                   NaN                     NaN   

title   'Salem's Lot (2004)  'Til There Was You (1997)  'burbs, The (1989)  \
userId                                 

"\n\nratings = ratings[['userId', 'movieId', 'rating']]\nratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')\n##ratings_matrix.head(3)\nprint(rating_matrix.head(3))\n"

In [32]:
# page68

P, Q = matrix_factorization(rating_matrix.values, K=50, steps=200)

pred_matrix = P @ Q.T

#print("pred=\n", pred_matrix.head(3))

### iteration step: 50 , rmse: 0.2584
### iteration step: 100 , rmse: 0.1779
### iteration step: 150 , rmse: 0.1569
### iteration step: 200 , rmse: 0.1474


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [55]:
# page69

rating_pred_matrix = pd.DataFrame(data=pred_matrix, index=rating_matrix.index, columns=rating_matrix.columns)

print("rating_pred_matrix=\n", rating_pred_matrix.head(3))

rating_pred_matrix=
 title   "Great Performances" Cats (1998)  $9.99 (2008)  \
userId                                                   
1                               1.641676      2.889387   
2                               2.164730      3.624265   
3                               2.142794      3.537876   

title   'Hellboy': The Seeds of Creation (2004)  \
userId                                            
1                                      1.099882   
2                                      1.494340   
3                                      1.403330   

title   'Neath the Arizona Skies (1934)  'Round Midnight (1986)  \
userId                                                            
1                              0.295160                1.345731   
2                              0.269798                1.933644   
3                              0.337382                1.591148   

title   'Salem's Lot (2004)  'Til There Was You (1997)  'burbs, The (1989)  \
userId            

In [51]:
# page70

def get_unseen_movies(rating_matrix, userId):
  user_rating = rating_matrix.loc[userId,:]

  unseen_movie_list = user_rating[user_rating.isnull()].index.tolist()

  movies_list = rating_matrix.columns.tolist()

  unseen_list = [movie for movie in movies_list if movie in unseen_movie_list]

  return unseen_list

def recomm_movie_by_userId(pred_df, userId, unseen_list, top_n=10):
  recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]

  return recomm_movies


In [54]:
# page71

#print("rating_matrix=\n", rating_matrix.index.dtype)

unseen_list = get_unseen_movies(rating_matrix, 9)

recomm_movies = recomm_movie_by_userId(rating_pred_matrix, 9, unseen_list, top_n=10)

recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])
print(recomm_movies)

                                                      pred_score
title                                                           
Lives of Others, The (Das leben der Anderen) (2006)     5.284725
Bowling for Columbine (2002)                            5.273318
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)    5.217321
Dancer in the Dark (2000)                               5.179148
In the Line of Fire (1993)                              5.174083
Run Lola Run (Lola rennt) (1998)                        5.154215
WALL·E (2008)                                           5.110012
Searching for Bobby Fischer (1993)                      5.039262
Finding Nemo (2003)                                     5.022706
Boys Don't Cry (1999)                                   5.015613
