In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read rating data
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', names=r_cols,  sep='\t',encoding='latin-1')

In [3]:
#Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')

In [4]:
#Load the u.item file into a dataframe
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 
          'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')

In [5]:
# 모든 유저간 나이차 계산 full matrix 
user_age = users.pivot_table(values='age', index='user_id')
age_diff = np.zeros([len(user_age) , len(user_age)])

for i in user_age.index:
    age_diff[i-1] = abs(user_age['age'].values - user_age.loc[i].values)
    
age_diff = pd.DataFrame(age_diff , columns=user_age.index , index=user_age.index)

In [6]:
# Rating 데이터를 test, train으로 나누고 train을 full matrix로 변환
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=20)
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')
time_matrix = x_train.pivot_table(values='timestamp', index='user_id', columns='movie_id')

In [7]:
# RMSE 계산을 위한 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

def score2(cf_model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([cf_model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

In [8]:
# 모든 가능한 사용자 pair의 Cosine similarities 계산
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

In [9]:
# 모든 user의 rating 평균 계산 
rating_mean = rating_matrix.mean(axis=1)

In [10]:
def ubcf_sig_weighting(user_id, movie_id, neighbor_size=0):
    # 현 user의 평균 가져오기
    user_mean = rating_mean[user_id]
    if movie_id in rating_matrix:
        # 현 user와 다른 사용자 간의 유사도 가져오기
        sim_scores = user_similarity[user_id]
        # 현 user와 다른 사용자 간의 time gap 가져오기
        t_gap = time_gap[user_id]
        # 현 movie의 rating 가져오기. 즉, rating_matrix의 열(크기: 943)을 추출
        movie_ratings = rating_matrix[movie_id]
        # 모든 사용자의 rating 평균 가져오기
        others_mean = rating_mean
        # 현 user와 다른 사용자 간의 공통 rating개수 가져오기
        common_counts = sig_counts[user_id]
        # 현 movie에 대한 rating이 없는 user 선택
        no_rating = movie_ratings.isnull()
        # 공통으로 평가한 영화의 수가 SIG_LEVEL보다 낮은 사람 선택
        low_significance = common_counts < SIG_LEVEL
        # 영화의 평가시점이 너무 먼 사람을 선택
        too_far = t_gap > TIME_GAP
        # 나이차가 50 이상 나는 사람 선택
        age_gap = age_diff[user_id]
        too_much = age_gap >= 50
        
        # 평가를 안 하였거나, SIG_LEVEL, 평가시점이 기준 이하, 나이차가 50 이상 나는  user 제거
        none_rating_idx = movie_ratings[no_rating | low_significance | too_far|too_much].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        others_mean = others_mean.drop(none_rating_idx)
        if len(movie_ratings) > MIN_RATINGS:    # 충분한 rating이 있는지 확인
            if neighbor_size == 0:              # Neighbor size가 지정되지 않은 경우
                # 편차로 예측치 계산
                movie_ratings = movie_ratings - others_mean
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction + user_mean
            else:                               # Neighbor size가 지정된 경우
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                others_mean = np.array(others_mean)
                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                # 유사도, rating, 평균값을 neighbor size만큼 받기 
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                others_mean = others_mean[user_idx][-neighbor_size:]
                # 편차로 예측치 계산
                movie_ratings = movie_ratings - others_mean
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction + user_mean
        else:
            prediction = user_mean
    else:
        prediction = user_mean
    return prediction

In [11]:
# 각 사용자 쌍의 공통 rating 수(significance level)를 집계하기 위한 함수

def count_num():       # matrix 연산 이용
    # 각 user의 rating 영화를 1로 표시
    global rating_matrix
    rating_flag1 = np.array((rating_matrix > 0).astype(float))
    rating_flag2 = rating_flag1.T
    # 사용자별 공통 rating 수 계산
    counts = np.dot(rating_flag1, rating_flag2)
    return counts

In [12]:
def time_gap_calc():
    global time_matrix
    tg_matrix = np.array(time_matrix)

    return np.nanmean(np.abs(tg_matrix[np.newaxis,:,:] - tg_matrix[:,np.newaxis,:]), axis=2)

In [13]:
sig_counts = count_num()
sig_counts = pd.DataFrame(sig_counts, index=rating_matrix.index, columns=rating_matrix.index)

time_gap = time_gap_calc()
time_gap = pd.DataFrame(time_gap, index=time_matrix.index, columns=time_matrix.index).fillna(0)

SIG_LEVEL = 4       # minimum significance level 지정. 공통적으로 평가한 영화의 수
MIN_RATINGS = 2     # 예측치 계산에 사용할 minimum rating 수 지정
TIME_GAP = 16000000 # 평가한 시점이 얼마 이상 차이가 날때 제외할지에 대한 기준

score2(ubcf_sig_weighting, 37)

  """


0.9392773996674373

In [16]:
def calc_all():
    x = ratings.copy()
    y = ratings['user_id']
    
    # 기존변수 초기화
    global rating_matrix, time_matrix, matrix_dummy, user_similarity, rating_mean, sig_counts, time_gap
    
    rating_matrix = x.pivot_table(values='rating', index='user_id', columns='movie_id')
    time_matrix = x.pivot_table(values='timestamp', index='user_id', columns='movie_id')
    
    matrix_dummy = rating_matrix.copy().fillna(0)
    user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
    user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
    
    rating_mean = rating_matrix.mean(axis=1)
    
    sig_counts = count_num()
    sig_counts = pd.DataFrame(sig_counts, index=rating_matrix.index, columns=rating_matrix.index)
    
    time_gap = time_gap_calc()
    time_gap = pd.DataFrame(time_gap, index=time_matrix.index, columns=time_matrix.index).fillna(0)
    
    # 나이차 계산
    user_age = users.pivot_table(values='age', index='user_id')
    age_diff = np.zeros([len(user_age) , len(user_age)])

    for i in user_age.index:
        age_diff[i-1] = abs(user_age['age'].values - user_age.loc[i].values)
    age_diff = pd.DataFrame(age_diff , columns=user_age.index , index=user_age.index)

In [17]:
calc_all()

  """


In [18]:
def recommend_movie(user_id, neighbor_size=0):
    
    # movie_id와 predicted_rating을 컬럼으로 갖는 빈 데이터 프레임 생성
    predictions = pd.DataFrame( columns=['movie_id', 'predicted_rating'])
    predictions['movie_id'] = movies['movie_id']
    
    # 모든 영화에 대한 해당 유저의 predicted_rating 값을 계산
    rating_pred = []
    for i in movies['movie_id']:
        pred = ubcf_sig_weighting(user_id, i, neighbor_size=0)
        rating_pred.append(pred)
    predictions['predicted_rating'] = rating_pred
    
    # predicted_rating을 내림차순 정렬하여 상위 5개 선택
    top5_result = predictions.sort_values(by='predicted_rating' , ascending=False).head(5)
    
    recommended_movies = pd.merge(top5_result, movies[['movie_id', 'title']] , how='left', on='movie_id')['title'].values
    
    print("[영화 추천 리스트]")
    print("1. {}".format(recommended_movies[0]))
    print("2. {}".format(recommended_movies[1]))
    print("3. {}".format(recommended_movies[2]))
    print("4. {}".format(recommended_movies[3]))
    print("5. {}".format(recommended_movies[4]))

In [19]:
recommend_movie(1)

[영화 추천 리스트]
1. Boys, Les (1997)
2. Star Kid (1997)
3. Faust (1994)
4. Pather Panchali (1955)
5. Prefontaine (1997)


In [20]:
recommend_movie(2, 10)

[영화 추천 리스트]
1. Two or Three Things I Know About Her (1966)
2. Pather Panchali (1955)
3. Prefontaine (1997)
4. Maya Lin: A Strong Clear Vision (1994)
5. Close Shave, A (1995)
