In [1]:
import numpy as np
import pandas as pd

In [2]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('../dataset//u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../dataset//u.data', sep='\t', names=r_cols, encoding='latin-1')

In [5]:
raings = ratings.drop('timestamp', axis=1)

In [6]:
movies = movies[['movie_id','title']]

In [7]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [55]:
from sklearn.model_selection import train_test_split
x = ratings.copy()
y= ratings['user_id']
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,stratify=y)

In [56]:
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [57]:
def score(model,neighbor_size=0):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie,neighbor_size) for (user,movie) in id_pairs ])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true,y_pred)


In [58]:
rating_matrix = x_train.pivot(index='user_id',columns='movie_id',values='rating')


In [59]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,index=rating_matrix.index, columns=rating_matrix.index)

In [68]:
# train 데이터의 userdml rating 평균과 영화의 평점편차 계산
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T -rating_mean).T


In [69]:
# 사용자별 공통 평가 수 계산
rating_binary1 = np.array((rating_matrix>0).astype(float))
rating_binary2 = rating_binary1.T
counts = np.dot(rating_binary1,rating_binary2)
counts = pd.DataFrame(counts, index = rating_matrix.index,columns=rating_matrix.index).fillna(0)

In [74]:
rating_bias

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,-0.587156,0.412844,-0.587156,,,0.412844,-2.587156,,-0.587156,...,,,,,,,,,,
2,0.380000,,,,,,,,,-1.620000,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.092857,0.092857,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.769231,,...,,,,,,,,,,
940,,,,-1.418605,,,0.581395,1.581395,-0.418605,,...,,,,,,,,,,
941,1.055556,,,,,,0.055556,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [85]:
def CF_knn_bias_sig(user_id,movie_id,neighbor_size = 0):
    if movie_id in rating_bias:
        sim_scores = user_similarity[user_id]
        movie_ratings = rating_bias[movie_id]
        no_rating = movie_ratings.isnull()
        common_counts = counts[user_id]
        low_significance = common_counts< SIG_LEVEL
        #평가를 안 하였거나, sig_level이 기준 이하인 user 제거
        none_rating_idx = movie_ratings[no_rating| low_significance].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        if neighbor_size ==0 :
            prediction = np.dot(sim_scores,movie_ratings)/ sim_scores.sum()
            # 편차 예측값에 현 사용자의 평균 더하기
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > MIN_RATINGS:
                #지정된 neighbor size값과 해당 영화를 평가한 총 사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                #array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores=np.array(sim_scores)
                movie_ratings= np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                prediction = np.dot(sim_scores,movie_ratings)/sim_scores.sum()
                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction+rating_mean[user_id]
            else:
                prediction= rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    if prediction <1 :
        prediction = 1
    elif prediction>5:
        prediction = 5
    else:
        pass
    return prediction


                
        
        

In [86]:
SIG_LEVEL = 3
MIN_RATINGS = 2
score(CF_knn_bias_sig,30)

0.9606194699625429