데이터 프레임
- sample_df: 샘플데이터
- sm_df: 유사도 행렬
- closer_count: 유사한 사용자 데이터의 갯수
- ms_df: 유사도 행렬의 평균값
- pred_df: 예측값에 대한 행렬
- recommend_df: 추천 결과

프로세스
- 샘플 데이터
- 유사도 행렬 만들기
- 예측 행렬 만들기
- 기사 추천 리스트
- 모델 성능 측정

In [1]:
from scipy import spatial

### 1. 샘플 데이터셋 

In [2]:
columns = ["article_1", "article_2", "article_3", "article_4", "article_5"]
index = ["user_1", "user_2", "user_3", "user_4"]

data = np.array([
    [5,3,0,0,2],
    [2,0,0,1,4],
    [0,0,4,3,1],
    [4,0,4,5,0],
])

sample_df = pd.DataFrame(data, columns=columns, index=index)
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_1,5,3,0,0,2
user_2,2,0,0,1,4
user_3,0,0,4,3,1
user_4,4,0,4,5,0


### 2. 유사도 행렬

In [3]:
print(data[0], data[1])

# todo
idx = data[0].nonzero()[0]
print(idx)

data[0][idx], data[1][idx]

[5 3 0 0 2] [2 0 0 1 4]
[0 1 4]


(array([5, 3, 2]), array([2, 0, 4]))

In [4]:
# 유클리디안 유사도
def euclidean_similarity(vector_1, vector_2):
    
    idx = np.array(vector_1).nonzero()[0]
    vector_1 = np.array(vector_1)[idx]
    vector_2 = np.array(vector_2)[idx]
    
#     idx = np.array(vector_2).nonzero()[0]
#     vector_1 = np.array(vector_1)[idx]
#     vector_2 = np.array(vector_2)[idx]
    
    return np.linalg.norm(vector_1 - vector_2)

In [5]:
user_1 = list(sample_df.loc["user_1"])
user_2 = list(sample_df.loc["user_2"])
euclidean_similarity(user_1, user_2)

4.69041575982343

In [6]:
def cosine_similarity(vector_1, vector_2):
    
    idx = np.array(vector_1).nonzero()[0]
    vector_1 = np.array(vector_1)[idx]
    vector_2 = np.array(vector_2)[idx]
    
#     idx = np.array(vector_2).nonzero()[0]
#     vector_1 = np.array(vector_1)[idx]
#     vector_2 = np.array(vector_2)[idx]
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [7]:
cosine_similarity(user_1, user_2)

0.6529286250990105

In [8]:
def similarity_matrix(sample_df, similarity_func):
    
    # index 데이터 저장
    index = sample_df.index
    
    matrix = []
    for idx_1, value_1 in sample_df.iterrows():
        row = []
        for idx_2, value_2 in sample_df.iterrows():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)   
        
    return pd.DataFrame(matrix, columns=index, index=index)

In [9]:
sm_df = similarity_matrix(sample_df, cosine_similarity)
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_1,1.0,0.652929,0.324443,0.811107
user_2,0.729397,1.0,0.483046,0.443039
user_3,0.196116,0.332956,1.0,0.949474
user_4,0.529813,0.770054,0.82121,1.0


### 3. 유사도에 대한 평균값 - Mean Score

In [10]:
# 추천할 대상, 유사한 데이터를 몇개까지 사용할지
target, closer_count = "user_1", 2

# 추천할 사용자의 데이터 제거
ms_df = sm_df.drop(target)
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_3,0.196116,0.332956,1.0,0.949474
user_4,0.529813,0.770054,0.82121,1.0


In [11]:
# 유사도 값을 내림차순으로 정렬
ms_df = ms_df.sort_values(target, ascending=False)
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0
user_3,0.196116,0.332956,1.0,0.949474


In [12]:
# 사용자와 유사도가 높은 사용자를 필터링
ms_df = ms_df[:closer_count]
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0


In [13]:
sample_df.loc[ms_df.index]

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_2,2,0,0,1,4
user_4,4,0,4,5,0


In [14]:
# 평균값을 저장할 ndarray 만듦
mean = np.zeros(len(sample_df.columns))

# ms_df의 데이터의 평균값을 구함
for ms_user in ms_df.index: 
    mean += sample_df.loc[ms_user]
mean /= len(ms_df.index)
mean

article_1    3.0
article_2    0.0
article_3    2.0
article_4    3.0
article_5    2.0
Name: user_2, dtype: float64

In [15]:
pred_df = pd.DataFrame(columns=sample_df.columns)
pred_df.loc["user"] = sample_df.loc[target]
pred_df.loc["mean"] = mean
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [16]:
def mean_score(sample_df, sm_df, target, closer_count):
    ms_df = sm_df.drop(target)
    ms_df = ms_df.sort_values(target, ascending=False)
    ms_df = ms_df[:closer_count]
    ms_df = sample_df.loc[ms_df.index]
    
    # pred_df 결과 생성
    pred_df = pd.DataFrame(columns=sample_df.columns)
    pred_df.loc["user"] = sample_df.loc[target]
    pred_df.loc["mean"] = ms_df.mean()

    return pred_df

In [17]:
target, closer_count = "user_1", 2
pred_df = mean_score(sample_df, sm_df, target, closer_count)
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


### 4. 추천 기사 리스트 출력

In [18]:
recommend_df = pred_df.T
recommend_df

Unnamed: 0,user,mean
article_1,5,3
article_2,3,0
article_3,0,2
article_4,0,3
article_5,2,2


In [19]:
recommend_df = recommend_df[recommend_df["user"] == 0]
recommend_df

Unnamed: 0,user,mean
article_3,0,2
article_4,0,3


In [20]:
recommend_df = recommend_df.sort_values("mean", ascending=False)
recommend_df

Unnamed: 0,user,mean
article_4,0,3
article_3,0,2


In [21]:
list(recommend_df.index)

['article_4', 'article_3']

In [22]:
def recommend(pred_df):
    recommend_df = pred_df.T
    recommend_df = recommend_df[recommend_df["user"] == 0]
    recommend_df = recommend_df.sort_values("mean", ascending=False)

    return list(recommend_df.index)

In [23]:
recommend(pred_df)

['article_4', 'article_3']

### 5. run function

In [24]:
def run(sample_df, similarity_func, target, closer_count):
    
    # 유사도 행렬 데이터
    sm_df = similarity_matrix(sample_df, similarity_func)
    
    # 유사도가 높은 데이터의 평균값을 구하기
    pred_df = mean_score(sample_df, sm_df, target, closer_count)
    
    return recommend(pred_df)

In [25]:
run(sample_df, cosine_similarity, "user_1", 2)

['article_4', 'article_3']

In [26]:
run(sample_df, cosine_similarity, "user_2", 2)

['article_3', 'article_2']

In [27]:
run(sample_df, cosine_similarity, "user_3", 2)

['article_1', 'article_2']

In [28]:
run(sample_df, cosine_similarity, "user_4", 2)

['article_2', 'article_5']

### 6. 성능 평가
- MSE
    - Mean Squared Error
    
$$
\text{MSE} = \dfrac{\sum_{i=1}^n (y_i - \hat{y}_i)^2}{n}
$$

- RMSE
    - Root Mean Squared Error
    
$$
\text{RMSE} = \sqrt{\dfrac{\sum_{i=1}^n (y_i - \hat{y}_i)^2}{n}}
$$

    
- MAE
    - Mean Absolute Error
    
$$
e_i = y_i - \hat{y}_i \\
\text{MAE} = \dfrac{\sum_{i=1}^n |e_i|}{n}
$$

In [29]:
# MSE
def mse(value, pred):
    
    # value의 0 데이터 제거
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    # pred의 0 데이터 제거
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
#     print(value, pred.astype('int'))

    return sum((value - pred) ** 2) / len(idx)

In [30]:
mse(pred_df.loc["user"], pred_df.loc["mean"])

2.0

In [31]:
# RMSE
def rmse(value, pred):
    
     # value의 0 데이터 제거
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    # pred의 0 데이터 제거
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    return np.sqrt(sum((value - pred) ** 2) / len(idx))

In [32]:
rmse(pred_df.loc["user"], pred_df.loc["mean"])

1.4142135623730951

In [33]:
# MAE
def mae(value, pred):
    
    # value의 0 데이터 제거
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    # pred의 0 데이터 제거
    idx = pred.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    
    return sum(abs(value - pred)) / len(idx)
#     return sum(np.absolute(value - pred)) / len(idx)

In [34]:
mae(pred_df.loc["user"], pred_df.loc["mean"])

1.0

In [35]:
# 전체 추천 모델에 대한 성능 평가
def evaluate(df, sm_df, closer_count, algorithm):
    
    # user 리스트
    users = df.index
    
    # user 별 envaluate값의 모음
    evaluate_list = []
    
    for target in users:
        # 하나의 user에 대한 예측 값을 얻음
        pred_df = mean_score(df, sm_df, target, closer_count)
        evaluate_var = algorithm(pred_df.loc["user"], pred_df.loc["mean"])
        evaluate_list.append(evaluate_var)
        
    return np.average(evaluate_list)

In [36]:
evaluate(sample_df, sm_df, 2, mse)

3.916666666666667

In [37]:
evaluate(sample_df, sm_df, 2, rmse)

1.9009287182747576

In [38]:
# cosine similarity
evaluate(sample_df, sm_df, 2, mae)

1.666666666666667

In [39]:
# euclidean similarity
sm_df = similarity_matrix(sample_df, euclidean_similarity)
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_1,0.0,4.690416,5.91608,3.741657
user_2,3.741657,0.0,4.123106,6.0
user_3,5.09902,5.385165,0.0,2.236068
user_4,6.480741,6.0,4.472136,0.0


In [40]:
evaluate(sample_df, sm_df, 2, mae)

1.9374999999999998