In [1]:
! pip install mlxtend

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## 추천 시스템(Tailored to individual user : 개인이나 아이템의 특성에 따라 추천)

특정 시점에 특정 고객이 관심 가질만한 제품이나 콘텐츠를 찾는 것이 핵심

- 연관규칙 /현업필터링, 콘텐츠 기반 필터링 

# 연관규칙

- A 제품을 구매한 사람은 B 제품을 구매할 확률이 높다 -> 과거 고객들의 구매이력(트랜잭션)을 기반으로 추천 점수를 구해야 한다.

ex) 아이폰과 에어팟을 구매한 사람은 애플워치를 구매할 확률이 높다. 

- 모든 조합에 대한 지지도, 신뢰도, 향상도 계산


### 규칙 분석의 알고리즘

- Apriori : 아이템들의 조합에 대한 경우의 수를 최소화 하여 처리 속도 효율을 높인 알고리즘 
- FP-Gowth : 트리 기반 알고리즘, 항목 간 비교 계산을 최소화 
- DHP : 항목 집합의 개수가 2개인 트랜잭션을 먼저 해시 테이블로 만들어 처리 속도 효율 상승


In [2]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import pandas as pd
import numpy as np

In [3]:
# https://www.kaggle.com/datasets/sewonghwang/market-basket
df = pd.read_csv("datasets/market_basket.csv")

df.head()

Unnamed: 0,cust_cd,std_dt,prdct_cd,prdct_nm
0,C617077280704,2021-06-19,A10001,tropical fruit
1,C617077280704,2021-06-19,A10002,whole milk
2,C617077280704,2021-06-19,A10003,pip fruit
3,C617077280704,2021-06-19,A10004,other vegetables
4,C617077280704,2021-06-19,A10005,cream


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188769 entries, 0 to 188768
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   cust_cd   188769 non-null  object
 1   std_dt    188769 non-null  object
 2   prdct_cd  188769 non-null  object
 3   prdct_nm  188769 non-null  object
dtypes: object(4)
memory usage: 5.8+ MB


#### 연관규칙 분석은 아래 3가지 핵심 지표를 통해 품목 조합간의 연관성의 수준 도출

- 지지도(support) : 전체 구매 횟수 중에서 해당 아이템+조합의 구매가 얼마나 발생하는지. 지지도는 A와 B가 함께 등장할 확률이다. 전체 거래의 수를 A와 B가 동시에 포함된 거래수를 나눠주면 구할 수 있다.
- 신뢰도(confidence) : 항목 A가 포함하는 거래에 A+B 다 있을 확률. 신뢰도는 조건부 확률과 유사하다. A가 일어났을 때 B의 확률이다. A의 확률을 A와 B가 동시에 포함될 확률을 나눠주면 구할 수 있다.단, 비대칭적 척도다
- 향상도(lift) : A가 주어지지 않을 때의 품목 B의 확률에 비해 A가 주어졌을 때 품목 B의 증가 비율. B의 확률이 A가 일어났을 때 B의 확률을 나눴을 때 구할 수 있다. 기준은 1. 1에 가까울 수록 두 아이템은 서로 독립적. 1보다 작으면 서로 음의 상관관계, lift 값은 1이면 서로 독립적인 관계이며1보다 크면 두 품목이 서로 양의 상관관계, 1보다 작으면 두 품목이 서로 음의 상관관계이다. A와 B가 독립이면 분모, 분자가 같기 때문에 1이 나온다. -> 조건절과 결과절의 위치가 달라도 값이 동일하다. 

#### 기준 척도를 주는 방법

1. 지지도와 신뢰도 기준을 잡아서 특정 cu off 미만의 아이템 관계를 필터링
2. 향상도를 내림차순으로 정렬하여 사우이에 있는 아이템 조합을 선정
3. 평가 척도 : IS측도(향상도와 지지도를 함께 고려) 와 교차지지도(전체 아이템 조합에서 어느 정도 수준의 지지도 이하를 버릴 것인가 판단하는 보조지표)


### Aprioir 알고리즘의 분석 순서

최소 지지도 설정 -> 개별 아이템 중에서 최소 지지도 미만의 모든 아이템 찾기 -> 제외되지 않은 아이템만을 이용해 최소 지지도 이상의 2가지 조합 찾기 -> 제외되지 않은 2가지 조합을 결합하여 최소 지지도 이상의 3가지 조합 찾기 -> 아이템 조합을 늘려가며 반복적으로 최소 지지도 이상의 조합 찾기 -> 해당 조합들의 지지도, 신뢰도, 향상도 산출


In [5]:
# apriori 모델 적용을 위한 품목 리스트 가공

itemset = df.drop_duplicates(
    ['cust_cd', 'std_dt', 'prdct_nm']).groupby(
    ['cust_cd','std_dt'])['prdct_nm'].apply(list)

itemset= pd.DataFrame(itemset).reset_index().drop(
    ['cust_cd', 'std_dt'], axis='columns')

itemset = itemset.squeeze()

itemset.head()

0    [beef, herbs, tropical fruit, whole milk, chic...
1    [sugar, packaged fruit/vegetables, sausage, sp...
2    [berries, tropical fruit, fruit/vegetable juic...
3    [yogurt, beef, cream, herbs, chicken, bottled ...
4    [berries, beef, yogurt, specialty bar, bottled...
Name: prdct_nm, dtype: object

In [6]:
# apriori 모델 적용을 위한 장바구니 - 품목 더미 가공

encoder = TransactionEncoder()
encoder_T = encoder.fit(itemset).transform(itemset)

# 데이터프레임으로 변경
itemset_matrix = pd.DataFrame(encoder_T, columns=encoder.columns_) 

itemset_matrix.head()

Unnamed: 0,beef,berries,beverages,bottled beer,bottled water,brown bread,butter,butter milk,canned beer,chicken,...,sparkling wine,specialty bar,specialty chocolate,sugar,syrup,tropical fruit,turkey,white wine,whole milk,yogurt
0,True,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,True,False,False,False,False
2,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,True,False,False,False,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,True
4,True,True,False,False,True,False,False,False,False,True,...,False,True,False,False,False,True,False,False,False,True


In [7]:
# 지지도 0.1 기준으로 apriori 적용

itemset_apriori = apriori(itemset_matrix, min_support=0.01, use_colnames=True)
itemset_apriori.head()

Unnamed: 0,support,itemsets
0,0.166612,(beef)
1,0.105074,(berries)
2,0.01701,(beverages)
3,0.025754,(bottled beer)
4,0.095191,(bottled water)


In [8]:
# 향상도 5 이상 상품 조합 추출

association_rules(itemset_apriori, metric="lift", min_threshold=5) 

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(beef, ham)",(bottled water),0.028179,0.095191,0.014585,0.517601,5.437508,0.011903,1.875645,0.839756
1,(bottled water),"(beef, ham)",0.095191,0.028179,0.014585,0.153223,5.437508,0.011903,1.14767,0.90195
2,"(beef, yogurt)",(bottled water),0.023844,0.095191,0.011977,0.502311,5.276885,0.009707,1.818022,0.830291
3,(bottled water),"(beef, yogurt)",0.095191,0.023844,0.011977,0.12582,5.276885,0.009707,1.116654,0.895763
4,"(beef, cream, pastry)",(bottled water),0.030163,0.095191,0.014879,0.493301,5.182229,0.012008,1.785693,0.832132
5,"(beef, cream)","(pastry, bottled water)",0.069878,0.037474,0.014879,0.212934,5.6822,0.012261,1.222929,0.885918
6,"(beef, pastry)","(cream, bottled water)",0.046916,0.047797,0.014879,0.31715,6.635276,0.012637,1.394453,0.891097
7,"(cream, bottled water)","(beef, pastry)",0.047797,0.046916,0.014879,0.311299,6.635276,0.012637,1.383887,0.891922
8,"(pastry, bottled water)","(beef, cream)",0.037474,0.069878,0.014879,0.397059,5.6822,0.012261,1.542642,0.856093
9,(bottled water),"(beef, cream, pastry)",0.095191,0.030163,0.014879,0.15631,5.182229,0.012008,1.149519,0.891937


# 협업 필터링

아이템의 메타정보를 활용한다. (다른 제품과의 유사도 측정 후 유사한 속성을 가진 제품 측정)

유사도는 피어슨 유사도나 코사인 유사도를 통해 구할 수 있다. (피어슨의 경우 자기 자신에 대한 유사도는 1이며 사용자들 간의 유사도는 -1~1을 갖는다. )

희소행렬 : 구매자들이 평점을 주는 경우가 드문것 

- 명시적 데이터 : 호불호가 정확한 것(영화 평점)
- 암묵적 데이터 : 호불보가 명확하지 않은 것 (구매, 클릭 이력)

-> 암묵적 데이터 위주의 도메인에서는 협업 필터링 중에서 잠재요인 모델을 사용하는 것이 좋다. 데이터를 행렬분해 하여 데이터 안에 숨겨진 잠재 요인을 도출함. 

행렬분해 : 특잇값 분해(SVD)는 데이터가 null 이면 안됨. ALS(병렬 시스템으로 빠름), SGD

#### 협업필터링의 단점

1. 콜드 스타트 문제 (새로 가입한 사용자가 내역이 없음 -> 가입 시 선호템 선택하게함) 
2. 클릭, 구매 등 상호작용이 부족한 경우 데이터가 희박해서 추천 성능 저하
3. 행동 양식이 일관적이지 않은 특이 취향 사용자에 대한 추천 정확도 떨어짐

-> 특이취향만 분리하여 별도의 유사도 알고리즘 적용하여 추천 정확도 향상 (하이브리드 필터링)


##### 하이브리드 필터링

1. 현업 필터링과 콘텐츠 기반 필터링을 각각 구현하여 혼합(각 모델의 스코어 결과값의 가중합을 구하여 최종 스코어 산출)
2. 현업 필터링에 콘텐츠 기반 필터링의 특성 적용(사용자의 특성을 나타내는 변수 추가 -> 희소행렬 완롸)
3. 콘텐츠 기반 필터링에 현업 필터링의 특성 적용(사용자의 성별, 연령, 취향 등을 차원 압축하여 몇개의 잠재 요이능로 변환하여 사용)


In [9]:
# 협업 필터링 실습용 필요한 패키지 설치
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

In [11]:
df_movies = pd.read_csv("datasets/movies.csv")
df_ratings = pd.read_csv("datasets/ratings.csv")


df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [12]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [13]:
# 고객, 영화 유사도 측정을 위한 전치 데이터셋 생성

# ratings 데이터와 movies 데이터 결합
df_merge = pd.merge(df_ratings, df_movies, on="movieId")

# 고객-아이템 평점 행렬 생성
df_merge_pivot = df_merge.pivot_table("rating", "userId", "title")

# 결측 0으로 변환
df_merge_pivot_null = df_merge_pivot.fillna(0)

# 아이템-사용자 평점 행렬로 전치
df_merge_pivot_T = df_merge_pivot_null.T
df_merge_pivot_T.head()

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# 영화 유사도 행렬 생성
item_sim = cosine_similarity(df_merge_pivot_T)

# 데이터 프레임 형태 변환
item_sim_df = pd.DataFrame(item_sim, index=df_merge_pivot_T.index,
                           columns=df_merge_pivot_T.index)

item_sim_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342682,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158272,0.0,0.098324,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147486,0.0
'Round Midnight (1986),0.0,0.0,1.0,0.0,0.0,0.0,0.081094,0.0,0.257012,0.680414,...,0.0,0.227429,0.141421,0.0,0.100219,0.0,0.221581,0.0,0.098324,1.0
'Til There Was You (1997),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'burbs, The (1989)",0.0,0.0,0.0,0.0,1.0,0.0,0.03161,0.231897,0.100923,0.0,...,0.057358,0.0,0.0,0.0,0.212684,0.0,0.104192,0.0,0.16182,0.0


In [15]:
# 500일의 썸머와 유사도가 높은 상위 5개 영화 추출

item_sim_df["(500) Days of Summer (2009)"].sort_values(ascending=False)[1:6]

title
Scott Pilgrim vs. the World (2010)    0.502121
Up in the Air (2009)                  0.498354
Social Network, The (2010)            0.497004
Forgetting Sarah Marshall (2008)      0.472271
Shutter Island (2010)                 0.468202
Name: (500) Days of Summer (2009), dtype: float64

In [16]:
# 고객 유사도 행렬 생성
user_sim = cosine_similarity(df_merge_pivot_null)

# 데이터 프레임 형태 변환
user_sim_df = pd.DataFrame(user_sim, index=df_merge_pivot_null.index,
                           columns=df_merge_pivot_null.index)

user_sim_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.101113,0.210044,0.128766,0.057896,0.07713,0.35809,0.097434,0.239189,0.026663,...,0.291162,0.144741,0.106583,0.091049,0.236805,0.154519,0.245071,0.23866,0.278217,0.153493
2,0.101113,1.0,0.115559,0.03461,0.032705,0.028305,0.062914,0.471918,0.194232,0.0,...,0.068325,0.0,0.47733,0.146887,0.163553,0.061737,0.050948,0.051423,0.035907,0.064822
3,0.210044,0.115559,1.0,0.058208,0.044426,0.012816,0.084522,0.06662,0.459703,0.068454,...,0.152078,0.301021,0.081626,0.098949,0.310234,0.079452,0.092821,0.08094,0.158943,0.109658
4,0.128766,0.03461,0.058208,1.0,0.019298,0.005781,0.059089,0.02442,0.050572,0.0,...,0.05586,0.024329,0.040467,0.108881,0.076241,0.014011,0.042643,0.174275,0.061677,0.157809
5,0.057896,0.032705,0.044426,0.019298,1.0,0.053378,0.080822,0.041536,0.023168,0.011915,...,0.05845,0.007315,0.024708,0.038163,0.053085,0.048993,0.055431,0.026053,0.086667,0.068281


In [17]:
# 7번 고객과 유사도가 높은 상위 5명 추출

user_sim_df[7].sort_values(ascending=False)[1:6]

userId
403    0.432287
358    0.414600
228    0.396949
328    0.391268
590    0.387817
Name: 7, dtype: float64

In [18]:
# 협업 필터링용 샘플 행렬 생성

# 잠재요인 차원 30으로 설정
K=30

# 샘플용 영화 30개만 필터링
df_merge_sample = df_merge_pivot.iloc[:,0:30]
df_array = df_merge_sample.values
user_cnt, item_cnt = df_array.shape

# 고객수, 영화 수 x 자원 수 행렬 행성
np.random.seed(47)
user_matrix = np.random.normal(scale=1./K, size=(user_cnt, K))
item_matrix = np.random.normal(scale=1./K, size=(item_cnt, K))

print("고객 행렬 확인:", user_matrix.shape)
print("영화 행렬 확인:", item_matrix.shape)

고객 행렬 확인: (668, 30)
영화 행렬 확인: (30, 30)


In [19]:
# RMSE 함수 정의

def get_rmse(df_array, user_matrix, item_matrix, not_nan_index):
    error = 0
    # 예측용 df_array 생성
    pred_rating_matrix = user_matrix @ item_matrix.T
    
    # 결측 없는 실제 행렬과 예측 행렬 생성
    df_array_not_null = df_array[not_nan_index]
    pred_rating_matrix_not_null = pred_rating_matrix[not_nan_index]
    
    # RMSE 산출
    mse = mean_squared_error(df_array_not_null, pred_rating_matrix_not_null)
    rmse = np.sqrt(mse)
    
    return rmse

In [20]:
# 행렬 분해 함수 정의

def matrix_factorization(df_array, K, steps=1000, 
                         learning_rate=0.01, r_lambda = 0.01):
     
    # 결측값이 아닌 df_array의 index 생성
    not_nan_index = np.where(np.isnan(df_array) == False)
    
    # SGD 행렬 분해 알고리즘 적용
    for step in range(steps):
        for p, q, r in zip(not_nan_index[0], not_nan_index[1]
                           ,df_array[not_nan_index]):
            # 실제 값과 예측 값 차이 계산
            r_pq = user_matrix[p, :] @ item_matrix[q, :].T
            error_pq = r - r_pq

            # SGD 
            user_matrix[p,:] = user_matrix[p,:] + learning_rate*(
                error_pq * item_matrix[q, :] - r_lambda*user_matrix[p,:])
            
            item_matrix[q,:] = item_matrix[q,:] + learning_rate*(
                error_pq * user_matrix[p, :] - r_lambda*item_matrix[q,:])

        rmse = get_rmse(df_array, user_matrix, item_matrix, not_nan_index)

        if ( (step + 1)  % 100) == 0 :
            print("반복 횟수: ", step + 1 ," RMSE: ", np.round(rmse,3))
            
    return user_matrix, item_matrix

In [21]:
# 행렬 분해, 내적

user_matrix, item_matrix = matrix_factorization(
    df_array, K, steps=1000,
    learning_rate=0.01, r_lambda = 0.01)

pred_matrix = user_matrix @ item_matrix.T

반복 횟수:  100  RMSE:  0.097
반복 횟수:  200  RMSE:  0.027
반복 횟수:  300  RMSE:  0.024
반복 횟수:  400  RMSE:  0.023
반복 횟수:  500  RMSE:  0.021
반복 횟수:  600  RMSE:  0.02
반복 횟수:  700  RMSE:  0.02
반복 횟수:  800  RMSE:  0.019
반복 횟수:  900  RMSE:  0.019
반복 횟수:  1000  RMSE:  0.018


In [22]:
# 데이터 프레임 변환
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, 
                                   index= df_merge_sample.index,
                                   columns = df_merge_sample.columns)

ratings_pred_matrix.head(5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,"10th Kingdom, The (2000)",11-11-11 (11-11-11: The Prophecy) (2011),11:14 (2003),"11th Hour, The (2007)",12 (2007),12 Angry Men (1957),12 Angry Men (1997),12 Rounds (2009),12 Years a Slave (2013),127 Hours (2010)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.051631,-0.083938,-0.06966,-0.105834,-0.186691,-0.094306,-0.098581,-0.177139,-0.103241,-0.082794,...,-0.045882,-0.079783,-0.112016,-0.110756,-0.034177,-0.163101,-0.076059,-0.051933,-0.128448,-0.113047
2,0.068995,0.063763,0.029378,0.103909,0.070665,0.035862,0.094289,0.170707,0.130561,0.053904,...,0.134833,0.066054,0.067439,0.097416,0.013622,0.127945,-0.041964,0.049973,0.106233,0.082817
3,0.128438,0.08974,0.090381,0.066533,0.103108,0.031881,0.212484,0.048289,0.182902,0.07097,...,0.142718,0.191402,0.256842,0.230997,0.064861,0.149501,-0.003335,0.198401,0.207483,0.198376
4,2.895991,1.712282,1.787002,2.657818,2.820935,2.213617,2.082815,2.555981,3.306608,2.252366,...,1.648483,1.936613,2.729978,2.677794,1.10038,4.98198,0.620007,2.018014,3.723871,2.951902
5,0.001853,0.076697,-0.039216,-0.02927,-0.007633,-0.051874,-0.026496,0.004058,0.040571,-0.024389,...,0.042998,0.052845,0.024583,0.030629,-0.026081,-0.008237,0.040031,0.000849,0.007089,-0.021943


In [23]:
# 영화 추천을 위한 함수 설정

# 미상영 영화 리스트 추출 함수
def get_unseen_movies(df_merge_sample, userId):

    # 모든 영화 리스트 생성
    movies_list = df_merge_sample.columns.tolist()
    
    # 고객 별 평점 테이블 생성
    ratings = df_merge_sample.loc[userId,:]
    
    # 평점을 메기지 않은 영화 리스트 생성
    none_rating_list = ratings[ratings.isnull()].index.tolist()
    
    # 평점 없는 영화로 미상영 영화 리스트 생성
    unseen_movie_list = [ movie for movie in movies_list if movie in none_rating_list]
    
    return unseen_movie_list

# 미상영 영화 중 예측 점수가 높은 순으로 정렬
def recomm_movie_by_userid(pred_df, userId, unseen_movie_list, top_n=10):    
    recomm_movies = pred_df.loc[userId, unseen_movie_list
                               ].sort_values(ascending=False)[:top_n]
    
    return recomm_movies

In [24]:
# 575번 고객의 추천 영화 리스트 생성

# 575번 고객의 미상영 영화 리스트 생성
unseen_movie_list = get_unseen_movies(df_merge_sample, 575)

# 미상영 영화 중 예측 평점 높은 영화 리스트 생성
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 575
                                       ,unseen_movie_list, top_n=10)

# 최종 데이터셋 생성
recomm_movies = pd.DataFrame(data=recomm_movies.values,
                             index=recomm_movies.index,
                             columns=['pred_score']).reset_index()

recomm_movies.head(10)

Unnamed: 0,title,pred_score
0,12 Years a Slave (2013),3.569347
1,127 Hours (2010),3.362532
2,101 Dalmatians (One Hundred and One Dalmatians...,2.968062
3,10 Items or Less (2006),2.949394
4,11:14 (2003),2.8847
5,"11th Hour, The (2007)",2.742791
6,*batteries not included (1987),2.701928
7,'71 (2014),2.672483
8,12 Rounds (2009),2.58529
9,10th & Wolf (2006),2.489459
