<a href="https://colab.research.google.com/github/molybdenum-jo/Recommend-Algorithm/blob/main/model_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise import KNNBasic

In [None]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
sub = pd.read_csv("data/sample_submission.csv")

In [None]:
# NOTE(review): this rebinds `train`, replacing the frame read from
# "data/train.csv" in the previous cell — confirm which file is intended.
train = pd.read_csv("Train (3).csv")

# 컨텐츠 기반 필터링

🙋‍♂️ 가설 2: **콘텐츠 기반 필터링 기반의 추천 시스템**이 도서 평점 예측에 효과적일 것이다.

- 결과물: 책의 속성(저자, 출판사, 장르 등)을 기반으로 한 **콘텐츠 기반 필터링 모델**을 구현하고, 모델의 **평점 예측 성능**을 평가한다. 협업 필터링 모델과 성능을 비교하여 콘텐츠 기반 필터링의 효과를 분석한다.

In [None]:
# Work on a reproducible 20,000-row sample of the training data.
df = train.sample(20000, random_state=42).copy()
# Upper-case the titles so that later substring matching ignores case.
df['Book-Title'] = df['Book-Title'].str.upper()

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# lowercase=False keeps the upper-cased titles as they are; the vocabulary is
# capped at 2,000 terms.
tfidfvect = TfidfVectorizer(max_features=2000, lowercase=False)
tfidfvect

# Vectorize the titles with fit_transform.
tfidf_overview = tfidfvect.fit_transform(df["Book-Title"])
tfidf_overview

# Dense copy of the TF-IDF matrix with one column per vocabulary term.
df_tfidf = pd.DataFrame(tfidf_overview.toarray(), columns=tfidfvect.get_feature_names_out())
df_tfidf

# Cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_overview, tfidf_overview)
cosine_matrix

# Label rows and columns with the sample's own index so that a book can be
# looked up by its row label.
df_cosine = pd.DataFrame(cosine_matrix, columns=df.index, index=df.index)
df_cosine.head()

# Look for books whose title contains the word "TRAVEL".
book = "TRAVEL"
display(df.loc[df["Book-Title"].str.contains(book), "Book-Title"])
# Row label of the last matching title.
book_id = df.loc[df["Book-Title"].str.contains(book), "Book-Title"].index[-1]
book_id

#위 내용을 함수로 정리 
def find_item(book, df_cosine, df):
    """Recommend up to ten titles most similar to the first title matching ``book``.

    Args:
        book: Text to search for in ``df["Book-Title"]``. It is upper-cased and
            matched as a literal substring (not as a regular expression).
        df_cosine: Square similarity DataFrame whose rows and columns are
            labelled with the index of ``df``.
        df: DataFrame with a ``"Book-Title"`` column.

    Returns:
        A one-column DataFrame (``"Book-Title"``) holding the recommended rows,
        or the string ``"추천 도서 없음"`` when no title matches.
    """
    try:
        # regex=False: titles contain characters such as "+", "(" or "?" that
        # are regex metacharacters; na=False: a missing title never matches.
        matches = df["Book-Title"].str.contains(book.upper(), regex=False, na=False)
        df_id = df[matches].index[0]
        # Rows that share exactly the same similarity score are collapsed to one.
        df_item = df_cosine[df_id].drop_duplicates()
        # Scores of 1 (the book itself and identical titles) are excluded.
        recomm_idx = df_item[df_item < 1].nlargest(10).index
        recomm_item = df.loc[recomm_idx, ["Book-Title"]]
        return recomm_item
    except (IndexError, KeyError):
        # IndexError: nothing matched; KeyError: label missing from df_cosine/df.
        return "추천 도서 없음"

book = "SISTERHOOD OF THE TRAVELING PANTS"
find_item(book, df_cosine, df)

7739              ARKANSAS TRAVELER (BENNI HARPER MYSTERY)
24786                  GULLIVER'S TRAVELS (SIGNET CLASSIC)
13561              THE TIME TRAVELER'S WIFE (HARVEST BOOK)
5107     THE TIME TRAVELER'S WIFE (TODAY SHOW BOOK CLUB...
27488    ROAD LESS TRAVELED : A NEW PSYCHOLOGY OF LOVE,...
20890    HOW TO TRAVEL WITH A SALMON &AMP; OTHER ESSAYS...
24023    THE UNSAVVY TRAVELER: WOMEN'S COMIC TALES OF C...
16088    THE MINDFUL TRAVELER: A GUIDE TO JOURNALING AN...
26653    GUTSY MAMAS: TRAVEL TIPS AND WISDOM FOR MOTHER...
2225                     SISTERHOOD OF THE TRAVELING PANTS
26073    TOURIST GAZE: LEISURE AND TRAVEL IN CONTEMPORA...
20853    GULLIFUR'S TRAVELS (ADVENTURES OF WISHBONE, NO...
2254     THE ROAD LESS TRAVELED, 25TH ANNIVERSARY EDITI...
6646         BAEDEKER AUSTRALIA (BAEDEKER'S TRAVEL GUIDES)
2299            TRAVELS WITH CHARLEY: IN SEARCH OF AMERICA
7256                                     THE ART OF TRAVEL
17195    SONGS FOR THE OPEN ROAD : POEMS OF TRAVEL AND .

Unnamed: 0,Book-Title
9178,THE SISTERHOOD OF THE TRAVELING PANTS
4420,TRAVELING SOLO
17588,SISTERHOOD
24377,PANTS ON FIRE
24033,TALES FROM A TRAVELING COUCH: A PSYCHOTHERAPIS...
2522,DANCING IN MY NUDDY- PANTS (CONFESSIONS OF GEO...
26207,TRAVELING MERCIES: SOME THOUGHTS ON FAITH
27937,THE DIVINE SECRETS OF THE YA-YA SISTERHOOD: A ...
3160,THE RAPTURE OF CANAAN
12691,DIVINE SECRETS OF THE YA-YA SISTERHOOD: A NOVEL


In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# TF-IDF features from Book-Title. Missing values become empty strings because
# TfidfVectorizer raises ValueError on NaN documents (presumably the cause of
# the ValueError recorded for this cell — confirm against the data).
title_vectorizer = TfidfVectorizer()
title_matrix = title_vectorizer.fit_transform(train['Book-Title'].fillna(''))

# TF-IDF features from Book-Author, with the same NaN handling.
author_vectorizer = TfidfVectorizer()
author_matrix = author_vectorizer.fit_transform(train['Book-Author'].fillna(''))

# Standardize Year-Of-Publication.
year_scaler = StandardScaler()
year_scaled = year_scaler.fit_transform(train[['Year-Of-Publication']])

n_components = 200  # Number of latent dimensions kept per text field; tune as needed.

# One reducer per text field: refitting a single TruncatedSVD for both fields
# would discard the title projection, and a fixed seed makes the run repeatable.
title_svd = TruncatedSVD(n_components=n_components, random_state=42)
author_svd = TruncatedSVD(n_components=n_components, random_state=42)

title_matrix_reduced = title_svd.fit_transform(title_matrix)
author_matrix_reduced = author_svd.fit_transform(author_matrix)

X = np.concatenate([title_matrix_reduced, author_matrix_reduced, year_scaled], axis=1)

# Split into training and held-out sets.
X_train, X_test, y_train, y_test = train_test_split(X, train['Book-Rating'], test_size=0.2, random_state=42)

# Fit the regression model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out set and report RMSE.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
# >>> 3.819596362147247

ValueError: ignored

In [None]:
base_path='data'
df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
df_submit.head()

In [None]:
# Keep only as many predictions as there are submission rows.
# NOTE(review): y_pred comes from model.predict on the held-out split of
# `train`, so these values are presumably not aligned with the rows of the
# submission file — confirm before submitting.
y_pred = y_pred[:len(df_submit)]
y_pred

In [None]:
# Replace df_submit with an empty frame indexed like `test`, then store the
# absolute value of each prediction as the rating.
# NOTE(review): assumes len(y_pred) == len(test) — confirm.
df_submit = pd.DataFrame(index=test.index)
df_submit["Book-Rating"] = abs(y_pred)


# 최근접 이웃 협업 필터링 (KNNBasic)


🙋‍♂️ 가설 1: **협업 필터링 기반의 추천 시스템**이 도서 평점 예측에 효과적일 것이다.

- 결과물: **사용자 기반 협업 필터링** 및 **아이템 기반 협업 필터링** 모델을 구현하고, 이들 모델의 **평점 예측 성능**을 평가한다. **RMSE 값**으로 성능을 비교하여 어떤 협업 필터링 방법이 더 나은 성능을 보이는지 결정한다.

### 사용자 기반 최근접 이웃 협업 필터링

In [None]:
# User x book table: each cell counts the rows of df for that (user, book) pair.
cs_matrix = pd.crosstab(df["User-ID"], df["Book-ID"])
cs_matrix.head()

# User-based nearest-neighbour collaborative filtering
# Compute cosine similarity

# Import cosine_similarity from sklearn.metrics.pairwise
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between every pair of users
user_sim = cosine_similarity(cs_matrix, cs_matrix)
user_sim


# df_user_sim: the same matrix labelled with user ids on both axes
df_user_sim = pd.DataFrame(user_sim, index=cs_matrix.index, columns=cs_matrix.index)
df_user_sim.head(5)

idx = 2
usr_sim = df_user_sim.iloc[idx]
curr_usr = usr_sim.index[idx]
print("해당 유저 : ", curr_usr)
# Most similar user, excluding the user themself.
most_sim_user = usr_sim[usr_sim.index != usr_sim.index[idx]].nlargest(1)
print("가장 유사한 유저 : ", most_sim_user.index[0])


# Books the current user actually has in cs_matrix. The count > 0 filter
# matters: nlargest(20) alone pads the list with zero-count books whenever the
# user has fewer than 20 books.
curr_counts = cs_matrix.loc[curr_usr]
curr_user_book = curr_counts[curr_counts > 0].nlargest(20).index

# Books the most similar user actually has (same filter).
sim_counts = cs_matrix.loc[most_sim_user.index[0]]
most_usr_book = sim_counts[sim_counts > 0].nlargest(20).index

# Books of the most similar user minus the current user's books
# ==> books the neighbour has that the current user does not.
recomm_book = list(most_usr_book.difference(curr_user_book))

# Book-ID -> Book-Title lookup table.
book_desc = train.drop_duplicates("Book-ID")[["Book-ID", "Book-Title"]]
book_desc = book_desc.set_index("Book-ID")
book_desc.shape

# Attach titles to the recommended ids. Building the frame from an index
# (instead of DataFrame(list).set_index(0)) also works when the list is empty.
pd.DataFrame(index=pd.Index(recomm_book, name=0)).join(book_desc)

해당 유저 :  USER_00004
가장 유사한 유저 :  USER_04985


Unnamed: 0_level_0,Book-Title
0,Unnamed: 1_level_1
BOOK_000212,Don't Speak to Strangers
BOOK_000218,Never Nosh a Matzo Ball
BOOK_000230,The Sisterhood of the Traveling Pants
BOOK_000252,The Marine &amp; The Debutante (Bachelor Batt...
BOOK_000223,Jane Eyre (Penguin Classics)


In [None]:
def usr_recomm(idx, df_user_sim, cs_matrix, desc=None):
    """Recommend books read by the most similar user but not by the given user.

    Args:
        idx: Positional index of the user in ``df_user_sim``.
        df_user_sim: Square user-by-user similarity DataFrame (same labels on
            both axes).
        cs_matrix: User-by-book count table indexed by the same user labels.
        desc: Optional DataFrame indexed by book id that supplies the columns
            attached to the result. Defaults to the notebook-level ``book_desc``.

    Returns:
        DataFrame indexed by recommended book id joined with ``desc``, or the
        string ``"추천할 도서 없음"`` when ``idx`` is out of range, a label is
        missing, or there is nothing to recommend.
    """
    if desc is None:
        desc = book_desc
    try:
        curr_usr = df_user_sim.index[idx]
        usr_sim = df_user_sim.iloc[idx]
        most_sim_user = usr_sim[usr_sim.index != usr_sim.index[idx]].nlargest(1)
        neighbour = most_sim_user.index[0]
        print("현재 유저 : ", curr_usr, "가장 유사한 유저 : ", neighbour)
        # Only books with a positive count were actually read; nlargest(20)
        # alone pads the list with zero-count books for users with < 20 books.
        curr_counts = cs_matrix.loc[curr_usr]
        curr_user_book = set(curr_counts[curr_counts > 0].nlargest(20).index)
        neighbour_counts = cs_matrix.loc[neighbour]
        most_usr_book = neighbour_counts[neighbour_counts > 0].nlargest(20).index
        # Neighbour's books that the current user does not have yet.
        recomm_book = [b for b in most_usr_book if b not in curr_user_book]
    except (IndexError, KeyError):
        # IndexError: idx out of range or no other user; KeyError: unknown label.
        return "추천할 도서 없음"
    if not recomm_book:
        return "추천할 도서 없음"
    return pd.DataFrame(index=pd.Index(recomm_book, name=0)).join(desc)

usr_idx = 150
usr_recomm(usr_idx, df_user_sim, cs_matrix)

현재 유저 :  USER_00329 가장 유사한 유저 :  USER_04643


'추천할 도서 없음'

In [None]:
from surprise.prediction_algorithms.knns import KNNBasic
from surprise.model_selection import train_test_split as surprise_train_test_split

# Build the Surprise train/test sets this cell needs from the sampled ratings,
# so it no longer depends on X_train / X_test left behind by another cell.
ratings = df[["User-ID", "Book-ID", "Book-Rating"]]
reader = Reader(rating_scale=(ratings["Book-Rating"].min(), ratings["Book-Rating"].max()))
data = Dataset.load_from_df(ratings, reader)
X_train, X_test = surprise_train_test_split(data, test_size=0.2, random_state=42)

# model
# Similarity settings must be passed through sim_options; KNNBasic silently
# ignores stray keyword arguments such as name= / user_base=.
knn = KNNBasic(sim_options={"name": "cosine", "user_based": True})
knn.fit(X_train)
predictions_knn = knn.test(X_test)
X_test[3]
predictions_knn[3]
df["Book-Rating"].describe()
accuracy.rmse(predictions_knn)


# Spot-check one (user, book) pair in the sampled ratings.
df.sample()
iid = "BOOK_011665"
df[df["Book-ID"] == iid]
uid = 'USER_44609'
iid = "BOOK_011665"
df[(df["User-ID"] == uid) & (df["Book-ID"] == iid)]

NameError: ignored

In [None]:
# Sample 100,000 distinct (user, title, rating) rows.
df = train[['User-ID','Book-Title', 'Book-Rating']].drop_duplicates().sample(100000, random_state=42)

# Rating scale taken from the sampled data.
r_min = df['Book-Rating'].min()
r_max = df['Book-Rating'].max()

reader = Reader(rating_scale=(r_min, r_max))

data = Dataset.load_from_df(
    df[['User-ID','Book-Title', 'Book-Rating']],
    reader)

# Split the dataset (Surprise's splitter, not scikit-learn's).
from surprise.model_selection import train_test_split
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

X_train.n_users, X_train.n_items, X_train.n_ratings
type(X_test), len(X_test)

# model
from surprise.prediction_algorithms.knns import KNNBasic
# name / user_based / min_support belong in sim_options. Passed as plain
# keyword arguments they are silently ignored and the model falls back to the
# default similarity settings.
knn = KNNBasic(
    k=20,
    min_k=15,
    sim_options={"name": "cosine", "user_based": True, "min_support": 3},
    verbose=False,
)
# Train and predict on the held-out ratings.
knn.fit(X_train)
predictions = knn.test(X_test)

# RMSE
accuracy.rmse(predictions)

RMSE: 3.8908


3.8907782808322637

### 아이템 기반 최근접 이웃 협업 필터링

In [None]:
# Item-based KNN: user_based=False has to be set inside sim_options, otherwise
# KNNBasic ignores it and stays user-based.
knni = KNNBasic(sim_options={"name": "cosine", "user_based": False})
knni.fit(X_train)
# Evaluate on the held-out set produced by the split above; `validset` was
# never defined in this notebook.
predictions_item = knni.test(X_test)
accuracy.rmse(predictions_item)

In [None]:
# Sample 200,000 distinct (user, title, rating) rows.
df = train[['User-ID','Book-Title', 'Book-Rating']].drop_duplicates().sample(200000, random_state=42)

# Rating scale taken from the sampled data.
r_min = df['Book-Rating'].min()
r_max = df['Book-Rating'].max()

reader = Reader(rating_scale=(r_min, r_max))

data = Dataset.load_from_df(
    df[['User-ID','Book-Title', 'Book-Rating']],
    reader)

# Split the dataset (Surprise's splitter, not scikit-learn's).
from surprise.model_selection import train_test_split
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

X_train.n_users, X_train.n_items, X_train.n_ratings
type(X_test), len(X_test)

from surprise.prediction_algorithms.knns import KNNBasic
# model

# Item-based KNNBasic: the similarity settings go in sim_options.
knni = KNNBasic(sim_options={"name": "cosine", "user_based": False})

# Fit and evaluate the item-based model created above (the original cell
# fitted `knn`, the user-based model from an earlier cell).
knni.fit(X_train)
predictions = knni.test(X_test)

# accuracy.rmse
accuracy.rmse(predictions)

# 잠재 요인 협업 필터링 (SVD)

🙋‍♂️ 가설 3: **행렬 인수분해 기반의 추천 시스템**이 도서 평점 예측에 효과적일 것이다.

- 결과물: **SVD**와 **ALS**와 같은 **행렬 인수분해 기법**을 사용하여 모델을 구현하고, 모델의 **평점 예측 성능**을 평가한다. 다른 추천 시스템과 성능을 비교하여 행렬 인수분해의 효과를 분석한다.


In [None]:
# Fit a Surprise SVD on the current training split.
svd = SVD(random_state=42)
svd.fit(X_train)
# Rewrite every user id in the held-out triples to "TEST_" followed by the
# original id from its 7th character onwards.
# NOTE(review): the rewritten ids are presumably unknown to the trainset, so
# the user factors would not be used for these predictions — confirm intent.
X_test = [(f"TEST_{uid[6:]}", iid, rating) for uid, iid, rating in X_test]
predictions_svd = svd.test(X_test)

# Collect the estimated ratings into an array.
pred_svd = [pred.est for pred in predictions_svd]
pred_svd = np.array(pred_svd)
pred_svd.shape

# `base_path` is defined in an earlier cell.
df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
df_submit.head()

# NOTE(review): these are predictions for held-out rows of `train`, cut to the
# submission length; they are presumably not aligned with the rows of the
# test file — confirm before submitting.
pred_svd = pred_svd[:len(df_submit)]
pred_svd

df_submit["Book-Rating"] = abs(pred_svd)

# Write the submission and read it back as a check.
file_name = f"{base_path}/submit.csv"
df_submit.to_csv(file_name, index=False)
pd.read_csv(file_name)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")
train['Age'] = np.where((train['Age'] == 0) | (train['Age'] >= 80), 35, train['Age'])
test['Age'] = np.where((test['Age'] == 0) | (test['Age'] >= 80), 35, test['Age'])
train['Year-Of-Publication'] = np.where(train['Year-Of-Publication'] == -1, 1997, train['Year-Of-Publication'])
test['Year-Of-Publication'] = np.where(test['Year-Of-Publication'] == -1, 1997, test['Year-Of-Publication'])
import pandas as pd
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split

# (user, item, rating) triples in the column order Surprise expects.
user_item_rating = train[["User-ID", "Book-ID", "Book-Rating"]]

# Convert to Surprise format. The rating scale is read from the data, as in the
# KNN cells, so predictions are not clipped to a hard-coded (1, 10) range.
reader = Reader(rating_scale=(user_item_rating["Book-Rating"].min(),
                              user_item_rating["Book-Rating"].max()))
data = Dataset.load_from_df(user_item_rating, reader)

# Train/test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
# Build and fit the SVD model (seeded so the score is reproducible).
algo = SVD(random_state=42)
algo.fit(trainset)

# Predict the held-out ratings.
predictions = algo.test(testset)
# Evaluate (RMSE)
from surprise import accuracy
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse:.4f}")
# >>> 3.4935  (value recorded by the author before this cell was revised)

# Predict a rating for every row of the real test file. The predictions are
# kept in their own list rather than written into `test`, so `test` keeps the
# same columns as the training features for later cells.
test_pred = [algo.predict(uid, iid).est for uid, iid in zip(test["User-ID"], test["Book-ID"])]

# Copy the submission template and fill in the predictions (row order of
# `test` and the template is assumed to match — TODO confirm).
sub = sub.copy()
sub["Book-Rating"] = test_pred

# Save the result as CSV.
sub.to_csv(f"svd{rmse:.4f}.csv", index=False)

In [None]:
# NOTE(review): the '위치+책id' column is not created anywhere in the visible
# part of this notebook — confirm where it is derived.
df = train[['User-ID','위치+책id', 'Book-Rating']]

# reader: rating scale taken from the data
r_min = df['Book-Rating'].min()
r_max = df['Book-Rating'].max()
reader = Reader(rating_scale=(r_min, r_max))


# Load the data with Dataset.load_from_df.
data = Dataset.load_from_df(
    df[['User-ID','위치+책id', 'Book-Rating']], 
    reader)


# Split into train and test sets (Surprise's splitter).
from surprise.model_selection import train_test_split
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)

svd = SVD()

# fit, test
svd.fit(X_train)
predictions_svd = svd.test(X_test)


# Check the RMSE score.
accuracy.rmse(predictions_svd)

In [None]:
from sklearn.model_selection import train_test_split

# Train/Validation 데이터 분할
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# 사용자별 평균 평점 특성 생성
user_mean_rating = train_data.groupby('User-ID')['Book-Rating'].mean().reset_index()
user_mean_rating.columns = ['User-ID', 'User-Mean-Rating']

# 사용자별 평균 평점을 train_data와 val_data에 병합
train_data = train_data.merge(user_mean_rating, on='User-ID', how='left')
val_data = val_data.merge(user_mean_rating, on='User-ID', how='left')

# 사용자별 평균 평점이 결측치인 경우, 전체 평균 평점으로 대체
mean_rating = train_data['Book-Rating'].mean()
train_data['User-Mean-Rating'] = train_data['User-Mean-Rating'].fillna(mean_rating)
val_data['User-Mean-Rating'] = val_data['User-Mean-Rating'].fillna(mean_rating)

# 사용자별 평균 평점을 포함한 새로운 데이터프레임 생성
user_item_rating_mean = train_data[["User-ID", "Book-ID", "User-Mean-Rating"]]


# 데이터를 Surprise 라이브러리 형식으로 변환
reader = Reader(rating_scale=(1, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)

# 전체 데이터를 trainset으로 변환
trainset = data_mean.build_full_trainset()

# SVD 알고리즘을 사용하여 모델 구축 및 학습
algo = SVD()
algo.fit(trainset)

# Predict a rating for every validation row.
val_data["predicted_rating"] = [
    algo.predict(uid, iid).est
    for uid, iid in zip(val_data["User-ID"], val_data["Book-ID"])
]

# Evaluate (RMSE). np.sqrt(MSE) is used instead of squared=False, which newer
# scikit-learn releases deprecate and then remove.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(val_data["Book-Rating"], val_data["predicted_rating"]))
print(f"RMSE: {rmse:.4f}")

### SVD 하이퍼파라미터 조정

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")
train['Age'] = np.where((train['Age'] == 0) | (train['Age'] >= 80), 35, train['Age'])
test['Age'] = np.where((test['Age'] == 0) | (test['Age'] >= 80), 35, test['Age'])
train['Year-Of-Publication'] = np.where(train['Year-Of-Publication'] == -1, 1997, train['Year-Of-Publication'])
test['Year-Of-Publication'] = np.where(test['Year-Of-Publication'] == -1, 1997, test['Year-Of-Publication'])
from sklearn.model_selection import train_test_split

# Train/Validation 데이터 분할
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# 사용자별 평균 평점 특성 생성
user_mean_rating = train_data.groupby('User-ID')['Book-Rating'].mean().reset_index()
user_mean_rating.columns = ['User-ID', 'User-Mean-Rating']

# 사용자별 평균 평점을 train_data와 val_data에 병합
train_data = train_data.merge(user_mean_rating, on='User-ID', how='left')
val_data = val_data.merge(user_mean_rating, on='User-ID', how='left')

# 사용자별 평균 평점이 결측치인 경우, 전체 평균 평점으로 대체
mean_rating = train_data['Book-Rating'].mean()
train_data['User-Mean-Rating'] = train_data['User-Mean-Rating'].fillna(mean_rating)
val_data['User-Mean-Rating'] = val_data['User-Mean-Rating'].fillna(mean_rating)

# 사용자별 평균 평점을 포함한 새로운 데이터프레임 생성
user_item_rating_mean = train_data[["User-ID", "Book-ID", "User-Mean-Rating"]]
# 데이터를 Surprise 라이브러리 형식으로 변환
reader = Reader(rating_scale=(1, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
# 전체 데이터를 trainset으로 변환
trainset = data_mean.build_full_trainset()

# SVD 알고리즘을 사용하여 모델 구축 및 학습
algo = SVD(random_state=42,
          n_epochs= 100 , 
          lr_all= 0.01,
          n_factors = 200,
          reg_all = 0.06,
          verbose=True)
algo.fit(trainset)


# Predict a rating for every validation row.
val_data["predicted_rating"] = [
    algo.predict(uid, iid).est
    for uid, iid in zip(val_data["User-ID"], val_data["Book-ID"])
]
# Evaluate (RMSE). np.sqrt(MSE) is used instead of squared=False, which newer
# scikit-learn releases deprecate and then remove.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(val_data["Book-Rating"], val_data["predicted_rating"]))
print(f"RMSE: {rmse:.4f}")
# >>> RMSE: 3.4074


# Predict a rating for every row of the real test file. The predictions are
# kept in their own list rather than written into `test`, so `test` keeps the
# same columns as the training features for later cells.
test_pred = [algo.predict(uid, iid).est for uid, iid in zip(test["User-ID"], test["Book-ID"])]

# Copy the submission template and fill in the predictions (row order of
# `test` and the template is assumed to match — TODO confirm).
sub = sub.copy()
sub["Book-Rating"] = test_pred

# Save the result as CSV.
sub.to_csv(f"svd{rmse:.4f}.csv", index=False)

### SVD++ 알고리즘

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise import SVDpp
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split

# Data preparation
# Build the user - item - rating dataset (seeded sample for repeatable runs).
df = train_data[["User-ID", "Book-Title", "Book-Rating"]].sample(300000, random_state=42)

# reader: rating scale taken from the sampled frame (the original referenced an
# undefined `df_tr`, then overwrote the reader with a hard-coded range).
r_min = df['Book-Rating'].min()
r_max = df['Book-Rating'].max()
reader = Reader(rating_scale=(r_min, r_max))

# Load the data once with Dataset.load_from_df.
data = Dataset.load_from_df(
    df[["User-ID", "Book-Title", "Book-Rating"]],
    reader)

# Splitter for k-fold cross-validation.
kf = KFold(n_splits=5, random_state=42)

for trainset, testset in kf.split(data):
    # Create and fit an SVD++ model for this fold.
    model = SVDpp(n_factors=50, lr_all=0.01, reg_all=0.1, random_state=42)
    model.fit(trainset)

    # Predict the fold's test ratings and report RMSE.
    predictions = model.test(testset)
    rmse = accuracy.rmse(predictions)
    print('RMSE:', rmse)

# >>> 3.530020675576203

In [None]:
from sklearn.model_selection import train_test_split
# Train/Validation 데이터 분할
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)
# 사용자별 평균 평점 특성 생성
user_mean_rating = train_data.groupby('User-ID')['Book-Rating'].mean().reset_index()
user_mean_rating.columns = ['User-ID', 'User-Mean-Rating']
# 사용자별 평균 평점을 train_data와 val_data에 병합
train_data = train_data.merge(user_mean_rating, on='User-ID', how='left')
val_data = val_data.merge(user_mean_rating, on='User-ID', how='left')
# 사용자별 평균 평점이 결측치인 경우, 전체 평균 평점으로 대체
mean_rating = train_data['Book-Rating'].mean()
train_data['User-Mean-Rating'] = train_data['User-Mean-Rating'].fillna(mean_rating)
val_data['User-Mean-Rating'] = val_data['User-Mean-Rating'].fillna(mean_rating)
# 사용자별 평가 횟수 특성 생성
user_rating_count = train_data.groupby('User-ID')['Book-Rating'].count().reset_index()
user_rating_count.columns = ['User-ID', 'User-Rating-Count']

# 사용자별 평가 횟수를 train_data와 val_data에 병합
train_data = train_data.merge(user_rating_count, on='User-ID', how='left')
val_data = val_data.merge(user_rating_count, on='User-ID', how='left')

user_item_rating_mean = train_data[["User-ID", "Book-ID", 'User-Mean-Rating']].sample(30000)
reader = Reader(rating_scale=(0, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
trainset = data_mean.build_full_trainset()



# Create and fit the SVD++ model.
algo = SVDpp(random_state=42)
algo.fit(trainset)

# Score the validation rows with the SVD++ model fitted above (the original
# called `svd`, a model left over from a different cell).
val_data["predicted_rating"] = [
    algo.predict(uid, iid).est
    for uid, iid in zip(val_data["User-ID"], val_data["Book-ID"])
]
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases
# deprecate and then remove.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(val_data["Book-Rating"], val_data["predicted_rating"]))
print(f"RMSE: {rmse:.4f}")
# >>> 3.5217  (recorded before the model used for prediction was corrected)

# 앙상블

🙋‍♂️ 가설 5: **앙상블 기법을 사용한 추천 시스템**이 도서 평점 예측에 효과적일 것이다.

- 결과물: 여러 추천 시스템 모델을 결합하여 앙상블 모델을 구현한다. **가중 평균**, **스태킹(Stacking)** 등의 앙상블 기법을 적용하여 모델의 평점 예측 성능을 평가한다. 단일 모델과 성능을 비교하여 앙상블 기법의 효과를 분석한다.

### xgboost + Randomforest

In [None]:
# 데이터 전처리
categorical_feature = train.select_dtypes(exclude="number").columns
categorical_feature

train[categorical_feature] = train[categorical_feature].astype("category")
test[categorical_feature] = test[categorical_feature].astype("category")
train.info(), test.info()

from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_feature] = oe.fit_transform(train[categorical_feature])
test[categorical_feature] = oe.transform(test[categorical_feature])

X = train.drop(columns="Book-Rating")
y = train["Book-Rating"]
X.shape, y.shape

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.1, random_state=42
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape
X_test = test
X_test.shape

# model
import xgboost as xgb
from xgboost.callback import EarlyStopping
from sklearn.metrics import r2_score

# Stop boosting once validation RMSE has not improved for 2 rounds and keep the
# best model.
es = xgb.callback.EarlyStopping(
    rounds=2,
    save_best=True,
    maximize=False,
    data_name="validation_0",
    metric_name="rmse",
)

# Callbacks are given to the estimator itself; passing them to fit() is
# deprecated and removed in newer xgboost releases.
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=250,
                             max_depth=3,
                             random_state=42,
                             n_jobs=-1,
                             callbacks=[es])

model_xgb

# fit: "validation_0" in the callback refers to the first eval_set entry.
model_xgb.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)])

y_valid_predict = model_xgb.predict(X_valid)
y_valid_predict[:5]

# Feature importances from the fitted model.
fi = pd.Series(model_xgb.feature_importances_)
fi.index = model_xgb.feature_names_in_
fi.nlargest(20).plot.barh()

# model_xgb.plot_importance
xgb.plot_importance(model_xgb, max_num_features=20);


xgb.plot_tree(model_xgb, num_trees=0)
fig = plt.gcf()
fig.set_size_inches(30, 20)

# R^2 on the validation split (model.score returns the same metric).
r2_score(y_valid, y_valid_predict)

score_xgb = model_xgb.score(X_valid, y_valid)
score_xgb

X_test.shape

y_pred_xgb = model_xgb.predict(X_test)
y_pred_xgb

# `base_path` is defined in an earlier cell.
df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
y_pred_xgb = y_pred_xgb[:len(df_submit)].round()
# Clip to the rating range seen in training. abs() would turn a slightly
# negative prediction into a positive rating instead of the minimum.
df_submit["Book-Rating"] = np.clip(y_pred_xgb, y.min(), y.max())

# Save
file_name = f"{base_path}/submit.csv"
df_submit.to_csv(file_name, index=False)
pd.read_csv(file_name)

### Decision Tree + xgboost

In [None]:
categorical_feature = train.select_dtypes(exclude="number").columns
categorical_feature

train[categorical_feature] = train[categorical_feature].astype("category")
test[categorical_feature] = test[categorical_feature].astype("category")
train.info(), test.info()

from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_feature] = oe.fit_transform(train[categorical_feature])
test[categorical_feature] = oe.transform(test[categorical_feature])

X = train.drop(columns="Book-Rating")
y = train["Book-Rating"]
X.shape, y.shape

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.1, random_state=42
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape
X_test = test
X_test.shape

# model
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

tree_model = DecisionTreeClassifier()
xgb_model = XGBClassifier()

# Hard-voting ensemble of the two classifiers.
voting_model = VotingClassifier(estimators=[('tree', tree_model), ('xgb', xgb_model)], voting='hard')
voting_model.fit(X_train, y_train)

# Score on the labelled validation split. X_test is the unlabeled test file
# here, so it cannot be compared against any y_test.
y_valid_pred = voting_model.predict(X_valid)
rmse_tf_xg = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f"rmse_tf_xg: {rmse_tf_xg}")
# >>> 4.686224381869224  (recorded before the evaluation split was corrected)

# Predictions for the real test file.
y_pred = voting_model.predict(X_test)

### Decision tree + tensorflow

In [None]:
categorical_feature = train.select_dtypes(exclude="number").columns
categorical_feature

train[categorical_feature] = train[categorical_feature].astype("category")
test[categorical_feature] = test[categorical_feature].astype("category")
train.info(), test.info()

from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_feature] = oe.fit_transform(train[categorical_feature])
test[categorical_feature] = oe.transform(test[categorical_feature])

X = train.drop(columns="Book-Rating")
y = train["Book-Rating"]
X.shape, y.shape

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.1, random_state=42
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape
X_test = test
X_test.shape

# model
# Re-split the labelled data; from here on X_test / y_test are a held-out part
# of `train`, not the competition test file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
tree_model = DecisionTreeClassifier()
import tensorflow as tf
# Keras regressor kept in its own variable instead of being attached to the
# `tf` module.
# NOTE(review): this network is never compiled, fitted or added to the
# ensemble below — the second voter is `xgb_model` from the previous cell.
tf_model = tf.keras.Sequential([
    # tf.keras.layers.BatchNormalization(input_shape=X_train.iloc[0].shape),
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=X_train.iloc[0].shape),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(units=1)
])
# The second estimator is labelled for what it is: the XGBoost classifier.
voting_model_tt = VotingClassifier(estimators=[('tree', tree_model), ('xgb', xgb_model)], voting='hard')
voting_model_tt.fit(X_train, y_train)

y_pred_tt = voting_model_tt.predict(X_test)
rmse_tt = np.sqrt(mean_squared_error(y_test, y_pred_tt))
print(f"rmse_tt: {rmse_tt}")
# >>> 4.685774587644766

### Randomforest + xgboost

In [None]:
categorical_feature = train.select_dtypes(exclude="number").columns
categorical_feature

train[categorical_feature] = train[categorical_feature].astype("category")
test[categorical_feature] = test[categorical_feature].astype("category")
train.info(), test.info()

from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_feature] = oe.fit_transform(train[categorical_feature])
test[categorical_feature] = oe.transform(test[categorical_feature])

X = train.drop(columns="Book-Rating")
y = train["Book-Rating"]
X.shape, y.shape

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size = 0.1, random_state=42
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape
X_test = test
X_test.shape

# model
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Named xgb_reg so it does not shadow the `xgb` module alias used by the
# XGBoost cell.
xgb_reg = XGBRegressor(n_estimators=100, random_state=42)
xgb_reg.fit(X_train, y_train)

# Evaluate the averaged prediction on the labelled validation split. X_test is
# the unlabeled test file here, so it has no matching y_test.
valid_pred = (rf.predict(X_valid) + xgb_reg.predict(X_valid)) / 2
rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
print(f"Ensemble RMSE: {rmse}")
# >>> 3.442358268797963  (recorded before the evaluation split was corrected)

# Averaged prediction for the real test file.
rf_pred = rf.predict(X_test)
xgb_pred = xgb_reg.predict(X_test)
ensemble_pred = (rf_pred + xgb_pred) / 2

# Copy into the submission template and save. `base_path` is defined in an
# earlier cell.
df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
ensemble_pred = ensemble_pred[:len(df_submit)]
# Clip to the rating range seen in training. abs() would turn a slightly
# negative prediction into a positive rating instead of the minimum.
df_submit["Book-Rating"] = np.clip(ensemble_pred, y.min(), y.max())
file_name = f"{base_path}/ensemble_rf_xgb.csv"
df_submit.to_csv(file_name, index=False)
pd.read_csv(file_name)

### SVD + TensorFlow

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")
train['Age'] = np.where(train['Age'] == 0, 15, np.where(train['Age'] >= 80, 80, train['Age']))
test['Age'] = np.where(test['Age'] == 0, 15, np.where(test['Age'] >= 80, 80, test['Age']))
train['Year-Of-Publication'] = np.where(train['Year-Of-Publication'] == -1, 1997, train['Year-Of-Publication'])
test['Year-Of-Publication'] = np.where(test['Year-Of-Publication'] == -1, 1997, test['Year-Of-Publication'])
from sklearn.model_selection import train_test_split

# Train/Validation 데이터 분할
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# 사용자별 평균 평점 특성 생성
user_mean_rating = train_data.groupby('User-ID')['Book-Rating'].mean().reset_index()
user_mean_rating.columns = ['User-ID', 'User-Mean-Rating']

# 사용자별 평균 평점을 train_data와 val_data에 병합
train_data = train_data.merge(user_mean_rating, on='User-ID', how='left')
val_data = val_data.merge(user_mean_rating, on='User-ID', how='left')

# 사용자별 평균 평점이 결측치인 경우, 전체 평균 평점으로 대체
mean_rating = train_data['Book-Rating'].mean()
train_data['User-Mean-Rating'] = train_data['User-Mean-Rating'].fillna(mean_rating)
val_data['User-Mean-Rating'] = val_data['User-Mean-Rating'].fillna(mean_rating)

# 사용자별 평가 횟수 특성 생성
user_rating_count = train_data.groupby('User-ID')['Book-Rating'].count().reset_index()
user_rating_count.columns = ['User-ID', 'User-Rating-Count']

# 사용자별 평가 횟수를 train_data와 val_data에 병합
train_data = train_data.merge(user_rating_count, on='User-ID', how='left')
val_data = val_data.merge(user_rating_count, on='User-ID', how='left')

# 사용자별 평균 평점과 책별 평균 평점을 포함한 새로운 데이터프레임 생성
user_item_rating_mean = train_data[["User-ID", "Book-ID", 'User-Mean-Rating']]
# 데이터를 Surprise 라이브러리 형식으로 변환
reader = Reader(rating_scale=(1, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
# 전체 데이터를 trainset으로 변환
trainset = data_mean.build_full_trainset()

# Build and fit the SVD model on the full trainset.
svd_params = dict(random_state=42,
                  n_epochs=100,
                  lr_all=0.01,
                  n_factors=200,
                  reg_all=0.06,
                  verbose=True)
svd = SVD(**svd_params)
svd.fit(trainset)

from surprise.model_selection import cross_validate
# Cross-validated RMSE, computed on a separate estimator with the same
# hyper-parameters. cross_validate fits the estimator it is given on every
# fold, so running it on `svd` would leave svd.pu / svd.qi describing a fold's
# trainset whose inner ids no longer match `trainset`.
svd_cv = SVD(**svd_params)
cv_results = cross_validate(svd_cv, data_mean, measures=['RMSE'], cv=5, verbose=True)

# Print the mean RMSE.
mean_rmse = np.mean(cv_results['test_rmse'])
print("Mean RMSE: {:.4f}".format(mean_rmse))

# Latent factors of the full-trainset model, indexed by trainset inner ids.
user_factors = svd.pu
item_factors = svd.qi
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

user_input = Input(shape=(user_factors.shape[1],), name='User-Factors')
item_input = Input(shape=(item_factors.shape[1],), name='Item-Factors')

x = Concatenate()([user_input, item_input])
x = Dense(128, activation='relu')(x)
x = Dense(100, activation='relu')(x)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='linear', name='Rating')(x)

model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

from tensorflow.keras.callbacks import EarlyStopping

user_ids = train_data['User-ID'].apply(lambda x: trainset.to_inner_uid(x))
item_ids = train_data['Book-ID'].apply(lambda x: trainset.to_inner_iid(x))

train_user_factors = user_factors[user_ids]
train_item_factors = item_factors[item_ids]

# EarlyStopping 콜백 정의
early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)

history = model.fit([train_user_factors, train_item_factors],
                    train_data['Book-Rating'],
                    epochs=1000,
                    batch_size=128,
                    validation_split=0.2,
                    callbacks=[early_stopping])  # EarlyStopping


from sklearn.metrics import mean_squared_error
from math import sqrt

# Map validation User-ID / Book-ID values to trainset inner ids; -1 marks ids
# the trainset has never seen.
test_user_ids = val_data['User-ID'].apply(lambda x: trainset.to_inner_uid(x) if x in trainset._raw2inner_id_users else -1)
test_item_ids = val_data['Book-ID'].apply(lambda x: trainset.to_inner_iid(x) if x in trainset._raw2inner_id_items else -1)

# Mean user / item factor vectors, used as a stand-in for unseen ids.
mean_user_factors = np.mean(user_factors, axis=0)
mean_item_factors = np.mean(item_factors, axis=0)

# Factor vectors for every validation row.
test_user_factors = np.array([user_factors[i] if i != -1 else mean_user_factors for i in test_user_ids])
test_item_factors = np.array([item_factors[i] if i != -1 else mean_item_factors for i in test_item_ids])

# Predict with the TensorFlow model.
predictions = model.predict([test_user_factors, test_item_factors]).flatten()

# RMSE between the actual and predicted ratings.
rmse = sqrt(mean_squared_error(val_data['Book-Rating'], predictions))
print("RMSE on test data: {:.4f}".format(rmse))
# >>> 3.7377

### SVD+아이템기반협업필터링

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy
# Load the three competition files.
train, test, sub = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv", "sample_submission.csv"))

# Same cleaning for both frames: Age 0 -> 15, Age capped at 80, publication year -1 -> 1997.
for frame in (train, test):
    frame['Age'] = np.where(frame['Age'] == 0, 15, np.where(frame['Age'] >= 80, 80, frame['Age']))
    frame['Year-Of-Publication'] = np.where(frame['Year-Of-Publication'] == -1, 1997, frame['Year-Of-Publication'])

from sklearn.model_selection import train_test_split

# Train/validation split.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Per-user mean rating, computed on the training part only.
user_mean_rating = (
    train_data.groupby('User-ID')['Book-Rating']
    .mean()
    .rename('User-Mean-Rating')
    .reset_index()
)

# Attach the per-user mean to both splits.
train_data, val_data = (
    frame.merge(user_mean_rating, on='User-ID', how='left')
    for frame in (train_data, val_data)
)

# Users without a training mean fall back to the global training mean.
mean_rating = train_data['Book-Rating'].mean()
for frame in (train_data, val_data):
    frame['User-Mean-Rating'] = frame['User-Mean-Rating'].fillna(mean_rating)

# Per-user number of ratings, again from the training part only.
user_rating_count = (
    train_data.groupby('User-ID')['Book-Rating']
    .count()
    .rename('User-Rating-Count')
    .reset_index()
)

# Attach the per-user count to both splits.
train_data, val_data = (
    frame.merge(user_rating_count, on='User-ID', how='left')
    for frame in (train_data, val_data)
)

# (user, item, rating) frame handed to Surprise.
# NOTE(review): the rating column here is 'User-Mean-Rating', not 'Book-Rating' — confirm this is intended.
user_item_rating_mean = train_data.loc[:, ["User-ID", "Book-ID", 'User-Mean-Rating']]

# Convert to Surprise format and build a trainset from all rows.
reader = Reader(rating_scale=(0, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
trainset = data_mean.build_full_trainset()

# Build and train the SVD model.
svd_params = dict(
    random_state=42,
    n_epochs=100,
    lr_all=0.01,
    n_factors=200,
    reg_all=0.06,
    verbose=True,
)
svd = SVD(**svd_params)
svd.fit(trainset)

# Down-sample to 10% of the training rows for the KNN model.
sampled_train_data = train_data.sample(frac=0.1, random_state=42)

# (user, item, rating) frame for the sampled rows.
user_item_rating_mean_sampled = sampled_train_data.loc[:, ["User-ID", "Book-ID", 'User-Mean-Rating']]

# Convert the sample to Surprise format and build a trainset from it.
reader = Reader(rating_scale=(0, 10))
data_mean_sampled = Dataset.load_from_df(user_item_rating_mean_sampled, reader)
trainset_sampled = data_mean_sampled.build_full_trainset()

# Item-based collaborative filtering (KNNWithMeans, pearson_baseline similarity).
item_sim_options = {'name': 'pearson_baseline', 'user_based': False}
item_based_cf = KNNWithMeans(k=40, sim_options=item_sim_options, verbose=True)

# Train on the sampled trainset.
item_based_cf.fit(trainset_sampled)

# (user, item) pairs of the validation rows, in row order.
val_pairs = list(zip(val_data['User-ID'], val_data['Book-ID']))

# SVD predictions (field 3 of a Surprise Prediction is the estimate).
svd_preds = [svd.predict(raw_uid, raw_iid)[3] for raw_uid, raw_iid in val_pairs]

# Item-based collaborative filtering predictions.
item_based_cf_preds = [item_based_cf.predict(raw_uid, raw_iid)[3] for raw_uid, raw_iid in val_pairs]

# Weighted-average ensemble; alpha is the SVD weight (a value between 0 and 1).
alpha = 0.999
ensemble_preds = [
    alpha * svd_est + (1 - alpha) * knn_est
    for svd_est, knn_est in zip(svd_preds, item_based_cf_preds)
]
# Report validation RMSE for each model and for the ensemble.
# The '>>>' lines are result values pasted into the notebook, kept verbatim.
from sklearn.metrics import mean_squared_error
from math import sqrt

# SVD RMSE
svd_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], svd_preds))
print("SVD RMSE:", svd_rmse)
>>> 3.3974045875897705

# Item-based collaborative filtering RMSE
item_based_cf_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], item_based_cf_preds))
print("Item-Based CF RMSE:", item_based_cf_rmse)
>>> 3.945342860840093

# Ensemble RMSE
ensemble_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], ensemble_preds))
print("Ensemble RMSE:", ensemble_rmse)
>>> 3.3974878879011983

### SVD+사용자기반협업필터링 앙상블

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy
# Load the competition files.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sub = pd.read_csv("sample_submission.csv")

# Age: 0 -> 15, values of 80 and above -> 80, everything else unchanged.
train['Age'] = np.select([train['Age'] == 0, train['Age'] >= 80], [15, 80], default=train['Age'])
test['Age'] = np.select([test['Age'] == 0, test['Age'] >= 80], [15, 80], default=test['Age'])

# Year-Of-Publication: the placeholder -1 -> 1997.
train['Year-Of-Publication'] = np.select(
    [train['Year-Of-Publication'] == -1], [1997], default=train['Year-Of-Publication'])
test['Year-Of-Publication'] = np.select(
    [test['Year-Of-Publication'] == -1], [1997], default=test['Year-Of-Publication'])

from sklearn.model_selection import train_test_split

# Train/validation split.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Per-user mean rating from the training part.
user_mean_rating = (
    train_data.groupby('User-ID', as_index=False)['Book-Rating']
    .mean()
    .rename(columns={'Book-Rating': 'User-Mean-Rating'})
)

# Merge the per-user mean into both splits.
train_data = train_data.merge(user_mean_rating, how='left', on='User-ID')
val_data = val_data.merge(user_mean_rating, how='left', on='User-ID')

# Missing per-user means are replaced by the global training mean.
mean_rating = train_data['Book-Rating'].mean()
train_data['User-Mean-Rating'] = train_data['User-Mean-Rating'].fillna(mean_rating)
val_data['User-Mean-Rating'] = val_data['User-Mean-Rating'].fillna(mean_rating)

# Per-user rating count from the training part.
user_rating_count = (
    train_data.groupby('User-ID', as_index=False)['Book-Rating']
    .count()
    .rename(columns={'Book-Rating': 'User-Rating-Count'})
)

# Merge the per-user count into both splits.
train_data = train_data.merge(user_rating_count, how='left', on='User-ID')
val_data = val_data.merge(user_rating_count, how='left', on='User-ID')

# (user, item, rating) frame for Surprise; the rating column is 'User-Mean-Rating'.
surprise_columns = ["User-ID", "Book-ID", 'User-Mean-Rating']
user_item_rating_mean = train_data[surprise_columns]

# Convert to Surprise format and build a trainset from all rows.
reader = Reader(rating_scale=(0, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
trainset = data_mean.build_full_trainset()

# Build and train the SVD model.
svd = SVD(
    n_factors=200,
    n_epochs=100,
    lr_all=0.01,
    reg_all=0.06,
    random_state=42,
    verbose=True,
)
svd.fit(trainset)

# Build the 10% sampled trainset inside this cell. Previously `trainset_sampled`
# was only defined by another cell, so this cell failed with a NameError unless
# that cell had been run first in the same kernel.
sampled_train_data = train_data.sample(frac=0.1, random_state=42)
data_mean_sampled = Dataset.load_from_df(
    sampled_train_data[["User-ID", "Book-ID", 'User-Mean-Rating']], reader)
trainset_sampled = data_mean_sampled.build_full_trainset()

# User-based collaborative filtering (KNNWithMeans, pearson_baseline similarity).
user_based_cf = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': True}, verbose=True)

# Train on the sampled trainset.
user_based_cf.fit(trainset_sampled)

# (user, item) pairs of the validation rows, in row order.
val_pairs = list(zip(val_data['User-ID'], val_data['Book-ID']))

# SVD predictions.
svd_preds = [svd.predict(raw_uid, raw_iid).est for raw_uid, raw_iid in val_pairs]

# User-based collaborative filtering predictions.
user_based_cf_preds = [user_based_cf.predict(raw_uid, raw_iid).est for raw_uid, raw_iid in val_pairs]

# Weighted-average ensemble of the two models.
alpha_svd = 0.8
alpha_user_based_cf = 0.2
ensemble_preds = [
    alpha_svd * svd_est + alpha_user_based_cf * knn_est
    for svd_est, knn_est in zip(svd_preds, user_based_cf_preds)
]
# Compute validation RMSE for SVD, user-based CF and their ensemble.
# The '>>>' lines are result values pasted into the notebook, kept verbatim.
from sklearn.metrics import mean_squared_error
from math import sqrt

svd_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], svd_preds))
user_based_cf_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], user_based_cf_preds))
ensemble_rmse = sqrt(mean_squared_error(val_data['Book-Rating'], ensemble_preds))

print(f'SVD RMSE: {svd_rmse}')
>>> 3.3974045875897705

print(f'User-Based CF RMSE: {user_based_cf_rmse}')
>>> 3.6803972720108873

print(f'Ensemble RMSE: {ensemble_rmse}')
>>> 3.419238697809746

# 그리드 서치, 교차 검증


🙋‍♂️ 가설 6: **하이퍼파라미터 최적화**를 통해 **추천 시스템의 성능을 향상**시킬 수 있다.

- 결과물: 그리드 서치, 랜덤 서치, 베이지안 최적화 등의 기법을 사용하여 모델의 하이퍼파라미터를 최적화한다. 최적화된 하이퍼파라미터를 사용하여 모델의 성능을 평가하고, 기본 하이퍼파라미터를 사용한 모델과 성능을 비교하여 최적화 기법의 효과를 분석한다.


### GridSearch

In [None]:
from surprise.model_selection import GridSearchCV, cross_validate

# Grid search over SVD hyper-parameters, then refit the best estimator and
# write a submission file.
# NOTE(review): `data`, `X_train`, `X_test` and `base_path` are not defined in
# this cell; they must come from cells run earlier in the kernel — confirm
# which ones before relying on a fresh Restart & Run All.
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

# 3-fold grid search scored by RMSE.
gridcv = GridSearchCV(algo_class=SVD, 
                      param_grid=param_grid, 
                      measures=["rmse"], cv=3)
gridcv.fit(data)

# Inspect the search result (bare expressions; only shown when run interactively).
gridcv.best_estimator
gridcv.best_params["rmse"]
gridcv.best_score
gridcv.algo_class
# Refit the RMSE-best estimator on X_train.
best_svd = gridcv.best_estimator["rmse"]
best_svd.fit(X_train)
# Rewrite the user id of every test tuple as "TEST_" + the id minus its first 6 characters.
# NOTE(review): presumably this maps "TRAIN_xxxxxx"-style ids to "TEST_xxxxxx" — confirm the id format.
X_test = [(f"TEST_{uid[6:]}", iid, rating) for uid, iid, rating in X_test]
X_test

predictions_bsvd = best_svd.test(X_test)
predictions_bsvd

predictions_bsvd[0]
accuracy.rmse(predictions_bsvd)

df_pred_svd = pd.DataFrame(predictions_bsvd)
df_pred_svd.head()

# NOTE(review): this de-duplicated array is overwritten three lines below and never used.
y_pred_svd = df_pred_svd["est"].unique()
y_pred_svd

# One estimate per prediction, as a NumPy array.
y_pred_svd = [pred.est for pred in predictions_bsvd]
y_pred_svd = np.array(y_pred_svd)
y_pred_svd.shape

df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
df_submit.head()

# Truncate the predictions to the number of submission rows.
# NOTE(review): this only aligns with the submission if the first len(df_submit) predictions are its rows, in order — verify.
y_pred_svd = y_pred_svd[:len(df_submit)]
y_pred_svd

df_submit["Book-Rating"] = abs(y_pred_svd)

# Write the submission and read it back as a check.
file_name = f"{base_path}/submit_bsvd.csv"
df_submit.to_csv(file_name, index=False)
pd.read_csv(file_name)

In [None]:
from surprise.model_selection import GridSearchCV, cross_validate

# Data preparation: user - item (book title) - rating frame.
rating_columns = ["User-ID", "Book-Title", "Book-Rating"]
df = train_data[rating_columns]

# Reader whose scale spans the observed ratings.
book_ratings = df['Book-Rating']
r_min, r_max = book_ratings.min(), book_ratings.max()
reader = Reader(rating_scale=(r_min, r_max))

# Load the frame into Surprise.
data = Dataset.load_from_df(df[rating_columns], reader)

# Hyper-parameter grid for the search.
param_grid = dict(
    n_epochs=[5, 10, 20],
    lr_all=[0.002, 0.005],
    n_factors=[10, 20],
    reg_all=[0.4, 0.6],
)

# 3-fold grid search scored by RMSE.
gridcv = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=["rmse"],
    cv=3,
)
gridcv.fit(data)

# Inspect the search result (bare expressions; only shown when run interactively).
gridcv.best_estimator
gridcv.best_params["rmse"]
gridcv.best_score

# Train and evaluate with the best parameters found.
# NOTE(review): X_train / X_test are not defined in this cell — confirm which earlier cell provides them.
pd.DataFrame(gridcv.cv_results)
best_svd = gridcv.best_estimator["rmse"]
best_svd.fit(X_train)
predictions_bsvd = best_svd.test(X_test)
accuracy.rmse(predictions_bsvd)

In [None]:
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

# Train/validation split.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Per-user mean rating from the training part.
user_mean_rating = (
    train_data.groupby('User-ID')['Book-Rating']
    .mean()
    .rename('User-Mean-Rating')
    .reset_index()
)

# Attach it to both splits; users without a mean fall back to the global training mean.
train_data, val_data = (
    frame.merge(user_mean_rating, on='User-ID', how='left')
    for frame in (train_data, val_data)
)
mean_rating = train_data['Book-Rating'].mean()
for frame in (train_data, val_data):
    frame['User-Mean-Rating'] = frame['User-Mean-Rating'].fillna(mean_rating)

# Per-user rating count from the training part, attached to both splits.
user_rating_count = (
    train_data.groupby('User-ID')['Book-Rating']
    .count()
    .rename('User-Rating-Count')
    .reset_index()
)
train_data, val_data = (
    frame.merge(user_rating_count, on='User-ID', how='left')
    for frame in (train_data, val_data)
)

# Surprise dataset whose rating column is the per-user mean.
user_item_rating_mean = train_data.loc[:, ["User-ID", "Book-ID", 'User-Mean-Rating']]
reader = Reader(rating_scale=(0, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)

# Hyper-parameter grid for the search.
param_grid = dict(
    n_epochs=[5, 20, 30],
    lr_all=[0.002, 0.005, 0.007],
    n_factors=[20, 50, 100],
    reg_all=[0.02, 0.04],
)

# 3-fold grid search scored by RMSE.
gridcv = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=["rmse"],
    cv=3,
)
gridcv.fit(data_mean)

# Inspect the search result (bare expressions; only shown when run interactively).
gridcv.best_estimator
gridcv.best_params["rmse"]
gridcv.best_score

### Cross Validation

In [None]:
# Cross-validate the selected SVD model (RMSE only, two worker processes).
cv_result = cross_validate(
    best_svd,
    data,
    measures=["rmse"],
    n_jobs=2,
    verbose=True,
)

# Tabulate the per-fold results.
pd.DataFrame(cv_result)

def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated items for every user.

    Args:
        predictions: iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        n: maximum number of items kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each ``uid`` to a list of
        ``(iid, est)`` pairs sorted by ``est`` in descending order and
        truncated to ``n`` entries. Ties keep their original relative order.
    """
    # Imported locally: no visible cell of the notebook imports defaultdict,
    # so the function raised NameError on first use.
    from collections import defaultdict

    # Collect every (item, estimate) pair under its user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort each user's list by estimate and keep only the first n entries.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

# Top-10 estimated items per user, from the predictions of the tuned SVD model.
top_num = get_top_n(predictions_bsvd, 10)

# One row per user: (uid, list of (iid, est) pairs).
df_cust_recomm = pd.DataFrame(top_num.items())
df_cust_recomm.shape

# Lookup table keyed by Book-ID, one row per distinct "ID".
# NOTE(review): `df_valid` is not defined in this cell — confirm which earlier cell provides it.
stock_desc = df_valid[
    ["ID", "Book-ID","Book-Rating"]].drop_duplicates("ID").set_index("Book-ID")
stock_desc
# NOTE(review): the index is Book-ID, but "TRAIN_000004" looks like a value of the "ID" column — verify this lookup.
stock_desc.loc["TRAIN_000004"]

# NOTE(review): the frame built from top_num.items() has no column named 'Book-ID'
# as far as this cell shows, so set_index('Book-ID') presumably raises KeyError — verify.
df_recomm = pd.DataFrame(df_cust_recomm.iloc[:, 1]).set_index('Book-ID').join(stock_desc)
df_recomm

# Distinct ratings found in the joined table.
y_pred_recomm = df_recomm["Book-Rating"].unique()
y_pred_recomm

# Load the submission template.
df_submit = pd.read_csv(f"{base_path}/sample_submission.csv")
df_submit

# NOTE(review): y_pred_recomm holds unique() values, so its length must happen
# to equal len(df_submit) for this assignment to succeed — verify.
df_submit["Book-Rating"] = abs(y_pred_recomm)
file_name = f"{base_path}/submit.csv"

# Write the file before downloading it; previously nothing in this cell created
# submit.csv, so the download had no file to send.
df_submit.to_csv(file_name, index=False)

from google.colab import files

files.download(file_name)


In [None]:
# cross validation 
# Regularization 적용 

from surprise import SVD
from surprise.model_selection import cross_validate

# User - item (book title) - rating frame.
rating_columns = ['User-ID','Book-Title', 'Book-Rating']
df = train[rating_columns]

# Reader whose scale spans the observed ratings.
book_ratings = df['Book-Rating']
r_min, r_max = book_ratings.min(), book_ratings.max()
reader = Reader(rating_scale=(r_min, r_max))

# Load the frame into Surprise.
data = Dataset.load_from_df(df[rating_columns], reader)

# Three candidates: more epochs and a higher learning rate paired with weaker regularisation.
shared_svd_kwargs = dict(n_factors=50, random_state=42)
svd_1 = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4, **shared_svd_kwargs)
svd_2 = SVD(n_epochs=15, lr_all=0.01, reg_all=0.2, **shared_svd_kwargs)
svd_3 = SVD(n_epochs=20, lr_all=0.02, reg_all=0.1, **shared_svd_kwargs)

# Only svd_2 is cross-validated; the runs for svd_1 / svd_3 are disabled.
# results_1 = cross_validate(svd_1, data, measures=['RMSE'], cv=5, n_jobs=2, verbose=False)
results_2 = cross_validate(
    svd_2,
    data,
    measures=['RMSE'],
    cv=5,
    n_jobs=2,
    verbose=False,
)
# results_3 = cross_validate(svd_3, data, measures=['RMSE'], cv=5, n_jobs=2, verbose=False)

# print("SVD 1 RMSE:", results_1['test_rmse'].mean())
print("SVD 2 RMSE:", results_2['test_rmse'].mean())
# print("SVD 3 RMSE:", results_3['test_rmse'].mean())

# Best model
best_svd = svd_2

# 딥러닝

🙋‍♂️ 가설 4: **딥 러닝 기반의 추천 시스템**이 도서 평점 예측에 효과적일 것이다.

- 결과물: **딥 러닝 모델(신경망 기반 추천 모델 또는 잠재 요인 모델)**을 구현하고, 모델의 **평점 예측 성능**을 평가한다. 다른 추천 시스템과 성능을 비교하여 딥 러닝 기반 추천 시스템의 효과를 분석한다.

In [None]:
! pip install pad_sequences
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split

# Predict the rating from the book title alone with a small 1-D CNN.
df = train_data[["User-ID", "Book-Title", "Book-Rating"]]

# Preprocessing: tokenise titles and pad every sequence to length 100.
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=5000)  # keep only the 5000 most frequent words
tokenizer.fit_on_texts(df['Book-Title'])
X = tokenizer.texts_to_sequences(df['Book-Title'])
X = pad_sequences(X, maxlen=100)

# Hold out 20% of the rows for the final evaluation.
y = df['Book-Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CNN: embedding -> conv -> max-pool -> flatten -> dense -> single linear output.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=100))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(units=10, activation='relu'))
model.add(Dense(units=1))

model.compile(loss='mean_squared_error', optimizer='adam')
# Validate on a 20% slice of the training data only. The previous call also
# passed validation_data=(X_test, y_test), which takes precedence over
# validation_split in Keras, so the held-out test split was being used for
# validation during training.
history = model.fit(X_train, y_train, batch_size=128, epochs=50, verbose=1, validation_split=0.2)


# Final RMSE on the untouched test split.
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.regularizers import l1_l2

# Fully connected regressor on the X_train / y_train arrays built by an earlier
# cell, with dropout after each of the first two hidden layers.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Use `learning_rate`: the old `lr` alias is deprecated and rejected by recent
# Keras releases; the regularised model in the next cell already uses this spelling.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train with a 20% validation slice taken from the training data.
history = model.fit(X_train, y_train, batch_size=128, epochs=500, verbose=1, validation_split=0.2)

# RMSE on the held-out split.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)
>>> 3.8194436625733723

### 정규화, Early Stopping

In [None]:
# Same architecture as the plain dense model, with L1+L2 kernel regularisation
# on every hidden layer and early stopping on the validation loss.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],),
          kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dropout(0.2),
    Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dense(1),
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Stop after 10 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=500,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# RMSE on the held-out split.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)
>>> 3.848602675184224

In [None]:
### MinMaxScaler를 이용한 콘텐츠기반딥러닝

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Load the competition files from the reco/ folder.
train, test, sub = (
    pd.read_csv(f"reco/{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Same cleaning for both frames: Age of 0 or >= 80 -> 35, publication year -1 -> 1997.
for frame in (train, test):
    implausible_age = (frame['Age'] == 0) | (frame['Age'] >= 80)
    frame['Age'] = np.where(implausible_age, 35, frame['Age'])
    frame['Year-Of-Publication'] = np.where(frame['Year-Of-Publication'] == -1, 1997, frame['Year-Of-Publication'])
# Preprocessing: one-hot encode categorical columns.
# Encoding all of 'User-ID', 'Book-ID', 'Location', 'Book-Title', 'Book-Author'
# and 'Publisher' killed the kernel, so only 'Book-Author' and 'Publisher' are
# encoded, and only their N most frequent categories are kept. The earlier
# full-cardinality get_dummies call was removed: its result was overwritten
# below before being used.
N = 200
top_N_authors = train['Book-Author'].value_counts().head(N).index
top_N_publishers = train['Publisher'].value_counts().head(N).index

# Replace every category outside the top N with 'Other' (vectorised membership test).
train['Book-Author'] = train['Book-Author'].where(train['Book-Author'].isin(top_N_authors), 'Other')
train['Publisher'] = train['Publisher'].where(train['Publisher'].isin(top_N_publishers), 'Other')

# One-hot encode the reduced categoricals.
train_encoded = pd.get_dummies(train, columns=['Book-Author', 'Publisher'])

# Scale the two numeric features to [0, 1].
scaler = MinMaxScaler()
train_encoded[['Age', 'Year-Of-Publication']] = scaler.fit_transform(train_encoded[['Age', 'Year-Of-Publication']])
from sklearn.model_selection import train_test_split

# Keep only numeric/boolean columns as model features: the categorical columns
# that were not encoded (listed above) cannot be turned into a tensor.
# NOTE(review): assumes those columns are stored as strings — confirm against the CSV schema.
X = (
    train_encoded.drop('Book-Rating', axis=1)
    .select_dtypes(include=['number', 'bool'])
    .astype('float32')
)
y = train_encoded['Book-Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Dense regressor on the encoded content features.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Stop once val_loss has not improved for 30 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=30)
history = model.fit(X_train, y_train, batch_size=128, epochs=1000, verbose=1, validation_split=0.2, callbacks=[early_stopping])
y_pred = model.predict(X_test)

# Scalar RMSE. The previous tf.keras.losses.mean_squared_error(y_test, y_pred)
# call reduced only over the last axis and broadcast y_test (n,) against
# y_pred (n, 1), so it produced one value per sample instead of a single RMSE.
# Flatten the predictions and average over all samples instead.
residuals = np.asarray(y_test, dtype='float64') - np.asarray(y_pred, dtype='float64').ravel()
mse = float(np.mean(residuals ** 2))
rmse = float(np.sqrt(mse))
print("Root Mean Squared Error:", rmse)

# LinearRegression

In [None]:
# Imported here because no visible cell of the notebook imports LinearRegression;
# the other two names are re-imported so this cell does not depend on which
# earlier cells were run.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Cast object-typed columns of `train` to the pandas 'category' dtype.
# NOTE(review): `X` below is not rebuilt from `train` in this cell, so this
# cast does not change the features used by the regression — confirm intent.
for column in train.columns:
    if train[column].dtype == 'object':
        train[column] = train[column].astype('category')

# Split into train and test sets; `X` comes from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X, train['Book-Rating'], test_size=0.2, random_state=42)

# Fit the regression model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)
>>> 3.813516138856568

# SVD++

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, SVDpp
# Load the competition files.
train, test, sub = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv", "sample_submission.csv"))

# Same cleaning for both frames: Age 0 -> 15, Age capped at 80, publication year -1 -> 1997.
for frame in (train, test):
    frame['Age'] = np.where(frame['Age'] == 0, 15, np.where(frame['Age'] >= 80, 80, frame['Age']))
    frame['Year-Of-Publication'] = np.where(frame['Year-Of-Publication'] == -1, 1997, frame['Year-Of-Publication'])

# Disabled experiment for SVD++: drop users that have a single rating.
# train = train[~train['User-ID'].isin(single_rating_users.index)]
# test = test[~test['User-ID'].isin(single_rating_users.index)]
from sklearn.model_selection import train_test_split

# Train/validation split.
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Per-user mean rating from the training part.
user_mean_rating = (
    train_data.groupby('User-ID')['Book-Rating']
    .mean()
    .rename('User-Mean-Rating')
    .reset_index()
)

# Attach it to both splits.
train_data, val_data = (
    frame.merge(user_mean_rating, on='User-ID', how='left')
    for frame in (train_data, val_data)
)

# Users without a training mean fall back to the global training mean.
mean_rating = train_data['Book-Rating'].mean()
for frame in (train_data, val_data):
    frame['User-Mean-Rating'] = frame['User-Mean-Rating'].fillna(mean_rating)

# Per-user rating count from the training part.
user_rating_count = (
    train_data.groupby('User-ID')['Book-Rating']
    .count()
    .rename('User-Rating-Count')
    .reset_index()
)

# Attach it to both splits.
train_data, val_data = (
    frame.merge(user_rating_count, on='User-ID', how='left')
    for frame in (train_data, val_data)
)

# (user, item, rating) frame for Surprise.
# NOTE(review): the rating column here is 'User-Mean-Rating', not 'Book-Rating' — confirm this is intended.
user_item_rating_mean = train_data.loc[:, ["User-ID", "Book-ID", 'User-Mean-Rating']]

# Convert to Surprise format and build a trainset from all rows.
reader = Reader(rating_scale=(0, 10))
data_mean = Dataset.load_from_df(user_item_rating_mean, reader)
trainset = data_mean.build_full_trainset()

# Build and train the SVD++ model (swap SVDpp for SVD to get the plain SVD variant).
svdpp = SVDpp(
    random_state=42,
    n_epochs=50,
    verbose=True,
)
svdpp.fit(trainset)
# Predict the validation rows with the trained SVD++ model.
val_data["predicted_rating"] = [
    svdpp.predict(raw_uid, raw_iid).est
    for raw_uid, raw_iid in zip(val_data["User-ID"], val_data["Book-ID"])
]

# Validation RMSE. Take the square root explicitly: the `squared=False`
# argument of mean_squared_error is deprecated and removed in recent
# scikit-learn releases.
from sklearn.metrics import mean_squared_error
rmse = float(np.sqrt(mean_squared_error(val_data["Book-Rating"], val_data["predicted_rating"])))
print(f"RMSE: {rmse:.4f}")

# Predict Book-Rating for the original test data.
test["Book-Rating"] = [
    svdpp.predict(raw_uid, raw_iid).est
    for raw_uid, raw_iid in zip(test["User-ID"], test["Book-ID"])
]

# Copy the predictions into the submission frame by position, so that a
# differing index can never introduce NaN through index alignment.
# NOTE(review): assumes sample_submission.csv lists the rows in the same order as test.csv — confirm.
sub = sub.copy()
sub["Book-Rating"] = test["Book-Rating"].to_numpy()

# Save the result as a CSV file named after the validation RMSE.
sub.to_csv(f"svd{rmse:.4f}.csv", index=False)
```