# Version 1

In [None]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_undirected
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

# Load the embedding tables and the book/movie metadata.
book_embeddings = pd.read_csv('/kaggle/input/bookembedding/Book_Embedding_KoBERT.csv').values
movie_embeddings = pd.read_csv('/kaggle/input/movieembedding/Movie_Embedding_KoBERT.csv').values
book_data = pd.read_excel('/kaggle/input/book-data/Books_Data.xlsx')
movie_data = pd.read_csv('/kaggle/input/moviedata/KoBERT_movie_keyword.csv')

# Standardize each embedding table (the scaler is refit per table, books
# first) and convert the result straight to a float32 tensor.
scaler = StandardScaler()
book_embeddings = torch.tensor(scaler.fit_transform(book_embeddings), dtype=torch.float32)
movie_embeddings = torch.tensor(scaler.fit_transform(movie_embeddings), dtype=torch.float32)

# Read the rating column of each table and z-score it.
book_ratings = torch.tensor(book_data['평점'].values, dtype=torch.float32)
movie_ratings = torch.tensor(movie_data['평점'].values, dtype=torch.float32)
book_ratings = book_ratings.sub(book_ratings.mean()).div(book_ratings.std())
movie_ratings = movie_ratings.sub(movie_ratings.mean()).div(movie_ratings.std())

# Append the weighted rating as one extra feature column.
rating_weight = 1
book_features = torch.cat((book_embeddings, rating_weight * book_ratings[:, None]), dim=1)
movie_features = torch.cat((movie_embeddings, rating_weight * movie_ratings[:, None]), dim=1)

# Train/test split; only the training portion is kept.
train_movie_features, _, train_movie_ratings, _ = train_test_split(
    movie_features, movie_ratings, random_state=42, test_size=0.2
)
train_book_features, _, train_book_ratings, _ = train_test_split(
    book_features, book_ratings, random_state=42, test_size=0.2
)

# Zero-pad the narrower feature matrix on the right so movies and books
# share one feature width.
max_dim = max(train_movie_features.shape[1], train_book_features.shape[1])
if train_movie_features.shape[1] < max_dim:
    padding_size = max_dim - train_movie_features.shape[1]
    train_movie_features = F.pad(train_movie_features, (0, padding_size))
if train_book_features.shape[1] < max_dim:
    padding_size = max_dim - train_book_features.shape[1]
    train_book_features = F.pad(train_book_features, (0, padding_size))

# 임베딩 기반 유사도로 엣지 생성 함수
def create_edges_based_on_embeddings(movie_embeddings, book_embeddings):
    """Link every movie to its most cosine-similar book and return the edges.

    Nodes are numbered with movies first (``0 .. num_movies - 1``) and books
    after them (``num_movies .. num_movies + num_books - 1``).

    Args:
        movie_embeddings: 2-D float tensor, one row per movie.
        book_embeddings: 2-D float tensor, one row per book, with the same
            number of columns as ``movie_embeddings``.

    Returns:
        A ``[2, num_edges]`` long tensor on the CPU that contains each
        movie-book edge in both directions. When there are no movies or no
        books, an empty ``[2, 0]`` tensor is returned.
    """
    num_movies = movie_embeddings.size(0)
    num_books = book_embeddings.size(0)

    # Nothing to connect: argmax over an empty row would raise.
    if num_movies == 0 or num_books == 0:
        return torch.empty((2, 0), dtype=torch.long)

    # Cosine similarity = dot product of L2-normalized rows.
    movie_unit = F.normalize(movie_embeddings, p=2, dim=1)
    book_unit = F.normalize(book_embeddings, p=2, dim=1)
    similarity_matrix = torch.matmul(movie_unit, book_unit.t())

    # One vectorized argmax instead of a Python loop with a per-row .item().
    best_book_idx = similarity_matrix.argmax(dim=1).cpu()
    movie_idx = torch.arange(num_movies, dtype=torch.long)

    # Offset book ids so they follow the movie ids in the joint node list.
    edge_index = torch.stack([movie_idx, best_book_idx + num_movies], dim=0)
    return to_undirected(edge_index)

# Pick the compute device, then build the edges and the joint graph.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
combined_edge_index = create_edges_based_on_embeddings(
    train_movie_features,
    train_book_features,
)

# Node feature matrix: movie rows first, book rows after them.
x_train = torch.cat((train_movie_features, train_book_features), dim=0)
x_train = x_train.to(device)
data_train = Data(edge_index=combined_edge_index.to(device), x=x_train)


# 모델 + 평가


# Version 2

In [None]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GATConv
from torch_geometric.utils import to_undirected
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

# Load the embedding tables and the book/movie metadata.
book_embeddings = pd.read_csv('/kaggle/input/bookembedding/Book_Embedding_KoBERT.csv').values
movie_embeddings = pd.read_csv('/kaggle/input/movieembedding/Movie_Embedding_KoBERT.csv').values
book_data = pd.read_excel('/kaggle/input/book-data/Books_Data.xlsx')
movie_data = pd.read_csv('/kaggle/input/moviedata/KoBERT_movie_keyword.csv')

# Standardize each embedding table (the scaler is refit per table, books
# first) and convert the result straight to a float32 tensor.
scaler = StandardScaler()
book_embeddings = torch.tensor(scaler.fit_transform(book_embeddings), dtype=torch.float32)
movie_embeddings = torch.tensor(scaler.fit_transform(movie_embeddings), dtype=torch.float32)

# Read the rating column of each table and z-score it.
book_ratings = torch.tensor(book_data['평점'].values, dtype=torch.float32)
movie_ratings = torch.tensor(movie_data['평점'].values, dtype=torch.float32)
book_ratings = book_ratings.sub(book_ratings.mean()).div(book_ratings.std())
movie_ratings = movie_ratings.sub(movie_ratings.mean()).div(movie_ratings.std())

# Append the weighted rating as one extra feature column.
rating_weight = 1
book_features = torch.cat((book_embeddings, rating_weight * book_ratings[:, None]), dim=1)
movie_features = torch.cat((movie_embeddings, rating_weight * movie_ratings[:, None]), dim=1)

# Train/test split; only the training portion is kept.
train_movie_features, _, train_movie_ratings, _ = train_test_split(
    movie_features, movie_ratings, random_state=42, test_size=0.2
)
train_book_features, _, train_book_ratings, _ = train_test_split(
    book_features, book_ratings, random_state=42, test_size=0.2
)

# Zero-pad the narrower feature matrix on the right so movies and books
# share one feature width.
max_dim = max(train_movie_features.shape[1], train_book_features.shape[1])
if train_movie_features.shape[1] < max_dim:
    padding_size = max_dim - train_movie_features.shape[1]
    train_movie_features = F.pad(train_movie_features, (0, padding_size))
if train_book_features.shape[1] < max_dim:
    padding_size = max_dim - train_book_features.shape[1]
    train_book_features = F.pad(train_book_features, (0, padding_size))

# 임베딩 기반 유사도로 엣지 생성 함수
def create_edges_based_on_embeddings(movie_embeddings, book_embeddings):
    """Link every movie to its most cosine-similar book and return the edges.

    Nodes are numbered with movies first (``0 .. num_movies - 1``) and books
    after them (``num_movies .. num_movies + num_books - 1``).

    Args:
        movie_embeddings: 2-D float tensor, one row per movie.
        book_embeddings: 2-D float tensor, one row per book, with the same
            number of columns as ``movie_embeddings``.

    Returns:
        A ``[2, num_edges]`` long tensor on the CPU that contains each
        movie-book edge in both directions. When there are no movies or no
        books, an empty ``[2, 0]`` tensor is returned.
    """
    num_movies = movie_embeddings.size(0)
    num_books = book_embeddings.size(0)

    # Nothing to connect: argmax over an empty row would raise.
    if num_movies == 0 or num_books == 0:
        return torch.empty((2, 0), dtype=torch.long)

    # Cosine similarity = dot product of L2-normalized rows.
    movie_unit = F.normalize(movie_embeddings, p=2, dim=1)
    book_unit = F.normalize(book_embeddings, p=2, dim=1)
    similarity_matrix = torch.matmul(movie_unit, book_unit.t())

    # One vectorized argmax instead of a Python loop with a per-row .item().
    best_book_idx = similarity_matrix.argmax(dim=1).cpu()
    movie_idx = torch.arange(num_movies, dtype=torch.long)

    # Offset book ids so they follow the movie ids in the joint node list.
    edge_index = torch.stack([movie_idx, best_book_idx + num_movies], dim=0)
    return to_undirected(edge_index)

# Pick the compute device, then build the edges and the joint graph.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
combined_edge_index = create_edges_based_on_embeddings(
    train_movie_features,
    train_book_features,
)

# Node feature matrix: movie rows first, book rows after them.
x_train = torch.cat((train_movie_features, train_book_features), dim=0)
x_train = x_train.to(device)
data_train = Data(edge_index=combined_edge_index.to(device), x=x_train)

# 데이터셋 및 데이터로더 생성
class GraphDataset(Dataset):
    """Single-item dataset that serves one pre-built graph object."""

    def __init__(self, data):
        # data: the object to serve; stored as-is, not copied.
        self.data = data

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        """Return the wrapped object for index 0 (or -1).

        Raises:
            IndexError: for any other index. Without this check the legacy
                sequence-iteration protocol (``for g in dataset``,
                ``list(dataset)``) never terminates.
        """
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds exactly one item; got index {idx}")
        return self.data

def custom_collate_fn(batch):
    """Return the first element of ``batch`` unchanged."""
    return batch[0]


# Wrap the training graph in a dataset and a loader that yields it once.
# NOTE(review): in this cell `DataLoader` is the torch_geometric import; that
# class may replace `collate_fn` with its own collater -- confirm.
dataset = GraphDataset(data_train)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=1,
    collate_fn=custom_collate_fn,
)

# GAT 모델 정의
class GATNet(torch.nn.Module):
    """Two stacked ``GATConv`` layers with an ELU in between.

    The first layer uses 4 attention heads whose outputs are concatenated;
    the second uses a single head and maps to ``output_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        num_heads = 4
        self.gat1 = GATConv(input_dim, hidden_dim, heads=num_heads, concat=True)
        # Concatenated heads widen the hidden representation by num_heads.
        self.gat2 = GATConv(hidden_dim * num_heads, output_dim, heads=1, concat=False)

    def forward(self, x, edge_index):
        """Run both layers on node features ``x`` over ``edge_index``."""
        hidden = F.elu(self.gat1(x, edge_index))
        return self.gat2(hidden, edge_index)

# Model dimensions: the network maps features back to the input width.
input_dim = output_dim = x_train.shape[1]
hidden_dim = 64

# Model, optimizer and base reconstruction criterion.
model = GATNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4, lr=0.005)
criterion = torch.nn.MSELoss()

# 커스텀 손실 함수
def custom_loss(output, target):
    """Return ``criterion(output, target)`` plus a scaled squared-error term.

    The extra term is the sum of squared element-wise differences divided by
    the number of rows in ``output``, weighted by 0.1.
    """
    residual = output - target
    penalty = residual.abs().pow(2).sum() / output.size(0)
    return criterion(output, target) + 0.1 * penalty

# Training: the model is fit to reconstruct its own input features.
model.train()
for epoch in range(1, 501):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        x = batch.x
        edge_index = batch.edge_index

        optimizer.zero_grad()
        out = model(x, edge_index)
        loss = custom_loss(out, x)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report every 100th epoch.
    if not epoch % 100:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

# Persist model and optimizer state together with the last epoch's loss.
checkpoint_path = 'checkpoint_gat.pth'
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=total_loss,
    ),
    checkpoint_path,
)

print(f"Model saved to {checkpoint_path}")

import torch
import torch.nn.functional as F
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Switch to evaluation mode before inference.
model.eval()

# Keep the model inputs on the same device as the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_train = x_train.to(device)
combined_edge_index = combined_edge_index.to(device)

# Run inference without tracking gradients.
with torch.no_grad():
    output_embeddings = model(x_train, combined_edge_index)

# Movie nodes come first in x_train; book nodes follow.
movie_embeddings_output = output_embeddings[:train_movie_features.size(0)]
book_embeddings_output = output_embeddings[train_movie_features.size(0):]

# Move tensors to the CPU for the scikit-learn similarity computation.
movie_embeddings_output_cpu = movie_embeddings_output.cpu()
book_embeddings_output_cpu = book_embeddings_output.cpu()
similarity_matrix = cosine_similarity(movie_embeddings_output_cpu, book_embeddings_output_cpu)

# Graph positions are positions inside the *shuffled* training split, not row
# numbers of movie_data / book_data. Looking titles up by graph position would
# attach the wrong titles, so recover the original row ids first.
# train_test_split derives its permutation only from the sample count,
# test_size and random_state, so repeating the split on row numbers yields
# the rows behind train_movie_features / train_book_features. These
# parameters must mirror the feature split above.
train_movie_rows, _ = train_test_split(
    np.arange(len(movie_data)), test_size=0.2, random_state=42
)
train_book_rows, _ = train_test_split(
    np.arange(len(book_data)), test_size=0.2, random_state=42
)
if (len(train_movie_rows), len(train_book_rows)) != similarity_matrix.shape:
    raise ValueError(
        "Recovered train split sizes "
        f"({len(train_movie_rows)}, {len(train_book_rows)}) do not match the "
        f"similarity matrix shape {similarity_matrix.shape}"
    )

# Diversity-aware recommendation: a book is recommended to at most one movie.
top_k_books = 5  # Number of books to recommend per movie
recommendations = {}  # graph movie position -> list of graph book positions
used_books = set()  # Books already handed out to an earlier movie

for movie_idx in range(similarity_matrix.shape[0]):
    # Book positions ordered from most to least similar.
    sorted_indices = similarity_matrix[movie_idx].argsort()[::-1]

    # Take the best books that no earlier movie has claimed; a movie may end
    # up with fewer than top_k_books once the pool runs dry.
    diverse_recommendations = []
    for idx in sorted_indices:
        if idx not in used_books:
            diverse_recommendations.append(idx)
        if len(diverse_recommendations) == top_k_books:
            break

    recommendations[movie_idx] = diverse_recommendations
    used_books.update(diverse_recommendations)

# Title columns of the full (unsplit) metadata tables.
book_titles = book_data['도서명'].values
movie_titles = movie_data['제목'].values

recommendation_list = []

for movie_idx, book_indices in recommendations.items():
    for book_idx in book_indices:
        recommendation_list.append({
            # Translate graph positions to original metadata rows.
            '영화명': movie_titles[train_movie_rows[movie_idx]],
            '추천 도서명': book_titles[train_book_rows[book_idx]],
            '유사도': similarity_matrix[movie_idx, book_idx]
        })

# Convert to DataFrame
recommendation_df = pd.DataFrame(recommendation_list)

# Save to Excel
output_excel_path = 'book_recommendations.xlsx'
recommendation_df.to_excel(output_excel_path, index=False)

print(f"Recommendations saved to {output_excel_path}")



# Version 3

In [None]:
# 필요한 라이브러리 임포트
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

# 데이터셋 및 데이터로더 생성
class GraphDataset(Dataset):
    """Single-item dataset that serves one pre-built graph object."""

    def __init__(self, data):
        # data: the object to serve; stored as-is, not copied.
        self.data = data

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        """Return the wrapped object for index 0 (or -1).

        Raises:
            IndexError: for any other index. Without this check the legacy
                sequence-iteration protocol (``for g in dataset``,
                ``list(dataset)``) never terminates.
        """
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds exactly one item; got index {idx}")
        return self.data

def collate_fn(batch):
    """Return the first element of ``batch`` unchanged."""
    return batch[0]


# Wrap the training graph in a dataset and a loader that yields it once.
dataset = GraphDataset(data_train)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

# GAT 모델 정의
class GATNet(torch.nn.Module):
    """Two stacked ``GATConv`` layers with an ELU in between.

    The first layer uses 4 attention heads whose outputs are concatenated;
    the second uses a single head and maps to ``output_dim``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        num_heads = 4
        self.gat1 = GATConv(input_dim, hidden_dim, heads=num_heads, concat=True)
        # Concatenated heads widen the hidden representation by num_heads.
        self.gat2 = GATConv(hidden_dim * num_heads, output_dim, heads=1, concat=False)

    def forward(self, x, edge_index):
        """Run both layers on node features ``x`` over ``edge_index``."""
        hidden = F.elu(self.gat1(x, edge_index))
        return self.gat2(hidden, edge_index)

# Model dimensions: the network maps features back to the input width.
input_dim = output_dim = x_train.shape[1]
hidden_dim = 64

# Model, optimizer and base reconstruction criterion.
model = GATNet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4, lr=0.005)
criterion = torch.nn.MSELoss()

# 패널티 기반 손실 함수
def custom_loss(output, target):
    """Return ``criterion(output, target)`` plus a scaled squared-error term.

    The extra term is the sum of squared element-wise differences divided by
    the number of rows in ``output``, weighted by 0.1.
    """
    residual = output - target
    penalty = residual.abs().pow(2).sum() / output.size(0)
    return criterion(output, target) + 0.1 * penalty

# Training: the model is fit to reconstruct its own input features.
model.train()
for epoch in range(1, 2001):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        x = batch.x
        edge_index = batch.edge_index

        optimizer.zero_grad()
        out = model(x, edge_index)
        loss = custom_loss(out, x)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report every 50th epoch.
    if not epoch % 50:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

# Persist model and optimizer state together with the last epoch's loss.
checkpoint_path = 'checkpoint_gat.pth'
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=total_loss,
    ),
    checkpoint_path,
)

print(f"모델이 {checkpoint_path}에 저장되었습니다.")

# Build the movie-only model inputs: a feature tensor and an edge-index
# tensor, both placed on `device`.
# NOTE(review): `x_movie` and `edge_index_movie` are not defined in the
# visible cells -- confirm they are created elsewhere before this runs.
movie_features = torch.tensor(x_movie, dtype=torch.float32, device=device)
movie_edge_index = torch.tensor(edge_index_movie, dtype=torch.int64, device=device)

# Produce movie embeddings in evaluation mode without tracking gradients.
model.eval()
with torch.no_grad():
    movie_embeddings = model(movie_features, movie_edge_index)

# 추천 함수 (영화 -> 도서)
def recommend_books_for_movie(movie_idx, top_n=1):
    """Return the ``top_n`` best-scoring books for one movie.

    The score of a book is its cosine similarity to the movie embedding
    minus ``0.01`` times the book's value in the ``'대출건수'`` column.

    Args:
        movie_idx: Row of the module-level ``movie_embeddings`` to query.
        top_n: How many books to return. Values larger than the number of
            books are clamped to it; negative values are treated as 0.

    Returns:
        ``(recommended_books, top_indices)``: the matching ``book_data`` rows
        and their positional indices, best score first.
    """
    # NOTE(review): `movie_embeddings` and `book_embeddings` are module-level
    # globals; confirm they share a device and feature width at call time.
    movie_embedding = movie_embeddings[movie_idx].unsqueeze(0)
    similarities = F.cosine_similarity(movie_embedding, book_embeddings).cpu().numpy()

    # Penalize books in proportion to their loan count.
    loan_counts = book_data['대출건수'].values
    weighted_similarities = similarities - 0.01 * loan_counts

    # np.argpartition requires kth < len(array); clamp top_n so asking for
    # every book (or more) falls back to ranking all of them.
    num_candidates = weighted_similarities.shape[0]
    top_n = max(0, min(top_n, num_candidates))
    if top_n < num_candidates:
        top_indices = np.argpartition(-weighted_similarities, top_n)[:top_n]
    else:
        top_indices = np.arange(num_candidates)

    # argpartition leaves the selected entries unordered; sort best-first.
    top_indices = top_indices[np.argsort(-weighted_similarities[top_indices])]

    recommended_books = book_data.iloc[top_indices]
    return recommended_books, top_indices

# Produce recommendations for every movie row in movie_features.
all_recommendations = []
recommendation_history = {}

for movie_idx in range(movie_features.size(0)):
    movie_title = movie_data.iloc[movie_idx]['제목']

    # Book indices already recorded for this movie (empty on first visit).
    already_recommended = set(recommendation_history.get(movie_idx, []))

    recommended_books, top_indices = recommend_books_for_movie(movie_idx)

    for idx in top_indices:
        # Skip books already emitted for this movie.
        if idx in already_recommended:
            continue
        book = book_data.iloc[idx]
        entry = {
            'Movie Title': movie_title,
            'Book Title': book['도서명'],
            'Author': book['저자명'],
            'Publisher': book['출판사']
        }
        all_recommendations.append(entry)
        already_recommended.add(idx)

    # Remember what was emitted for this movie.
    recommendation_history[movie_idx] = list(already_recommended)

# Write the recommendation table to Excel.
recommendations_df = pd.DataFrame(all_recommendations)
recommendations_df.to_excel('movie_book_recommendations_gat.xlsx', index=False)
print("Recommendations saved to 'movie_book_recommendations_gat.xlsx'")


# Version 4

In [None]:
# 데이터셋 및 데이터로더 생성
class GraphDataset(Dataset):
    """Single-item dataset that serves one pre-built graph object."""

    def __init__(self, data):
        # data: the object to serve; stored as-is, not copied.
        self.data = data

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        """Return the wrapped object for index 0 (or -1).

        Raises:
            IndexError: for any other index. Without this check the legacy
                sequence-iteration protocol (``for g in dataset``,
                ``list(dataset)``) never terminates.
        """
        if idx not in (0, -1):
            raise IndexError(f"GraphDataset holds exactly one item; got index {idx}")
        return self.data

def collate_fn(batch):
    """Return the first element of ``batch`` unchanged."""
    return batch[0]


# Wrap the training graph in a dataset and a loader that yields it once.
dataset = GraphDataset(data_train)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

# Model dimensions: the output width equals the input feature width, with a
# 64-unit hidden layer in between.
input_dim = output_dim = x_train.shape[1]
hidden_dim = 64

# GraphSAGE 모델 정의
class GraphSAGENet(torch.nn.Module):
    """Two stacked ``SAGEConv`` layers with a ReLU in between."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        # SAGEConv is not imported by any of the visible cells (only GATConv
        # is), so import it here to avoid a NameError on construction.
        from torch_geometric.nn import SAGEConv

        super(GraphSAGENet, self).__init__()
        self.sage1 = SAGEConv(input_dim, hidden_dim)
        self.sage2 = SAGEConv(hidden_dim, output_dim)

    def forward(self, x, edge_index):
        """Run both layers on node features ``x`` over ``edge_index``."""
        x = self.sage1(x, edge_index)
        x = F.relu(x)
        x = self.sage2(x, edge_index)
        return x

# Model, optimizer and base reconstruction criterion.
model = GraphSAGENet(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-4, lr=0.005)
criterion = torch.nn.MSELoss()

# 패널티 기반 손실 함수
def custom_loss(output, target):
    """Return ``criterion(output, target)`` plus a scaled squared-error term.

    The extra term is the sum of squared element-wise differences divided by
    the number of rows in ``output``, weighted by 0.1. It depends only on
    ``output - target``; nothing here inspects which books are recommended.
    """
    residual = output - target
    penalty = residual.abs().pow(2).sum() / output.size(0)
    return criterion(output, target) + 0.1 * penalty

# Training: the model is fit to reconstruct its own input features.
model.train()
for epoch in range(1, 5001):
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        x = batch.x
        edge_index = batch.edge_index

        optimizer.zero_grad()
        out = model(x, edge_index)
        loss = custom_loss(out, x)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    # Report every 50th epoch.
    if not epoch % 50:
        print(f'Epoch {epoch}, Loss: {total_loss:.4f}')

# Persist model and optimizer state together with the last epoch's loss.
checkpoint_path = 'checkpoint_sage.pth'
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=total_loss,
    ),
    checkpoint_path,
)

print(f"모델이 {checkpoint_path}에 저장되었습니다.")

# 추천 함수 (영화 -> 도서)
def recommend_books_for_movie(movie_idx):
    """Return the single most cosine-similar book for one movie.

    Reads the module-level ``movie_embeddings``, ``book_embeddings`` and
    ``book_data``. The result is a one-row DataFrame taken from
    ``book_data``.
    """
    query = movie_embeddings[movie_idx].unsqueeze(0)
    scores = F.cosine_similarity(query, book_embeddings).cpu().numpy()

    best_position = scores.argmax()
    # Double brackets keep the result a DataFrame rather than a Series.
    return book_data.iloc[[best_position]]

# Produce one recommendation per movie row in movie_features.
all_recommendations = []

for movie_idx in range(movie_features.size(0)):
    movie_title = movie_data.iloc[movie_idx]['제목']
    recommended_book = recommend_books_for_movie(movie_idx)

    entry = {'Movie Title': movie_title}
    # Output field -> source column of the one-row recommendation frame.
    for field, column in (('Book Title', '도서명'), ('Author', '저자명'), ('Publisher', '출판사')):
        entry[field] = recommended_book[column].values[0]
    all_recommendations.append(entry)

# Write the recommendation table to Excel.
recommendations_df = pd.DataFrame(all_recommendations)
recommendations_df.to_excel('movie_book_recommendations_sage.xlsx', index=False)
print("Recommendations saved to 'movie_book_recommendations_sage.xlsx'")

# Count how often each book title was recommended.
book_recommendation_count = pd.DataFrame(all_recommendations)['Book Title'].value_counts()
recommendation_counts_df = pd.DataFrame({
    'Book Title': book_recommendation_count.index,
    'Recommendation Count': book_recommendation_count.values
})
recommendation_counts_df = recommendation_counts_df.sort_values(
    by='Recommendation Count', ascending=False
)

# Write the per-book counts to Excel.
recommendation_counts_df.to_excel('book_recommendation_counts_sage.xlsx', index=False)
print("Book recommendation counts saved to 'book_recommendation_counts_sage.xlsx'")