In [None]:
!pip install torch_geometric

# 1. 데이터 로드 및 준비

In [None]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.utils import to_undirected
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load the precomputed KoBERT embedding matrices and the book/movie metadata tables.
book_embeddings = pd.read_csv('/kaggle/input/bookembedding/Book_Embedding_KoBERT.csv').to_numpy()
movie_embeddings = pd.read_csv('/kaggle/input/movieembedding/Movie_Embedding_KoBERT.csv').to_numpy()
book_data = pd.read_excel('/kaggle/input/book-data/Books_Data.xlsx')
movie_data = pd.read_csv('/kaggle/input/moviedata/KoBERT_movie_keyword.csv')

# Standardise each embedding matrix column-wise and convert it to a float32 tensor.
# The same scaler object is refit for each matrix (books first, then movies), so after
# this cell `scaler` holds the statistics of the movie embeddings only.
scaler = StandardScaler()
book_embeddings = torch.tensor(scaler.fit(book_embeddings).transform(book_embeddings), dtype=torch.float32)
movie_embeddings = torch.tensor(scaler.fit(movie_embeddings).transform(movie_embeddings), dtype=torch.float32)

# Ratings are taken as-is from the '평점' column (no normalisation is applied here).
book_ratings = torch.tensor(book_data['평점'].to_numpy(), dtype=torch.float32)
movie_ratings = torch.tensor(movie_data['평점'].to_numpy(), dtype=torch.float32)

# Append the rating, scaled by `rating_weight`, as one extra trailing feature column.
rating_weight = 2
book_features = torch.cat((book_embeddings, rating_weight * book_ratings.unsqueeze(1)), dim=1)
movie_features = torch.cat((movie_embeddings, rating_weight * movie_ratings.unsqueeze(1)), dim=1)

# Hold out 20% of each catalogue; only the training portion is kept here.
train_movie_features, _, train_movie_ratings, _ = train_test_split(
    movie_features, movie_ratings, test_size=0.2, random_state=42
)
train_book_features, _, train_book_ratings, _ = train_test_split(
    book_features, book_ratings, test_size=0.2, random_state=42
)


def _pad_feature_columns(features, target_dim):
    """Right-pad `features` with zero columns until it has `target_dim` columns.

    Returns `features` unchanged when it is already at least `target_dim` wide.
    """
    padding_size = target_dim - features.size(1)
    if padding_size <= 0:
        return features
    # new_zeros inherits dtype and device from `features`, so the concat never mixes them.
    zero_columns = features.new_zeros(features.size(0), padding_size)
    return torch.cat([features, zero_columns], dim=1)


# Bring both feature matrices to a common width (the wider of the two).
max_dim = max(train_movie_features.size(1), train_book_features.size(1))
train_movie_features = _pad_feature_columns(train_movie_features, max_dim)
train_book_features = _pad_feature_columns(train_book_features, max_dim)

# 2. 그래프 데이터 구성

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

# Helper: append a weighted rating column to an embedding matrix
def combine_embeddings_with_ratings(embeddings, ratings, rating_weight=2.0):
    """Return `embeddings` with one extra trailing column holding the weighted ratings.

    Args:
        embeddings: 2-D tensor with one row per item.
        ratings: 1-D tensor with one rating per row of `embeddings`.
        rating_weight: factor the ratings are multiplied by before being appended.

    Returns:
        Tensor with the same rows as `embeddings` and one additional column.
    """
    weighted_column = torch.unsqueeze(ratings, dim=1) * rating_weight
    return torch.cat((embeddings, weighted_column), dim=1)

# Batched movie -> book edge construction based on cosine similarity
def create_edges_in_batches(movie_features, book_features, top_k=5, batch_size=1024):
    """Connect every movie to its `top_k` most cosine-similar books.

    Nodes are numbered with movies first (0 .. num_movies-1) followed by books
    (num_movies .. num_movies+num_books-1). The directed movie -> book edges are
    passed through `to_undirected` before being returned.

    Args:
        movie_features: 2-D tensor, one row per movie.
        book_features: 2-D tensor, one row per book, same width as `movie_features`.
        top_k: number of books linked to each movie; clamped to the number of books.
        batch_size: number of movies whose similarities are computed at once.

    Returns:
        Long tensor of shape (2, num_edges) on the CPU. An empty (2, 0) tensor is
        returned when there are no movies, no books, or `top_k` is not positive.
    """
    num_movies = movie_features.size(0)
    num_books = book_features.size(0)

    # topk raises when k exceeds the row length, so never ask for more books than exist.
    k = min(top_k, num_books)
    if num_movies == 0 or k <= 0:
        return torch.empty((2, 0), dtype=torch.long)

    # Cosine similarity of unit vectors is a plain matrix product. This keeps the largest
    # intermediate at (batch, num_books) instead of the (batch, num_books, dim) tensor a
    # broadcasted F.cosine_similarity builds. eps matches F.cosine_similarity's default;
    # values agree with it up to floating-point rounding.
    book_unit = F.normalize(book_features, p=2, dim=1, eps=1e-8)

    edge_chunks = []
    for start_idx in range(0, num_movies, batch_size):
        movie_batch = movie_features[start_idx:start_idx + batch_size]
        cos_sim = F.normalize(movie_batch, p=2, dim=1, eps=1e-8) @ book_unit.t()

        # (batch, k) book indices per movie, most similar first; kept on the CPU like
        # the list-built edge tensor this replaces.
        top_books = cos_sim.topk(k, dim=1, largest=True).indices.cpu()

        # Repeat each global movie id k times so it lines up with its k selected books.
        movie_ids = torch.arange(start_idx, start_idx + top_books.size(0)).unsqueeze(1).expand(-1, k)

        # Book node ids are offset by num_movies because books follow movies in the node list.
        edge_chunks.append(torch.stack([movie_ids.reshape(-1), top_books.reshape(-1) + num_movies]))

    edge_index = torch.cat(edge_chunks, dim=1).contiguous()
    return to_undirected(edge_index)

# The training feature matrices already end with the weighted rating column that was
# appended when the features were built during data preparation. Calling
# combine_embeddings_with_ratings on them again would append that column a second
# time, so they are used as-is.
train_movie_combined = train_movie_features
train_book_combined = train_book_features

# Link each movie to its 3 most similar books (similarities computed 1024 movies at a time).
combined_edge_index = create_edges_in_batches(train_movie_combined, train_book_combined, top_k=3, batch_size=1024)

# Assemble the graph: movie nodes come first, book nodes follow, matching the node
# numbering used by create_edges_in_batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x_train = torch.cat([train_movie_combined, train_book_combined], dim=0).to(device)
combined_edge_index = combined_edge_index.to(device)

# Wrap node features and edges in a PyG Data object.
data_train = Data(x=x_train, edge_index=combined_edge_index)

# Quick summary of the graph.
print(data_train)

# 3. 모델 학습

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data

# Two-layer graph attention network
class GATNet(torch.nn.Module):
    """Two stacked GATConv layers with batch norm, ELU and dropout in between.

    Args:
        input_dim: width of the input node features.
        hidden_dim: per-head output width of the first attention layer.
        output_dim: width of the node features returned by `forward`.
        heads: number of attention heads in the first layer (the second uses one).
        dropout: rate passed to both GATConv layers and used by the dropout layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, heads=8, dropout=0.2):
        super().__init__()
        # Width fed to batch norm and to the second layer: the first layer is built with concat=True.
        concat_width = hidden_dim * heads
        self.gat1 = GATConv(input_dim, hidden_dim, heads=heads, concat=True, dropout=dropout)
        self.gat2 = GATConv(concat_width, output_dim, heads=1, concat=False, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)
        self.batch_norm1 = torch.nn.BatchNorm1d(concat_width)

    def forward(self, x, edge_index):
        """Run gat1 -> batch norm -> ELU -> dropout -> gat2 and return the result."""
        hidden = self.batch_norm1(self.gat1(x, edge_index))
        hidden = self.dropout(F.elu(hidden))
        return self.gat2(hidden, edge_index)

# Model, optimiser and loss. The output width equals the input width because the
# network is trained to reproduce the node features it receives.
input_dim = x_train.shape[1]
hidden_dim = 128
output_dim = input_dim

model = GATNet(input_dim, hidden_dim, output_dim, heads=8, dropout=0.3)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

# Full-graph training: one optimiser step per epoch, MSE between output and input features.
# No early stopping is implemented; the loop always runs for the full epoch count.
NUM_EPOCHS = 5000
LOG_EVERY = 100

model.train()
for epoch in range(1, NUM_EPOCHS + 1):
    optimizer.zero_grad()

    out = model(data_train.x, data_train.edge_index)
    loss = criterion(out, data_train.x)

    loss.backward()
    optimizer.step()

    # Report the loss every LOG_EVERY epochs.
    if not epoch % LOG_EVERY:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')


# 4. 모델 평가

In [None]:
from sklearn.model_selection import train_test_split

# Split each metadata frame together with its feature tensor so rows stay aligned.
# Note: this rebinds train_movie_features / train_book_features to the result of
# splitting the full feature tensors again.
split_options = dict(test_size=0.2, random_state=42)

(movie_data_train, movie_data_test,
 train_movie_features, test_movie_features) = train_test_split(movie_data, movie_features, **split_options)

(book_data_train, book_data_test,
 train_book_features, test_book_features) = train_test_split(book_data, book_features, **split_options)

# Sanity check: every training metadata row must have exactly one feature row.
assert movie_data_train.shape[0] == train_movie_features.shape[0], "영화 데이터의 길이와 학습 영화 임베딩의 길이가 다릅니다."
assert book_data_train.shape[0] == train_book_features.shape[0], "도서 데이터의 길이와 학습 도서 임베딩의 길이가 다릅니다."

# Recommendation helper (books recommended for each movie)
def recommend_book_for_movie(model, movie_embeddings, all_book_embeddings, top_k=1):
    """Return, for every movie, the indices of its `top_k` nearest books.

    Distance is the Minkowski distance with p=10 between the given feature vectors.
    NOTE(review): `model` is only switched to eval mode; the distances are computed on
    the features passed in, not on model outputs — confirm this is intended.

    Args:
        model: torch module; put into eval mode, otherwise unused.
        movie_embeddings: iterable of 1-D feature tensors (iterating a 2-D tensor yields its rows).
        all_book_embeddings: 2-D tensor, one row per candidate book.
        top_k: number of books per movie; clamped to the number of candidate books.

    Returns:
        List with one entry per movie; each entry is a list of row indices into
        `all_book_embeddings`, nearest first.
    """
    model.eval()

    # Loop-invariant: move the candidates to the module-level `device` once.
    book_embeddings = all_book_embeddings.to(device)
    # topk raises when k exceeds the number of candidates.
    k = min(top_k, book_embeddings.size(0))

    recommended_books = []
    with torch.no_grad():
        for movie_embedding in movie_embeddings:
            query = movie_embedding.to(device).unsqueeze(0)  # add batch dimension -> (1, dim)

            distances = torch.cdist(query, book_embeddings, p=10)
            top_k_indices = distances.topk(k, largest=False).indices

            # Drop only the batch axis: a bare squeeze() turns the top_k == 1 result into
            # a 0-d tensor, which cannot be iterated.
            recommended_books.append(top_k_indices.squeeze(0).tolist())

    return recommended_books

# Recommend three books for every training movie, using the training features only.
recommended_books_indices = recommend_book_for_movie(model, train_movie_features, train_book_features, top_k=3)

# Titles in the same row order as the training features.
movie_titles = movie_data_train['제목'].to_list()
book_titles = book_data_train['도서명'].to_list()

# Translate each list of book indices into the corresponding list of book titles.
recommended_books = [
    [book_titles[idx] for idx in indices]
    for indices in recommended_books_indices
]

# One row per movie: its title and the recommended book titles joined into one string.
num_recommendations = len(recommended_books_indices)
result_columns = {
    '영화 제목': movie_titles[:num_recommendations],
    '추천 도서': [' / '.join(titles) for titles in recommended_books],
}
result_df = pd.DataFrame(result_columns)

# Write the table to an Excel workbook in the working directory.
output_path = '영화_도서_추천_결과.xlsx'
result_df.to_excel(output_path, index=False)

print(f"Recommendation results saved to {output_path}")
