## Cargar Librerías

In [21]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
# Aliased: the unqualified name `train_test_split` below is Surprise's splitter.
from sklearn.model_selection import train_test_split as sk_train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate, train_test_split

## Cargar Datasets

In [22]:
# Read the four cleaned CSV files, in this order: books, ratings, tags, to-read list.
df_books, df_ratings, df_tags, df_to_read = (
    pd.read_csv(csv_path)
    for csv_path in (
        'clean_datasets/books.csv',
        'clean_datasets/ratings.csv',
        'clean_datasets/tags.csv',
        'clean_datasets/to_read.csv',
    )
)

## Modelo Filtrado Colaborativo

In [31]:
# Wrap the (user, book, rating) triples in a Surprise dataset; ratings are
# declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    df_ratings.loc[:, ['user_id', 'book_id', 'rating']],
    reader,
)

# SVD matrix-factorisation model with the hyper-parameters used in this notebook.
svd_model = SVD(
    n_factors=100,
    n_epochs=40,
    lr_all=0.01,
    reg_all=0.2,
)

## Evaluación modelo filtrado colaborativo

In [None]:
# Score the SVD model with 5-fold cross-validation, reporting RMSE and MAE
# for every fold (verbose output).
cross_validate(
    svd_model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

## Modelo Filtrado Basado en Contenido

### Preparar datos

In [4]:
# Keep only the book columns used by the content-based model.
subset_books = df_books[['book_id', 'title', 'authors', 'average_rating', 'original_publication_year']]

# Collapse the tags to one row per book: all of a book's tag names are
# joined into a single space-separated string.
subset_tags = (
    df_tags[['tag_name', 'book_id']]
    .groupby('book_id')['tag_name']
    .apply(' '.join)
    .reset_index()
)

# Attach the tag string to each book; the inner join keeps only books that
# have at least one tag.
subset_books_profile = subset_books.merge(subset_tags, on='book_id', how='inner')


### Preprocesar datos para el entrenamiento y evaluación de modelo

In [None]:
def preprocess_data(books_df, author_weight, tag_weight):
    """Build the dense feature matrix used by the content-based model.

    Each output row describes the book at the same row position of
    ``books_df`` and is the horizontal concatenation of:

    1. a one-hot author vector whose single non-zero entry is ``author_weight``;
    2. the raw ``average_rating`` and the min-max normalised
       ``original_publication_year``;
    3. the TF-IDF vector of ``tag_name`` (English stop words removed),
       multiplied by ``tag_weight``.

    Side effect: a ``year_normalized`` column is written into ``books_df``.

    Args:
        books_df: DataFrame with the columns ``authors``, ``tag_name``,
            ``average_rating`` and ``original_publication_year``.
        author_weight: Scale applied to the one-hot author block.
        tag_weight: Scale applied to the TF-IDF tag block.

    Returns:
        A 2-D NumPy array with one row per row of ``books_df``.
    """
    n_books = len(books_df)

    # Map every distinct author string to a column, in order of first appearance.
    unique_authors = books_df['authors'].unique()
    author_to_index = {author: idx for idx, author in enumerate(unique_authors)}
    author_columns = np.array(
        [author_to_index[author] for author in books_df['authors']],
        dtype=np.intp,
    )

    # Write the weight straight into the one-hot slots instead of filling with
    # 1.0 in a Python loop and then rescaling the whole dense matrix.
    author_embeddings = np.zeros((n_books, len(unique_authors)))
    author_embeddings[np.arange(n_books), author_columns] = author_weight

    # TF-IDF over the space-joined tag strings, scaled by the tag weight.
    tfidf = TfidfVectorizer(stop_words='english')
    tags_tfidf_matrix = tfidf.fit_transform(books_df['tag_name']).toarray() * tag_weight

    # Min-max normalise the publication year. When every book shares the same
    # year the range is 0; dividing would turn the whole column into NaN, so
    # the (all-zero) offsets are used directly in that case.
    years = books_df['original_publication_year']
    year_min = years.min()
    year_range = years.max() - year_min
    if year_range == 0:
        books_df['year_normalized'] = (years - year_min).astype(float)
    else:
        books_df['year_normalized'] = (years - year_min) / year_range

    # Concatenate all feature groups column-wise into one matrix.
    X = np.hstack([
        author_embeddings,  # weighted one-hot authors
        books_df[['average_rating', 'year_normalized']].values,  # rating and year
        tags_tfidf_matrix  # weighted TF-IDF tags
    ])

    return X

### Construcción del modelo siamés

In [None]:
def build_siamese_model(input_shape):
    """Build and compile a siamese network over two book feature vectors.

    Both inputs go through the same dense encoder (512-256-128-64 units,
    ReLU), so the weights are shared. The model outputs the *squared*
    Euclidean distance between the two 64-dimensional encodings; no square
    root is taken. It is compiled with the Adam optimizer, mean-squared-error
    loss and an MAE metric.

    Args:
        input_shape: Number of features per book as an integer, or a shape
            tuple that excludes the batch dimension.

    Returns:
        The compiled ``tf.keras.Model`` taking ``[input_1, input_2]``.
    """
    # Callers pass a bare feature count; normalise it to a shape tuple so the
    # input layers get the form Keras documents for `shape`.
    if isinstance(input_shape, (int, np.integer)):
        feature_shape = (int(input_shape),)
    else:
        feature_shape = tuple(input_shape)

    # Shared encoder: the same instance is applied to both inputs.
    base_model = tf.keras.Sequential([
        tf.keras.Input(shape=feature_shape),
        tf.keras.layers.Dense(512, activation='relu'),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu')
    ])

    # One input per book of the pair.
    input_1 = tf.keras.layers.Input(shape=feature_shape)
    input_2 = tf.keras.layers.Input(shape=feature_shape)

    # Encode both books with the shared encoder.
    encoded_1 = base_model(input_1)
    encoded_2 = base_model(input_2)

    # Squared Euclidean distance between the two encodings (sum of squared
    # differences along the feature axis).
    distance = tf.keras.layers.Lambda(lambda embeddings: tf.reduce_sum(tf.square(embeddings[0] - embeddings[1]), axis=1))([encoded_1, encoded_2])

    # Full model: two inputs, one distance output.
    siamese_model = tf.keras.Model(inputs=[input_1, input_2], outputs=distance)

    siamese_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    return siamese_model


### Filtros para recomendaciones

In [None]:

def is_first_book_in_series(title):
    """Tell whether a title is a series opener or a standalone book.

    A title with no ``#<digit>`` marker is treated as standalone and returns
    True. Otherwise the title must contain ``#1`` not followed by another
    digit, so ``#10``-``#19`` are no longer mistaken for the first volume.

    Args:
        title: Book title string, e.g. ``"The Hunger Games (The Hunger Games, #1)"``.

    Returns:
        True for standalone titles and first volumes, False otherwise.
    """
    if re.search(r'#\d', title) is None:
        return True
    return re.search(r'#1(?!\d)', title) is not None

# Drop repeated titles and series bundles (box sets, complete collections).
def filter_duplicate_titles(recommended_indices, books_df):
    """Remove bundle editions and repeated titles from a ranked index list.

    An index is kept when its title contains neither "boxset" nor
    "complete collection" (case-insensitive) and the exact title has not
    already been kept. Input order is preserved.

    Args:
        recommended_indices: Iterable of row positions into ``books_df``.
        books_df: DataFrame with a ``title`` column.

    Returns:
        List of the surviving row positions.
    """
    bundle_markers = ("boxset", "complete collection")
    kept_indices = []
    known_titles = set()

    for position in recommended_indices:
        book_title = books_df.iloc[position]['title']
        lowered = book_title.lower()
        if any(marker in lowered for marker in bundle_markers):
            continue
        if book_title in known_titles:
            continue
        known_titles.add(book_title)
        kept_indices.append(position)

    return kept_indices

def apply_diversity_filter(recommended_indices, books_df, max_books_per_author=2, top_n=5, prioritize_first_books=True, require_independent_books=2):
    """Pick up to ``top_n`` recommendations while limiting books per author.

    A book counts as "independent" when ``is_first_book_in_series`` accepts
    its title (standalone book or first volume).

    First pass: walk the ranked indices and keep a book if its author has
    fewer than ``max_books_per_author`` picks so far and, when
    ``prioritize_first_books`` is set, the book is independent.

    Second pass: if fewer than ``require_independent_books`` independent books
    were kept, add further independent books from the ranking. Books already
    selected are never added twice. If the list is already full, the
    lowest-ranked non-independent pick is dropped to make room, so the
    guarantee is not lost to the final ``top_n`` cut. The second pass does not
    apply the per-author cap.

    Args:
        recommended_indices: Row positions into ``books_df``, best first.
        books_df: DataFrame with ``authors`` and ``title`` columns.
        max_books_per_author: Per-author cap used in the first pass.
        top_n: Maximum number of positions returned.
        prioritize_first_books: If True, the first pass keeps only
            independent books.
        require_independent_books: Minimum number of independent books to aim
            for in the result.

    Returns:
        List of at most ``top_n`` row positions, without repetitions.
    """
    selected = []
    chosen = set()
    independent_selected = set()
    books_per_author = {}

    # First pass: ranked walk with the per-author cap.
    for idx in recommended_indices:
        if len(selected) >= top_n:
            break
        if idx in chosen:
            continue

        row = books_df.iloc[idx]
        author = row['authors']
        if books_per_author.get(author, 0) >= max_books_per_author:
            continue

        independent = is_first_book_in_series(row['title'])
        if prioritize_first_books and not independent:
            continue

        selected.append(idx)
        chosen.add(idx)
        books_per_author[author] = books_per_author.get(author, 0) + 1
        if independent:
            independent_selected.add(idx)

    # Second pass: top up with independent books until the minimum is met.
    if len(independent_selected) < require_independent_books:
        for idx in recommended_indices:
            if len(independent_selected) >= require_independent_books:
                break
            if idx in chosen:
                continue  # already recommended; adding it again would duplicate it
            if not is_first_book_in_series(books_df.iloc[idx]['title']):
                continue

            if len(selected) >= top_n:
                # Full list: evict the lowest-ranked non-independent pick.
                for pos in range(len(selected) - 1, -1, -1):
                    if selected[pos] not in independent_selected:
                        chosen.discard(selected.pop(pos))
                        break
                else:
                    break  # nothing can be evicted, so the minimum is unreachable

            selected.append(idx)
            chosen.add(idx)
            independent_selected.add(idx)

    return selected[:top_n]



### Obtención de recomendaciones

In [None]:

# Look up a book's row position by its book_id
def get_book_index_by_id(book_id, books_df):
    """Return the row position of the first row whose ``book_id`` matches.

    The result is a 0-based *position*, not an index label, so it can be used
    both with ``DataFrame.iloc`` and to index a feature matrix built row by
    row from the same frame. For a frame with the default RangeIndex the
    position and the label are the same value.

    Args:
        book_id: Identifier to look for in the ``book_id`` column.
        books_df: DataFrame with a ``book_id`` column.

    Returns:
        The matching row position as a Python ``int``.

    Raises:
        ValueError: If no row has the given ``book_id``.
    """
    matches = np.flatnonzero((books_df['book_id'] == book_id).to_numpy())
    if matches.size == 0:
        raise ValueError(f"El book_id {book_id} no se encontró en el DataFrame")
    return int(matches[0])

# Compute recommendations by scoring the query book against X in batches
def get_recommendations_with_batches(book_id, X, model, books_df, batch_size, top_n, max_books_per_author):
    """Recommend the books whose model distance to ``book_id`` is smallest.

    The query book's feature row is paired with every row of ``X`` in chunks
    of ``batch_size`` and ``model.predict`` returns one distance per pair.
    The query book itself is removed from the ranking (it would otherwise
    always come first with distance 0). The ``2 * top_n`` closest candidates
    then go through ``filter_duplicate_titles`` and
    ``apply_diversity_filter``.

    Device placement is left to the caller; this function does not select a
    device itself.

    Args:
        book_id: Identifier of the query book.
        X: 2-D feature matrix whose rows are aligned with ``books_df``.
        model: Object whose ``predict([left, right])`` returns one distance
            per row pair.
        books_df: DataFrame with ``book_id``, ``title`` and ``authors``.
        batch_size: Number of candidate rows scored per ``predict`` call.
        top_n: Maximum number of recommendations returned.
        max_books_per_author: Per-author cap passed to the diversity filter.

    Returns:
        List of row positions into ``books_df`` (at most ``top_n``).

    Raises:
        ValueError: If ``book_id`` is not present in ``books_df``.
    """
    book_index = get_book_index_by_id(book_id, books_df)
    book_vector = X[book_index].reshape(1, -1)

    # Score candidates chunk by chunk so only one tiled copy of the query
    # vector per chunk is held in memory.
    chunk_distances = []
    for start in range(0, len(X), batch_size):
        batch = X[start:start + batch_size]
        book_batch = np.tile(book_vector, (len(batch), 1))
        chunk_distances.append(np.asarray(model.predict([book_batch, batch])).reshape(-1))
    distances = np.concatenate(chunk_distances) if chunk_distances else np.empty(0)

    # Rank by ascending distance and drop the query book before truncating,
    # so it never takes up a candidate slot.
    ranked = np.argsort(distances)
    ranked = ranked[ranked != book_index]
    recommended_indices = ranked[:top_n * 2]

    # Remove duplicates / bundle editions, then apply the diversity filter.
    filtered_recommendations = filter_duplicate_titles(recommended_indices, books_df)
    final_recommendations = apply_diversity_filter(filtered_recommendations, books_df, max_books_per_author, top_n, prioritize_first_books=True, require_independent_books=2)

    return final_recommendations

### Proceso completo

In [None]:

# Build the feature matrix for the content-based model
X = preprocess_data(subset_books_profile, author_weight=0.05, tag_weight=3.0)

# Split the feature rows into train and test sets. scikit-learn's splitter is
# used (imported as `sk_train_test_split`): the plain `train_test_split` name
# in this notebook is Surprise's, which works on Surprise datasets and cannot
# split a NumPy array.
X_train, X_test = sk_train_test_split(X, test_size=0.2, random_state=42)

# Use the first GPU when TensorFlow can see one, otherwise run on the CPU.
compute_device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'

# Build and train the siamese model
content_model = build_siamese_model(X_train.shape[1])
# NOTE(review): both inputs are the same array, so each pair is (x, x). The
# shared encoder then yields a distance of exactly 0 against a target of 0,
# which means zero loss and no weight updates. Training on real similar and
# dissimilar pairs is needed for the encoder to learn anything; confirm the
# intended training scheme.
with tf.device(compute_device):
    content_model.fit([X_train, X_train], np.zeros(len(X_train)), epochs=10, batch_size=128, validation_split=0.1)

# Get recommendations for a book_id (not a row index)
book_id = 2767052  # example book_id
with tf.device(compute_device):
    recommended_books_indices = get_recommendations_with_batches(
        book_id, X, content_model, subset_books_profile, batch_size=256, top_n=10, max_books_per_author=2
    )

# Show the recommended titles
recommended_books = subset_books_profile.iloc[recommended_books_indices]['title'].values
print("Recomendaciones por book_id:")
for book in recommended_books:
    print("-", book)



## Modelo Híbrido