# Desenvolvimento de Modelos de Recomenda√ß√£o
3 abordagens: Baseline ‚Üí Collaborative Filtering ‚Üí H√≠brido

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pickle
from pathlib import Path

# Criar pasta para modelos
Path('models').mkdir(exist_ok=True)

## PREPARA√á√ÉO DOS DADOS

In [2]:
print("=" * 80)
print("üìÇ PREPARANDO DADOS PARA MODELAGEM")
print("=" * 80)

# Carregar dados
df = pd.read_csv('../data/processed/ratings.csv')
print(f"‚úÖ {len(df):,} avalia√ß√µes carregadas")

# Split train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"\nüìä Split dos dados:")
print(f"   Treino: {len(train_df):,} avalia√ß√µes ({len(train_df)/len(df)*100:.1f}%)")
print(f"   Teste: {len(test_df):,} avalia√ß√µes ({len(test_df)/len(df)*100:.1f}%)")

üìÇ PREPARANDO DADOS PARA MODELAGEM
‚úÖ 100,000 avalia√ß√µes carregadas

üìä Split dos dados:
   Treino: 80,000 avalia√ß√µes (80.0%)
   Teste: 20,000 avalia√ß√µes (20.0%)


### MODELO 1: BASELINE (M√©dia Global)

In [3]:
print("\n" + "=" * 80)
print("üîπ MODELO 1: BASELINE - M√©dia Global")
print("=" * 80)

class BaselineRecommender:
    """
    Prediz sempre a m√©dia global dos ratings.
    M√©todos:
        * fit(ratings_df: pd.DataFrame) -> BaselineRecommender: Treina o modelo.
        * predict(user_id: int, item_id: int) -> float: Prediz a avalia√ß√£o para um par usu√°rio-item.
        * evaluate(test_df: pd.DataFrame) -> dict: Avalia o modelo usando RMSE e MAE.
    """
    
    def __init__(self):
        self.global_mean = None
    
    def fit(self, ratings_df: pd.DataFrame) -> 'BaselineRecommender':
        """
        Treina o modelo calculando a m√©dia global dos ratings.
        Par√¢metros:
            * ratings_df: pd.DataFrame -> DataFrame contendo as avalia√ß√µes com colunas ['user_id', 'item_id', 'rating'].
        Retorna:
            * self: BaselineRecommender -> Inst√¢ncia do modelo treinado.
        """

        self.global_mean = ratings_df['rating'].mean()
        print(f"   M√©dia global treinada: {self.global_mean:.2f}")
        return self
    
    def predict(self, user_id: int, item_id: int) -> float:
        """
        Prediz a avalia√ß√£o para um par usu√°rio-item.
        Par√¢metros:
            * user_id: int -> ID do usu√°rio.
            * item_id: int -> ID do item.
        Retorna:
            * float -> Avalia√ß√£o predita (m√©dia global).
        """

        return self.global_mean
    
    def evaluate(self, test_df: pd.DataFrame) -> dict:
        """
        Avalia o modelo usando RMSE e MAE.
        Par√¢metros:
            * test_df: pd.DataFrame -> DataFrame de teste com colunas ['user_id', 'item_id', 'rating'].
        Retorna:
            * dict -> Dicion√°rio com m√©tricas {'RMSE': float, 'MAE': float}.
        """
        
        predictions = [self.predict(row['user_id'], row['item_id']) 
                      for _, row in test_df.iterrows()]
        
        rmse = np.sqrt(mean_squared_error(test_df['rating'], predictions))
        mae = mean_absolute_error(test_df['rating'], predictions)
        
        return {'RMSE': rmse, 'MAE': mae}

# Treinar baseline
baseline_model = BaselineRecommender()
baseline_model.fit(train_df)

# Avaliar
baseline_metrics = baseline_model.evaluate(test_df)
print(f"\nüìä M√©tricas no teste:")
print(f"   RMSE: {baseline_metrics['RMSE']:.4f}")
print(f"   MAE: {baseline_metrics['MAE']:.4f}")

# Salvar modelo
with open('../models/baseline_model.pkl', 'wb') as f:
    pickle.dump(baseline_model, f)
print("üíæ Modelo salvo: ../models/baseline_model.pkl")


üîπ MODELO 1: BASELINE - M√©dia Global
   M√©dia global treinada: 3.53

üìä M√©tricas no teste:
   RMSE: 1.1239
   MAE: 0.9420
üíæ Modelo salvo: ../models/baseline_model.pkl


### MODELO 2: USER-BASED COLLABORATIVE FILTERING

In [4]:
print("\n" + "=" * 80)
print("üîπ MODELO 2: USER-BASED COLLABORATIVE FILTERING")
print("=" * 80)

class UserBasedCF:
    """
    Collaborative Filtering baseado em similaridade de usu√°rios.
    
    CORRE√á√ïES IMPLEMENTADAS:
    - Valida√ß√£o de valores NaN/Inf em todas as predi√ß√µes
    - Fallback robusto para casos extremos
    - Logging de predi√ß√µes problem√°ticas
    - Normaliza√ß√£o correta dos ratings
    """
    
    def __init__(self, k_neighbors: int = 20, min_neighbors: int = 3):
        self.k_neighbors = k_neighbors
        self.min_neighbors = min_neighbors  # M√≠nimo de vizinhos para confiar na predi√ß√£o
        self.user_item_matrix = None
        self.user_similarity = None
        self.user_mean = None
        self.global_mean = None
        self.problem_predictions = []  # Para debug
    
    def fit(self, ratings_df: pd.DataFrame) -> 'UserBasedCF':
        """Treina o modelo com valida√ß√µes adicionais"""
        
        print("   Criando matriz usu√°rio-item...")
        self.user_item_matrix = ratings_df.pivot_table(
            index='user_id',
            columns='item_id',
            values='rating'
        )
        
        # Calcular m√©dias (IMPORTANTE: com valida√ß√£o)
        self.user_mean = self.user_item_matrix.mean(axis=1)
        self.global_mean = ratings_df['rating'].mean()
        
        # Verificar se h√° NaN nas m√©dias
        if self.user_mean.isna().any():
            print(f"   ‚ö†Ô∏è {self.user_mean.isna().sum()} usu√°rios sem m√©dia v√°lida")
            self.user_mean = self.user_mean.fillna(self.global_mean)
        
        # Normalizar ratings (subtrair m√©dia do usu√°rio)
        user_item_normalized = self.user_item_matrix.sub(self.user_mean, axis=0).fillna(0)
        
        print("   Calculando similaridade entre usu√°rios...")
        self.user_similarity = cosine_similarity(user_item_normalized)
        
        # Verificar se h√° NaN/Inf na matriz de similaridade
        if np.isnan(self.user_similarity).any() or np.isinf(self.user_similarity).any():
            print("   ‚ö†Ô∏è Similaridades inv√°lidas detectadas, corrigindo...")
            self.user_similarity = np.nan_to_num(self.user_similarity, nan=0.0, posinf=1.0, neginf=0.0)
        
        print(f"   ‚úÖ Matriz: {self.user_item_matrix.shape} | K-vizinhos: {self.k_neighbors}")
        print(f"   üìä M√©dia global: {self.global_mean:.2f}")
        return self
    
    def predict(self, user_id: int, item_id: int) -> float:
        """Prediz rating com valida√ß√µes robustas"""
        
        # CASO 1: Usu√°rio ou item n√£o existe ‚Üí m√©dia global
        if user_id not in self.user_item_matrix.index:
            return self.global_mean
        
        if item_id not in self.user_item_matrix.columns:
            return self.user_mean[user_id]
        
        # CASO 2: Usu√°rio j√° avaliou o item ‚Üí retornar rating conhecido
        user_rating = self.user_item_matrix.loc[user_id, item_id]
        if not np.isnan(user_rating):
            return float(user_rating)
        
        # CASO 3: Fazer predi√ß√£o baseada em vizinhos
        try:
            user_idx = self.user_item_matrix.index.get_loc(user_id)
            similarities = self.user_similarity[user_idx]
            
            # Pegar ratings dos vizinhos para este item
            item_ratings = self.user_item_matrix[item_id]
            rated_users_mask = ~item_ratings.isna()
            
            # Se ningu√©m avaliou este item ‚Üí m√©dia do usu√°rio
            if rated_users_mask.sum() == 0:
                return self.user_mean[user_id]
            
            neighbor_ratings = item_ratings[rated_users_mask]
            neighbor_similarities = similarities[rated_users_mask]
            
            # Remover similaridades negativas ou zero
            positive_mask = neighbor_similarities > 0
            if positive_mask.sum() < self.min_neighbors:
                # Poucos vizinhos confi√°veis ‚Üí usar m√©dia do usu√°rio
                return self.user_mean[user_id]
            
            neighbor_ratings = neighbor_ratings[positive_mask]
            neighbor_similarities = neighbor_similarities[positive_mask]
            
            # Pegar top-k vizinhos
            if len(neighbor_similarities) > self.k_neighbors:
                top_k_idx = np.argsort(neighbor_similarities)[-self.k_neighbors:]
                neighbor_ratings = neighbor_ratings.iloc[top_k_idx]
                neighbor_similarities = neighbor_similarities[top_k_idx]
            
            # Calcular predi√ß√£o ponderada
            total_similarity = neighbor_similarities.sum()
            
            if total_similarity == 0:
                prediction = self.user_mean[user_id]
            else:
                prediction = np.average(neighbor_ratings, weights=neighbor_similarities)
            
            # VALIDA√á√ÉO FINAL: Verificar se predi√ß√£o √© v√°lida
            if np.isnan(prediction) or np.isinf(prediction):
                self.problem_predictions.append({
                    'user_id': user_id,
                    'item_id': item_id,
                    'prediction': prediction,
                    'fallback': self.user_mean[user_id]
                })
                return self.user_mean[user_id]
            
            # Limitar predi√ß√£o ao range v√°lido (ex: 1-5 para MovieLens)
            min_rating = 1.0
            max_rating = 5.0
            prediction = np.clip(prediction, min_rating, max_rating)
            
            return float(prediction)
        
        except Exception as e:
            # Qualquer erro ‚Üí fallback seguro
            print(f"   ‚ö†Ô∏è Erro na predi√ß√£o: {e}")
            return self.user_mean[user_id]
    
    def recommend(self, user_id: int, n: int = 5) -> list[int]:
        """Retorna top-N recomenda√ß√µes"""
        if user_id not in self.user_item_matrix.index:
            return []
        
        user_ratings = self.user_item_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings.isna()].index
        
        predictions = []
        for item_id in unrated_items:
            pred_rating = self.predict(user_id, item_id)
            predictions.append((item_id, pred_rating))
        
        predictions.sort(key=lambda x: x[1], reverse=True)
        return [item_id for item_id, _ in predictions[:n]]
    
    def evaluate(self, test_df: pd.DataFrame, verbose: bool = True) -> dict:
        """Avalia modelo com diagn√≥stico detalhado"""
        
        if verbose:
            print("   Avaliando modelo no conjunto de teste...")
        
        predictions = []
        actuals = []
        invalid_count = 0
        
        for idx, row in test_df.iterrows():
            pred = self.predict(row['user_id'], row['item_id'])
            
            # Verificar validade da predi√ß√£o
            if np.isnan(pred) or np.isinf(pred):
                invalid_count += 1
                pred = self.global_mean
            
            predictions.append(pred)
            actuals.append(row['rating'])
            
            if verbose and (idx + 1) % 5000 == 0:
                print(f"      Progresso: {idx+1}/{len(test_df)} predi√ß√µes")
        
        # Calcular m√©tricas
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        mae = mean_absolute_error(actuals, predictions)
        
        # Diagn√≥stico adicional
        predictions_array = np.array(predictions)
        
        if verbose:
            print(f"\n   üîç Diagn√≥stico das predi√ß√µes:")
            print(f"      Predi√ß√µes inv√°lidas: {invalid_count} ({invalid_count/len(predictions)*100:.2f}%)")
            print(f"      Predi√ß√µes m√≠nima: {predictions_array.min():.2f}")
            print(f"      Predi√ß√µes m√°xima: {predictions_array.max():.2f}")
            print(f"      Predi√ß√µes m√©dia: {predictions_array.mean():.2f}")
            print(f"      Predi√ß√µes std: {predictions_array.std():.2f}")
        
        return {
            'RMSE': rmse,
            'MAE': mae,
            'invalid_predictions': invalid_count,
            'predictions_stats': {
                'min': float(predictions_array.min()),
                'max': float(predictions_array.max()),
                'mean': float(predictions_array.mean()),
                'std': float(predictions_array.std())
            }
        }

# Treinar modelo
cf_model = UserBasedCF(k_neighbors=30)
cf_model.fit(train_df)

# Avaliar
cf_metrics = cf_model.evaluate(test_df)
print(f"\nüìä M√©tricas no teste:")
print(f"   RMSE: {cf_metrics['RMSE']:.4f}")
print(f"   MAE: {cf_metrics['MAE']:.4f}")

# Compara√ß√£o com baseline
print(f"\nüìà Melhoria sobre baseline:")
print(f"   RMSE: {(1 - cf_metrics['RMSE']/baseline_metrics['RMSE'])*100:+.2f}%")
print(f"   MAE: {(1 - cf_metrics['MAE']/baseline_metrics['MAE'])*100:+.2f}%")

# Salvar modelo
with open('../models/collaborative_filtering_model.pkl', 'wb') as f:
    pickle.dump(cf_model, f)
print("üíæ Modelo salvo: ../models/collaborative_filtering_model.pkl")



üîπ MODELO 2: USER-BASED COLLABORATIVE FILTERING
   Criando matriz usu√°rio-item...
   Calculando similaridade entre usu√°rios...
   ‚úÖ Matriz: (943, 1653) | K-vizinhos: 30
   üìä M√©dia global: 3.53
   Avaliando modelo no conjunto de teste...
      Progresso: 70000/20000 predi√ß√µes
      Progresso: 10000/20000 predi√ß√µes
      Progresso: 25000/20000 predi√ß√µes
      Progresso: 100000/20000 predi√ß√µes

   üîç Diagn√≥stico das predi√ß√µes:
      Predi√ß√µes inv√°lidas: 0 (0.00%)
      Predi√ß√µes m√≠nima: 1.00
      Predi√ß√µes m√°xima: 4.95
      Predi√ß√µes m√©dia: 3.52
      Predi√ß√µes std: 0.58

üìä M√©tricas no teste:
   RMSE: 0.9953
   MAE: 0.7900

üìà Melhoria sobre baseline:
   RMSE: +11.44%
   MAE: +16.13%
üíæ Modelo salvo: ../models/collaborative_filtering_model.pkl
