# Desenvolvimento de Modelos de Recomenda√ß√£o
3 abordagens: Baseline ‚Üí Collaborative Filtering ‚Üí H√≠brido

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pickle
from pathlib import Path

# Criar pasta para modelos
Path('models').mkdir(exist_ok=True)

## PREPARA√á√ÉO DOS DADOS

In [3]:
print("=" * 80)
print("üìÇ PREPARANDO DADOS PARA MODELAGEM")
print("=" * 80)

# Carregar dados
df = pd.read_csv('../data/processed/ratings.csv')
print(f"‚úÖ {len(df):,} avalia√ß√µes carregadas")

# Split train/test (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"\nüìä Split dos dados:")
print(f"   Treino: {len(train_df):,} avalia√ß√µes ({len(train_df)/len(df)*100:.1f}%)")
print(f"   Teste: {len(test_df):,} avalia√ß√µes ({len(test_df)/len(df)*100:.1f}%)")

üìÇ PREPARANDO DADOS PARA MODELAGEM
‚úÖ 100,000 avalia√ß√µes carregadas

üìä Split dos dados:
   Treino: 80,000 avalia√ß√µes (80.0%)
   Teste: 20,000 avalia√ß√µes (20.0%)


### MODELO 1: BASELINE (M√©dia Global)

In [None]:
print("\n" + "=" * 80)
print("üîπ MODELO 1: BASELINE - M√©dia Global")
print("=" * 80)

class BaselineRecommender:
    """
    Prediz sempre a m√©dia global dos ratings.
    M√©todos:
        * fit(ratings_df: pd.DataFrame) -> BaselineRecommender: Treina o modelo.
        * predict(user_id: int, item_id: int) -> float: Prediz a avalia√ß√£o para um par usu√°rio-item.
        * evaluate(test_df: pd.DataFrame) -> dict: Avalia o modelo usando RMSE e MAE.
    """
    
    def __init__(self):
        self.global_mean = None
    
    def fit(self, ratings_df: pd.DataFrame) -> 'BaselineRecommender':
        """
        Treina o modelo calculando a m√©dia global dos ratings.
        Par√¢metros:
            * ratings_df: pd.DataFrame -> DataFrame contendo as avalia√ß√µes com colunas ['user_id', 'item_id', 'rating'].
        Retorna:
            * self: BaselineRecommender -> Inst√¢ncia do modelo treinado.
        """

        self.global_mean = ratings_df['rating'].mean()
        print(f"   M√©dia global treinada: {self.global_mean:.2f}")
        return self
    
    def predict(self, user_id: int, item_id: int) -> float:
        """
        Prediz a avalia√ß√£o para um par usu√°rio-item.
        Par√¢metros:
            * user_id: int -> ID do usu√°rio.
            * item_id: int -> ID do item.
        Retorna:
            * float -> Avalia√ß√£o predita (m√©dia global).
        """

        return self.global_mean
    
    def evaluate(self, test_df: pd.DataFrame) -> dict:
        """
        Avalia o modelo usando RMSE e MAE.
        Par√¢metros:
            * test_df: pd.DataFrame -> DataFrame de teste com colunas ['user_id', 'item_id', 'rating'].
        Retorna:
            * dict -> Dicion√°rio com m√©tricas {'RMSE': float, 'MAE': float}.
        """
        
        predictions = [self.predict(row['user_id'], row['item_id']) 
                      for _, row in test_df.iterrows()]
        
        rmse = np.sqrt(mean_squared_error(test_df['rating'], predictions))
        mae = mean_absolute_error(test_df['rating'], predictions)
        
        return {'RMSE': rmse, 'MAE': mae}

# Treinar baseline
baseline_model = BaselineRecommender()
baseline_model.fit(train_df)

# Avaliar
baseline_metrics = baseline_model.evaluate(test_df)
print(f"\nüìä M√©tricas no teste:")
print(f"   RMSE: {baseline_metrics['RMSE']:.4f}")
print(f"   MAE: {baseline_metrics['MAE']:.4f}")

# Salvar modelo
with open('../models/baseline_model.pkl', 'wb') as f:
    pickle.dump(baseline_model, f)
print("üíæ Modelo salvo: ../models/baseline_model.pkl")


üîπ MODELO 1: BASELINE - M√©dia Global
   M√©dia global treinada: 3.53

üìä M√©tricas no teste:
   RMSE: 1.1239
   MAE: 0.9420
üíæ Modelo salvo: ../models/baseline_model.pkl


### MODELO 2: USER-BASED COLLABORATIVE FILTERING

In [5]:
print("\n" + "=" * 80)
print("üîπ MODELO 2: USER-BASED COLLABORATIVE FILTERING")
print("=" * 80)

class UserBasedCF:
    """
    Collaborative Filtering baseado em similaridade de usu√°rios.
    
    CORRE√á√ïES IMPLEMENTADAS:
    - Valida√ß√£o de valores NaN/Inf em todas as predi√ß√µes
    - Fallback robusto para casos extremos
    - Logging de predi√ß√µes problem√°ticas
    - Normaliza√ß√£o correta dos ratings
    """
    
    def __init__(self, k_neighbors: int = 20, min_neighbors: int = 3):
        self.k_neighbors = k_neighbors
        self.min_neighbors = min_neighbors  # M√≠nimo de vizinhos para confiar na predi√ß√£o
        self.user_item_matrix = None
        self.user_similarity = None
        self.user_mean = None
        self.global_mean = None
        self.problem_predictions = []  # Para debug
    
    def fit(self, ratings_df: pd.DataFrame) -> 'UserBasedCF':
        """Treina o modelo com valida√ß√µes adicionais"""
        
        print("   Criando matriz usu√°rio-item...")
        self.user_item_matrix = ratings_df.pivot_table(
            index='user_id',
            columns='item_id',
            values='rating'
        )
        
        # Calcular m√©dias (IMPORTANTE: com valida√ß√£o)
        self.user_mean = self.user_item_matrix.mean(axis=1)
        self.global_mean = ratings_df['rating'].mean()
        
        # Verificar se h√° NaN nas m√©dias
        if self.user_mean.isna().any():
            print(f"   ‚ö†Ô∏è {self.user_mean.isna().sum()} usu√°rios sem m√©dia v√°lida")
            self.user_mean = self.user_mean.fillna(self.global_mean)
        
        # Normalizar ratings (subtrair m√©dia do usu√°rio)
        user_item_normalized = self.user_item_matrix.sub(self.user_mean, axis=0).fillna(0)
        
        print("   Calculando similaridade entre usu√°rios...")
        self.user_similarity = cosine_similarity(user_item_normalized)
        
        # Verificar se h√° NaN/Inf na matriz de similaridade
        if np.isnan(self.user_similarity).any() or np.isinf(self.user_similarity).any():
            print("   ‚ö†Ô∏è Similaridades inv√°lidas detectadas, corrigindo...")
            self.user_similarity = np.nan_to_num(self.user_similarity, nan=0.0, posinf=1.0, neginf=0.0)
        
        print(f"   ‚úÖ Matriz: {self.user_item_matrix.shape} | K-vizinhos: {self.k_neighbors}")
        print(f"   üìä M√©dia global: {self.global_mean:.2f}")
        return self
    
    def predict(self, user_id: int, item_id: int) -> float:
        """Prediz rating com valida√ß√µes robustas"""
        
        # CASO 1: Usu√°rio ou item n√£o existe ‚Üí m√©dia global
        if user_id not in self.user_item_matrix.index:
            return self.global_mean
        
        if item_id not in self.user_item_matrix.columns:
            return self.user_mean[user_id]
        
        # CASO 2: Usu√°rio j√° avaliou o item ‚Üí retornar rating conhecido
        user_rating = self.user_item_matrix.loc[user_id, item_id]
        if not np.isnan(user_rating):
            return float(user_rating)
        
        # CASO 3: Fazer predi√ß√£o baseada em vizinhos
        try:
            user_idx = self.user_item_matrix.index.get_loc(user_id)
            similarities = self.user_similarity[user_idx]
            
            # Pegar ratings dos vizinhos para este item
            item_ratings = self.user_item_matrix[item_id]
            rated_users_mask = ~item_ratings.isna()
            
            # Se ningu√©m avaliou este item ‚Üí m√©dia do usu√°rio
            if rated_users_mask.sum() == 0:
                return self.user_mean[user_id]
            
            neighbor_ratings = item_ratings[rated_users_mask]
            neighbor_similarities = similarities[rated_users_mask]
            
            # Remover similaridades negativas ou zero
            positive_mask = neighbor_similarities > 0
            if positive_mask.sum() < self.min_neighbors:
                # Poucos vizinhos confi√°veis ‚Üí usar m√©dia do usu√°rio
                return self.user_mean[user_id]
            
            neighbor_ratings = neighbor_ratings[positive_mask]
            neighbor_similarities = neighbor_similarities[positive_mask]
            
            # Pegar top-k vizinhos
            if len(neighbor_similarities) > self.k_neighbors:
                top_k_idx = np.argsort(neighbor_similarities)[-self.k_neighbors:]
                neighbor_ratings = neighbor_ratings.iloc[top_k_idx]
                neighbor_similarities = neighbor_similarities[top_k_idx]
            
            # Calcular predi√ß√£o ponderada
            total_similarity = neighbor_similarities.sum()
            
            if total_similarity == 0:
                prediction = self.user_mean[user_id]
            else:
                prediction = np.average(neighbor_ratings, weights=neighbor_similarities)
            
            # VALIDA√á√ÉO FINAL: Verificar se predi√ß√£o √© v√°lida
            if np.isnan(prediction) or np.isinf(prediction):
                self.problem_predictions.append({
                    'user_id': user_id,
                    'item_id': item_id,
                    'prediction': prediction,
                    'fallback': self.user_mean[user_id]
                })
                return self.user_mean[user_id]
            
            # Limitar predi√ß√£o ao range v√°lido (ex: 1-5 para MovieLens)
            min_rating = 1.0
            max_rating = 5.0
            prediction = np.clip(prediction, min_rating, max_rating)
            
            return float(prediction)
        
        except Exception as e:
            # Qualquer erro ‚Üí fallback seguro
            print(f"   ‚ö†Ô∏è Erro na predi√ß√£o: {e}")
            return self.user_mean[user_id]
    
    def recommend(self, user_id: int, n: int = 5) -> list[int]:
        """Retorna top-N recomenda√ß√µes"""
        if user_id not in self.user_item_matrix.index:
            return []
        
        user_ratings = self.user_item_matrix.loc[user_id]
        unrated_items = user_ratings[user_ratings.isna()].index
        
        predictions = []
        for item_id in unrated_items:
            pred_rating = self.predict(user_id, item_id)
            predictions.append((item_id, pred_rating))
        
        predictions.sort(key=lambda x: x[1], reverse=True)
        return [item_id for item_id, _ in predictions[:n]]
    
    def evaluate(self, test_df: pd.DataFrame, verbose: bool = True) -> dict:
        """Avalia modelo com diagn√≥stico detalhado"""
        
        if verbose:
            print("   Avaliando modelo no conjunto de teste...")
        
        predictions = []
        actuals = []
        invalid_count = 0
        
        for idx, row in test_df.iterrows():
            pred = self.predict(row['user_id'], row['item_id'])
            
            # Verificar validade da predi√ß√£o
            if np.isnan(pred) or np.isinf(pred):
                invalid_count += 1
                pred = self.global_mean
            
            predictions.append(pred)
            actuals.append(row['rating'])
            
            if verbose and (idx + 1) % 5000 == 0:
                print(f"      Progresso: {idx+1}/{len(test_df)} predi√ß√µes")
        
        # Calcular m√©tricas
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        mae = mean_absolute_error(actuals, predictions)
        
        # Diagn√≥stico adicional
        predictions_array = np.array(predictions)
        
        if verbose:
            print(f"\n   üîç Diagn√≥stico das predi√ß√µes:")
            print(f"      Predi√ß√µes inv√°lidas: {invalid_count} ({invalid_count/len(predictions)*100:.2f}%)")
            print(f"      Predi√ß√µes m√≠nima: {predictions_array.min():.2f}")
            print(f"      Predi√ß√µes m√°xima: {predictions_array.max():.2f}")
            print(f"      Predi√ß√µes m√©dia: {predictions_array.mean():.2f}")
            print(f"      Predi√ß√µes std: {predictions_array.std():.2f}")
        
        return {
            'RMSE': rmse,
            'MAE': mae,
            'invalid_predictions': invalid_count,
            'predictions_stats': {
                'min': float(predictions_array.min()),
                'max': float(predictions_array.max()),
                'mean': float(predictions_array.mean()),
                'std': float(predictions_array.std())
            }
        }

# Treinar modelo
cf_model = UserBasedCF(k_neighbors=30)
cf_model.fit(train_df)

# Avaliar
cf_metrics = cf_model.evaluate(test_df)
print(f"\nüìä M√©tricas no teste:")
print(f"   RMSE: {cf_metrics['RMSE']:.4f}")
print(f"   MAE: {cf_metrics['MAE']:.4f}")

# Compara√ß√£o com baseline
print(f"\nüìà Melhoria sobre baseline:")
print(f"   RMSE: {(1 - cf_metrics['RMSE']/baseline_metrics['RMSE'])*100:+.2f}%")
print(f"   MAE: {(1 - cf_metrics['MAE']/baseline_metrics['MAE'])*100:+.2f}%")

# Salvar modelo
with open('../models/collaborative_filtering_model.pkl', 'wb') as f:
    pickle.dump(cf_model, f)
print("üíæ Modelo salvo: ../models/collaborative_filtering_model.pkl")



üîπ MODELO 2: USER-BASED COLLABORATIVE FILTERING
   Criando matriz usu√°rio-item...
   Calculando similaridade entre usu√°rios...
   ‚úÖ Matriz: (943, 1653) | K-vizinhos: 30
   üìä M√©dia global: 3.53
   Avaliando modelo no conjunto de teste...
      Progresso: 70000/20000 predi√ß√µes
      Progresso: 10000/20000 predi√ß√µes
      Progresso: 25000/20000 predi√ß√µes
      Progresso: 100000/20000 predi√ß√µes

   üîç Diagn√≥stico das predi√ß√µes:
      Predi√ß√µes inv√°lidas: 0 (0.00%)
      Predi√ß√µes m√≠nima: 1.00
      Predi√ß√µes m√°xima: 4.95
      Predi√ß√µes m√©dia: 3.52
      Predi√ß√µes std: 0.58

üìä M√©tricas no teste:
   RMSE: 0.9953
   MAE: 0.7900

üìà Melhoria sobre baseline:
   RMSE: +11.44%
   MAE: +16.13%
üíæ Modelo salvo: ../models/collaborative_filtering_model.pkl


#### TESTE PR√ÅTICO: GERAR RECOMENDA√á√ïES

In [6]:
print("\n" + "=" * 80)
print("üéØ TESTE PR√ÅTICO DE RECOMENDA√á√ïES")
print("=" * 80)

# Escolher um usu√°rio de teste
test_user = train_df['user_id'].value_counts().index[5]  # 6¬∫ usu√°rio mais ativo
print(f"\nüë§ Testando recomenda√ß√µes para usu√°rio {test_user}:")

# Ver hist√≥rico do usu√°rio
user_history = train_df[train_df['user_id'] == test_user].sort_values('rating', ascending=False)
print(f"\nüìú Hist√≥rico (top 10 ratings):")
print(user_history.head(10)[['item_id', 'rating']].to_string(index=False))

# Gerar recomenda√ß√µes
recommendations = cf_model.recommend(test_user, n=10)
print(f"\n‚≠ê Top 10 recomenda√ß√µes:")
for i, item_id in enumerate(recommendations, 1):
    pred_rating = cf_model.predict(test_user, item_id)
    print(f"   {i}. Item {item_id} (rating estimado: {pred_rating:.2f})")



üéØ TESTE PR√ÅTICO DE RECOMENDA√á√ïES

üë§ Testando recomenda√ß√µes para usu√°rio 416:

üìú Hist√≥rico (top 10 ratings):
 item_id  rating
     213       5
     684       5
      65       5
     471       5
     385       5
    1007       5
     121       5
       8       5
      83       5
     531       5

‚≠ê Top 10 recomenda√ß√µes:
   1. Item 1125 (rating estimado: 4.81)
   2. Item 113 (rating estimado: 4.78)
   3. Item 12 (rating estimado: 4.63)
   4. Item 114 (rating estimado: 4.61)
   5. Item 50 (rating estimado: 4.54)
   6. Item 313 (rating estimado: 4.52)
   7. Item 1449 (rating estimado: 4.51)
   8. Item 963 (rating estimado: 4.49)
   9. Item 170 (rating estimado: 4.43)
   10. Item 1514 (rating estimado: 4.41)


### MODELO 3: ITEM-BASED COLLABORATIVE FILTERING

In [7]:
print("\n" + "=" * 80)
print("üîπ MODELO 3: ITEM-BASED COLLABORATIVE FILTERING")
print("=" * 80)

class ItemBasedCF:
    """Collaborative Filtering baseado em similaridade de itens"""
    
    def __init__(self, k_neighbors=20):
        self.k_neighbors = k_neighbors
        self.item_similarity = None
        self.user_item_matrix = None
    
    def fit(self, ratings_df):
        print("   Criando matriz item-usu√°rio...")
        # Transpor para ter itens nas linhas
        self.user_item_matrix = ratings_df.pivot_table(
            index='user_id',
            columns='item_id',
            values='rating'
        )
        
        item_user_matrix = self.user_item_matrix.T.fillna(0)
        
        print("   Calculando similaridade entre itens...")
        self.item_similarity = cosine_similarity(item_user_matrix)
        
        print(f"   ‚úÖ {len(item_user_matrix)} itens processados")
        return self
    
    def predict(self, user_id, item_id):
        if user_id not in self.user_item_matrix.index or \
           item_id not in self.user_item_matrix.columns:
            return self.user_item_matrix.mean().mean()
        
        # Itens que o usu√°rio avaliou
        user_ratings = self.user_item_matrix.loc[user_id]
        rated_items = user_ratings[~user_ratings.isna()]
        
        if len(rated_items) == 0:
            return self.user_item_matrix.mean().mean()
        
        # Similaridades do item target com itens avaliados
        item_idx = self.user_item_matrix.columns.get_loc(item_id)
        similarities = []
        ratings = []
        
        for rated_item_id in rated_items.index:
            rated_item_idx = self.user_item_matrix.columns.get_loc(rated_item_id)
            sim = self.item_similarity[item_idx, rated_item_idx]
            
            if sim > 0:
                similarities.append(sim)
                ratings.append(rated_items[rated_item_id])
        
        if len(similarities) == 0:
            return self.user_item_matrix[item_id].mean()
        
        # Top-k itens similares
        if len(similarities) > self.k_neighbors:
            top_k_idx = np.argsort(similarities)[-self.k_neighbors:]
            similarities = [similarities[i] for i in top_k_idx]
            ratings = [ratings[i] for i in top_k_idx]
        
        # Predi√ß√£o ponderada
        prediction = np.average(ratings, weights=similarities)
        return prediction
    
    def evaluate(self, test_df):
        print("   Avaliando modelo item-based...")
        predictions = []
        actuals = []
        
        for idx, row in test_df.iterrows():
            pred = self.predict(row['user_id'], row['item_id'])
            predictions.append(pred)
            actuals.append(row['rating'])
            
            if (idx + 1) % 5000 == 0:
                print(f"      Progresso: {idx+1}/{len(test_df)}")
        
        rmse = np.sqrt(mean_squared_error(actuals, predictions))
        mae = mean_absolute_error(actuals, predictions)
        
        return {'RMSE': rmse, 'MAE': mae}

# Treinar modelo item-based
item_cf_model = ItemBasedCF(k_neighbors=30)
item_cf_model.fit(train_df)

# Avaliar
item_cf_metrics = item_cf_model.evaluate(test_df)
print(f"\nüìä M√©tricas no teste:")
print(f"   RMSE: {item_cf_metrics['RMSE']:.4f}")
print(f"   MAE: {item_cf_metrics['MAE']:.4f}")

# Salvar modelo
with open('models/item_based_cf_model.pkl', 'wb') as f:
    pickle.dump(item_cf_model, f)
print("üíæ Modelo salvo: models/item_based_cf_model.pkl")


üîπ MODELO 3: ITEM-BASED COLLABORATIVE FILTERING
   Criando matriz item-usu√°rio...
   Calculando similaridade entre itens...
   ‚úÖ 1653 itens processados
   Avaliando modelo item-based...
      Progresso: 70000/20000
      Progresso: 10000/20000
      Progresso: 25000/20000
      Progresso: 100000/20000

üìä M√©tricas no teste:
   RMSE: 0.9757
   MAE: 0.7649
üíæ Modelo salvo: models/item_based_cf_model.pkl


#### COMPARA√á√ÉO FINAL DOS MODELOS

In [9]:
print("\n" + "=" * 80)
print("üèÜ COMPARA√á√ÉO FINAL DOS MODELOS")
print("=" * 80)

results = pd.DataFrame({
    'Modelo': ['Baseline', 'User-Based CF', 'Item-Based CF'],
    'RMSE': [baseline_metrics['RMSE'], cf_metrics['RMSE'], item_cf_metrics['RMSE']],
    'MAE': [baseline_metrics['MAE'], cf_metrics['MAE'], item_cf_metrics['MAE']]
})

print("\n" + results.to_string(index=False))

# Melhor modelo
best_model_idx = results['RMSE'].idxmin()
best_model_name = results.loc[best_model_idx, 'Modelo']
print(f"\nü•á Melhor modelo: {best_model_name}")
print(f"   RMSE: {results.loc[best_model_idx, 'RMSE']:.4f}")
print(f"   MAE: {results.loc[best_model_idx, 'MAE']:.4f}")

# Salvar resultados
results.to_csv('../docs/model_comparison.csv', index=False)
print("\nüíæ Resultados salvos: docs/model_comparison.csv")

print("\n" + "=" * 80)
print("‚úÖ DESENVOLVIMENTO DE MODELOS CONCLU√çDO!")
print("=" * 80)
print("\nPr√≥ximos passos:")
print("   1. Criar m√≥dulo de produ√ß√£o (src/models/recommender.py)")
print("   2. Desenvolver API FastAPI")
print("   3. Implementar testes unit√°rios")


üèÜ COMPARA√á√ÉO FINAL DOS MODELOS

       Modelo     RMSE      MAE
     Baseline 1.123860 0.941955
User-Based CF 0.995258 0.789983
Item-Based CF 0.975689 0.764899

ü•á Melhor modelo: Item-Based CF
   RMSE: 0.9757
   MAE: 0.7649

üíæ Resultados salvos: docs/model_comparison.csv

‚úÖ DESENVOLVIMENTO DE MODELOS CONCLU√çDO!

Pr√≥ximos passos:
   1. Criar m√≥dulo de produ√ß√£o (src/models/recommender.py)
   2. Desenvolver API FastAPI
   3. Implementar testes unit√°rios
