# Counterfactual Evaluation: Off-Policy Evaluation and Incremental Action Value

This notebook demonstrates:
1. **Off-Policy Evaluation (OPE)** - Section 5.4.2
2. **Incremental Action Value** - Section 5.4.3

Both techniques are illustrated on a synthetic movie-recommendation dataset generated within this notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings emitted from here on.
warnings.filterwarnings('ignore')

# Default figure size, and a fixed seed for NumPy's global random generator.
plt.rcParams.update({'figure.figsize': [12, 6]})
np.random.seed(42)

## 1. Generate Synthetic Logged Data

In [None]:
class MovieRecommendationLogger:
    """Simulate logged interaction data for a movie recommender.

    On construction the class builds three tables, in this order:

    * ``movies``      - one row per movie (id, title, genre, rating, popularity)
    * ``users``       - one row per user (id, preferred genres, engagement level)
    * ``logged_data`` - one row per session: the movie chosen by the logging
      policy, the probability with which it was chosen, and the simulated
      click / watch-time outcome.

    All randomness comes from NumPy's global generator (``np.random``), so
    seed it before constructing the logger for reproducible data.

    Args:
        num_users: Number of synthetic users.
        num_movies: Number of synthetic movies.
        num_sessions: Number of logged sessions to simulate.
    """

    def __init__(self, num_users=300, num_movies=20, num_sessions=1000):
        self.num_users = num_users
        self.num_movies = num_movies
        self.num_sessions = num_sessions

        # Users sample their preferences from the genres present in the movie
        # table, and sessions need both tables, so the order below is fixed.
        self.movies = self._generate_movies()
        self.users = self._generate_users()
        self.logged_data = self._generate_logged_data()

    def _generate_movies(self):
        """Return a DataFrame of ``num_movies`` movies with random attributes."""
        genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Romance', 'Sci-Fi']
        movies = []
        for i in range(self.num_movies):
            movie = {
                'movie_id': i,
                'title': f'Movie_{i}',
                'genre': np.random.choice(genres),
                'rating': np.random.uniform(3.0, 9.0),
                'popularity': np.random.exponential(1)
            }
            movies.append(movie)
        return pd.DataFrame(movies)

    def _generate_users(self):
        """Return a DataFrame of ``num_users`` users.

        Each user prefers one or two genres drawn (without replacement) from
        the genres that actually occur in ``self.movies``.
        """
        genres = self.movies['genre'].unique()
        users = []
        for i in range(self.num_users):
            # Sampling without replacement cannot request more genres than
            # exist; clamp so a tiny catalogue with one genre does not raise.
            num_preferred = min(np.random.randint(1, 3), len(genres))
            preferred_genres = np.random.choice(genres, size=num_preferred, replace=False)
            user = {
                'user_id': i,
                'preferred_genres': list(preferred_genres),
                'engagement_level': np.random.uniform(0.2, 1.0)
            }
            users.append(user)
        return pd.DataFrame(users)

    def _calculate_affinity(self, user_row, movie_row):
        """Return the user's affinity for a movie, clipped to [0, 1].

        Affinity starts at 0.3, gains 0.4 when the movie's genre is one of the
        user's preferred genres, is shifted by ``0.3 * (rating - 5) / 10`` and
        is finally scaled by the user's engagement level.

        Args:
            user_row: Mapping with ``preferred_genres`` and ``engagement_level``.
            movie_row: Mapping with ``genre`` and ``rating``.
        """
        base_affinity = 0.3
        if movie_row['genre'] in user_row['preferred_genres']:
            base_affinity += 0.4
        rating_boost = (movie_row['rating'] - 5.0) / 10.0
        base_affinity += rating_boost * 0.3
        base_affinity *= user_row['engagement_level']
        return np.clip(base_affinity, 0.0, 1.0)

    def _logged_policy_probabilities(self, user_row, available_movies):
        """Return the logging policy's action probabilities.

        Each movie is scored by its popularity, multiplied by 1.5 when its
        genre is one of the user's preferred genres; scores are turned into
        probabilities with a softmax.

        Args:
            user_row: Mapping with ``preferred_genres``.
            available_movies: DataFrame with ``popularity`` and ``genre`` columns.

        Returns:
            1-D array of probabilities, in the row order of ``available_movies``.
        """
        preferred = user_row['preferred_genres']
        scores = np.array([
            popularity * 1.5 if genre in preferred else popularity
            for popularity, genre in zip(available_movies['popularity'],
                                         available_movies['genre'])
        ])
        # Subtract the max before exponentiating for numerical stability.
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    def _generate_logged_data(self):
        """Simulate ``num_sessions`` sessions and return them as a DataFrame.

        For every session a random user and context are drawn, a random slate
        of movies is made available, the logging policy samples one movie from
        the slate, and a click / watch-time outcome is simulated from the
        user's affinity for that movie. ``available_movies`` and
        ``action_probability`` are stored in slate order.
        """
        logged_sessions = []

        for session_id in range(self.num_sessions):
            user_id = np.random.randint(0, self.num_users)
            user_row = self.users.iloc[user_id]

            context = {
                'time_of_day': np.random.choice(['morning', 'afternoon', 'evening', 'night']),
                'device': np.random.choice(['mobile', 'desktop', 'tv'])
            }

            # Slate size is 5-9, but never more than the catalogue holds:
            # sampling without replacement would raise otherwise.
            num_available = min(np.random.randint(5, 10), self.num_movies)
            available_movie_indices = np.random.choice(self.num_movies, size=num_available, replace=False)
            available_movies = self.movies.iloc[available_movie_indices]

            action_probs = self._logged_policy_probabilities(user_row, available_movies)

            chosen_idx = np.random.choice(len(available_movies), p=action_probs)
            chosen_movie = available_movies.iloc[chosen_idx]
            chosen_prob = action_probs[chosen_idx]

            affinity = self._calculate_affinity(user_row, chosen_movie)

            clicked = np.random.random() < affinity
            # Watch time is only drawn when the user clicked.
            watch_time = np.random.exponential(affinity * 120) if clicked else 0

            session_data = {
                'session_id': session_id,
                'user_id': user_id,
                'context_time': context['time_of_day'],
                'context_device': context['device'],
                'recommended_movie_id': chosen_movie['movie_id'],
                'action_probability': chosen_prob,
                'available_movies': list(available_movies['movie_id']),
                'clicked': clicked,
                'watch_time': watch_time,
                'true_affinity': affinity
            }

            logged_sessions.append(session_data)

        return pd.DataFrame(logged_sessions)

# Build the synthetic dataset and expose its three tables as notebook globals
logger = MovieRecommendationLogger()
logged_data, movies_catalog, users_catalog = (
    logger.logged_data,
    logger.movies,
    logger.users,
)

# Summary statistics of the logged sessions
print(
    f"Generated {len(logged_data)} sessions",
    f"Click-through rate: {logged_data['clicked'].mean():.3f}",
    f"Average watch time: {logged_data['watch_time'].mean():.1f} minutes",
    sep="\n",
)

## 2. Off-Policy Evaluation (OPE)

In [None]:
class OffPolicyEvaluator:
    """Estimate the value of a new recommendation policy from logged data.

    The estimate re-weights each logged reward by the ratio of the new
    policy's probability of the logged action to the logging policy's
    probability of it, and normalises by the sum of those weights.

    Args:
        logged_data: DataFrame with one row per session. Columns read here:
            ``user_id``, ``available_movies`` (list of movie ids),
            ``recommended_movie_id``, ``action_probability``, ``clicked``
            and ``watch_time``.
        movies_catalog: DataFrame with ``movie_id``, ``genre`` and ``rating``.
            ``movie_id`` values are assumed to be unique.
        users_catalog: DataFrame with a ``preferred_genres`` column. Users are
            looked up positionally (``iloc[user_id]``), so row position must
            equal the user id.
    """

    def __init__(self, logged_data, movies_catalog, users_catalog):
        self.logged_data = logged_data
        self.movies_catalog = movies_catalog
        self.users_catalog = users_catalog

    def _known_ids_in_order(self, available_movie_ids):
        """Return the ids that exist in the catalogue, keeping the input order."""
        catalog_ids = set(self.movies_catalog['movie_id'].tolist())
        return [movie_id for movie_id in available_movie_ids if movie_id in catalog_ids]

    def new_policy_probabilities(self, user_id, available_movie_ids):
        """Quality-focused policy (vs popularity-based logged policy).

        Each movie is scored by its rating, doubled when its genre is one of
        the user's preferred genres; scores become probabilities via softmax.

        Args:
            user_id: Positional index of the user in ``users_catalog``.
            available_movie_ids: Iterable of movie ids on the slate.

        Returns:
            1-D array of probabilities aligned with ``available_movie_ids``
            (ids missing from the catalogue are skipped). Empty array when no
            id is known.
        """
        user_prefs = self.users_catalog.iloc[user_id]['preferred_genres']

        known_ids = self._known_ids_in_order(available_movie_ids)
        if not known_ids:
            return np.array([])

        # Select rows in slate order. A boolean ``isin`` mask would return
        # them in catalogue order instead, and the caller indexes the result
        # by slate position.
        available_movies = self.movies_catalog.set_index('movie_id').loc[known_ids]

        scores = np.array([
            rating * 2.0 if genre in user_prefs else rating
            for rating, genre in zip(available_movies['rating'],
                                     available_movies['genre'])
        ])

        # Subtract the max before exponentiating for numerical stability.
        exp_scores = np.exp(scores - np.max(scores))
        return exp_scores / np.sum(exp_scores)

    def calculate_importance_weights(self):
        """Return one importance weight per logged session.

        The weight is ``new_policy_prob / logged_policy_prob`` for the action
        that was actually logged. It is 0 when the slate has no known movies,
        when the logged action cannot be found on the slate, or when the
        logged probability is not positive.
        """
        weights = []
        for _, session in self.logged_data.iterrows():
            available_ids = session['available_movies']
            new_probs = self.new_policy_probabilities(session['user_id'], available_ids)

            if len(new_probs) == 0:
                weights.append(0)
                continue

            try:
                # Index against the same filtered, slate-ordered id list that
                # the probabilities are aligned with.
                action_idx = self._known_ids_in_order(available_ids).index(
                    session['recommended_movie_id']
                )
                new_policy_prob = new_probs[action_idx]
                logged_policy_prob = session['action_probability']

                weight = new_policy_prob / logged_policy_prob if logged_policy_prob > 0 else 0
            except (ValueError, IndexError):
                # Logged action not on the slate (or probabilities shorter
                # than the slate): the session carries no information.
                weight = 0

            weights.append(weight)

        return np.array(weights, dtype=float)

    def evaluate_policy(self, metric='engagement_score'):
        """Compare the new policy with the logging policy on a reward metric.

        Args:
            metric: ``'click_rate'``, ``'watch_time'`` or ``'engagement_score'``
                (``clicked + watch_time / 100``). Any other value is treated
                as ``'engagement_score'``.

        Returns:
            dict with ``original_policy_value`` (mean logged reward),
            ``new_policy_value`` (weight-normalised estimate), ``improvement``,
            ``relative_improvement`` (percent) and ``effective_sample_size``
            (``sum(w)**2 / sum(w**2)``).
        """
        weights = self.calculate_importance_weights()

        if metric == 'click_rate':
            rewards = self.logged_data['clicked'].astype(float)
        elif metric == 'watch_time':
            rewards = self.logged_data['watch_time']
        else:  # engagement_score
            rewards = (self.logged_data['clicked'].astype(float) + self.logged_data['watch_time'] / 100)

        total_weight = np.sum(weights)
        if total_weight > 0:
            new_policy_value = np.sum(weights * rewards) / total_weight
            effective_sample_size = (total_weight ** 2) / np.sum(weights ** 2)
        else:
            # No usable session: report zeros rather than dividing by zero.
            new_policy_value = 0
            effective_sample_size = 0
        original_policy_value = np.mean(rewards)

        return {
            'original_policy_value': original_policy_value,
            'new_policy_value': new_policy_value,
            'improvement': new_policy_value - original_policy_value,
            'relative_improvement': ((new_policy_value / original_policy_value - 1) * 100 if original_policy_value > 0 else 0),
            'effective_sample_size': effective_sample_size
        }

# Run off-policy evaluation of the new policy on the engagement score
ope_evaluator = OffPolicyEvaluator(logged_data, movies_catalog, users_catalog)
result = ope_evaluator.evaluate_policy(metric='engagement_score')

print(
    "OFF-POLICY EVALUATION RESULTS:",
    f"Original Policy Value: {result['original_policy_value']:.4f}",
    f"New Policy Value: {result['new_policy_value']:.4f}",
    f"Improvement: {result['improvement']:.4f} ({result['relative_improvement']:.1f}%)",
    f"Effective Sample Size: {result['effective_sample_size']:.0f}",
    sep="\n",
)

In [None]:
# Plot OPE results: policy values (left) and importance-weight histogram (right)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: estimated value of each policy
policies = ['Original', 'New']
values = [result[key] for key in ('original_policy_value', 'new_policy_value')]
ax1.bar(policies, values, alpha=0.7, color=['blue', 'green'])
ax1.set(ylabel='Policy Value', title='Policy Value Comparison')

# Right panel: histogram of the importance weights, with their mean marked
weights = ope_evaluator.calculate_importance_weights()
ax2.hist(weights, bins=30, alpha=0.7, edgecolor='black')
ax2.axvline(x=np.mean(weights), color='red', linestyle='--', label=f'Mean: {np.mean(weights):.3f}')
ax2.set(
    xlabel='Importance Weight',
    ylabel='Frequency',
    title='Distribution of Importance Weights',
)
ax2.legend()

plt.tight_layout()
plt.show()

## 3. Incremental Action Value

In [None]:
class IncrementalActionValueAnalyzer:
    """Estimate how much better the best alternative movie would have been.

    For each sampled session the analyzer scores the recommended movie and
    every other movie on the slate with a simple outcome model, and reports
    the gap between the best alternative and the recommendation.

    Args:
        logged_data: DataFrame with one row per session. Columns read here:
            ``session_id``, ``user_id``, ``recommended_movie_id``,
            ``available_movies``, ``context_time``, ``context_device``,
            ``clicked`` and ``watch_time``.
        movies_catalog: DataFrame with ``movie_id``, ``genre`` and ``rating``.
        users_catalog: DataFrame with ``preferred_genres`` and
            ``engagement_level``. Users are looked up positionally
            (``iloc[user_id]``), so row position must equal the user id.
    """

    # Column order of the DataFrame returned by calculate_incremental_values.
    _RESULT_COLUMNS = [
        'session_id',
        'recommended_movie',
        'best_alternative',
        'actual_clicked',
        'actual_watch_time',
        'incremental_engagement',
        'context_time',
        'context_device',
    ]

    def __init__(self, logged_data, movies_catalog, users_catalog):
        self.logged_data = logged_data
        self.movies_catalog = movies_catalog
        self.users_catalog = users_catalog

    def estimate_counterfactual_outcome(self, user_id, movie_id, context=None):
        """Predict click probability and watch time for a user/movie pair.

        The score starts at 0.3, gains 0.4 for a preferred genre, is shifted
        by ``0.3 * (rating - 5) / 10``, scaled by the user's engagement level,
        multiplied by 1.2 for Horror movies in the evening, and clipped to
        [0, 1].

        Args:
            user_id: Positional index of the user in ``users_catalog``.
            movie_id: Id of the movie; must exist in ``movies_catalog``
                (an ``IndexError`` is raised otherwise).
            context: Optional dict; only ``time_of_day`` is read.

        Returns:
            dict with ``prob_click`` and ``expected_watch_time``
            (``prob_click * 120``).
        """
        user = self.users_catalog.iloc[user_id]
        user_prefs = user['preferred_genres']
        user_engagement = user['engagement_level']
        movie = self.movies_catalog[self.movies_catalog['movie_id'] == movie_id].iloc[0]

        base_affinity = 0.3
        if movie['genre'] in user_prefs:
            base_affinity += 0.4

        rating_boost = (movie['rating'] - 5.0) / 10.0
        base_affinity += rating_boost * 0.3
        base_affinity *= user_engagement

        if context and context.get('time_of_day') == 'evening' and movie['genre'] in ['Horror']:
            base_affinity *= 1.2

        affinity = np.clip(base_affinity, 0.0, 1.0)

        return {
            'prob_click': affinity,
            'expected_watch_time': affinity * 120
        }

    def calculate_incremental_values(self, sample_size=300, random_state=None):
        """Compute the incremental value of the best alternative per session.

        Args:
            sample_size: Maximum number of logged sessions to sample.
            random_state: Forwarded to ``DataFrame.sample``; ``None`` keeps
                the previous behaviour of using NumPy's global generator.

        Returns:
            DataFrame with one row per sampled session that has at least one
            alternative movie. ``incremental_engagement`` is
            ``delta_prob_click + delta_expected_watch_time / 100`` where each
            delta is best alternative minus recommended. The columns are
            present even when no row qualifies.
        """
        sample_sessions = self.logged_data.sample(
            n=min(sample_size, len(self.logged_data)),
            random_state=random_state,
        )
        incremental_results = []

        for _, session in sample_sessions.iterrows():
            user_id = session['user_id']
            recommended_movie = session['recommended_movie_id']
            available_movies = session['available_movies']

            context = {
                'time_of_day': session['context_time'],
                'device': session['context_device']
            }

            recommended_estimate = self.estimate_counterfactual_outcome(user_id, recommended_movie, context)

            alternatives = [
                (alt_movie_id, self.estimate_counterfactual_outcome(user_id, alt_movie_id, context))
                for alt_movie_id in available_movies
                if alt_movie_id != recommended_movie
            ]
            if not alternatives:
                continue

            # max() keeps the first of several tied candidates and, unlike a
            # running "> 0" comparison, still selects an alternative when
            # every candidate has zero expected watch time.
            best_alternative, best_alternative_estimate = max(
                alternatives, key=lambda pair: pair[1]['expected_watch_time']
            )

            incremental_click = (best_alternative_estimate['prob_click'] - recommended_estimate['prob_click'])
            incremental_watch_time = (best_alternative_estimate['expected_watch_time'] - recommended_estimate['expected_watch_time'])
            incremental_engagement = incremental_click + (incremental_watch_time / 100)

            incremental_results.append({
                'session_id': session['session_id'],
                'recommended_movie': recommended_movie,
                'best_alternative': best_alternative,
                'actual_clicked': session['clicked'],
                'actual_watch_time': session['watch_time'],
                'incremental_engagement': incremental_engagement,
                'context_time': context['time_of_day'],
                'context_device': context['device']
            })

        # Explicit columns so an empty result still has the expected schema.
        return pd.DataFrame(incremental_results, columns=self._RESULT_COLUMNS)

# Run the incremental action value analysis on a sample of logged sessions
iav_analyzer = IncrementalActionValueAnalyzer(logged_data, movies_catalog, users_catalog)
incremental_results = iav_analyzer.calculate_incremental_values()

print(
    "INCREMENTAL ACTION VALUE RESULTS:",
    f"Analyzed {len(incremental_results)} sessions",
    f"Average incremental engagement: {incremental_results['incremental_engagement'].mean():.4f}",
    f"Sessions with positive incremental value: {(incremental_results['incremental_engagement'] > 0).mean() * 100:.1f}%",
    f"Maximum potential gain: {incremental_results['incremental_engagement'].max():.4f}",
    sep="\n",
)

In [None]:
# Plot incremental results: overall distribution (left) and mean by time of day (right)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Left panel: histogram with reference lines at zero and at the mean
engagement = incremental_results['incremental_engagement']
ax1.hist(engagement, bins=30, alpha=0.7, edgecolor='black')
ax1.axvline(x=0, color='red', linestyle='--', label='No improvement')
ax1.axvline(
    x=engagement.mean(),
    color='green',
    linestyle='--',
    label=f'Mean: {incremental_results["incremental_engagement"].mean():.3f}',
)
ax1.set(
    xlabel='Incremental Engagement Value',
    ylabel='Frequency',
    title='Distribution of Incremental Values',
)
ax1.legend()

# Right panel: mean incremental engagement per time-of-day bucket
time_groups = incremental_results.groupby('context_time')['incremental_engagement'].mean()
ax2.bar(time_groups.index, time_groups.values, alpha=0.7)
ax2.set(
    xlabel='Time of Day',
    ylabel='Mean Incremental Engagement',
    title='Incremental Value by Time of Day',
)
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# List the five sessions with the largest estimated incremental engagement
print("TOP 5 IMPROVEMENT OPPORTUNITIES:")
top_opportunities = incremental_results.nlargest(5, 'incremental_engagement')

for i, (_, opp) in enumerate(top_opportunities.iterrows(), start=1):
    # Look up catalogue rows for the logged recommendation and its best alternative
    rec_movie = movies_catalog.loc[movies_catalog['movie_id'] == opp['recommended_movie']].iloc[0]
    alt_movie = movies_catalog.loc[movies_catalog['movie_id'] == opp['best_alternative']].iloc[0]

    print(
        f"{i}. Session {opp['session_id']}:",
        f"   Recommended: {rec_movie['genre']} (Rating: {rec_movie['rating']:.1f})",
        f"   Alternative: {alt_movie['genre']} (Rating: {alt_movie['rating']:.1f})",
        f"   Potential gain: {opp['incremental_engagement']:.3f}",
        f"   Context: {opp['context_time']} on {opp['context_device']}\n",
        sep="\n",
    )