# Notebook 2: Model Training and Evaluation

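Train uplift (T-Learner), baseline response, LTV, and churn models on synthetic player data, then compare the predictions with ground truth by segment and plot feature importances.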

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.features import FeatureEngineer
from src.uplift_model import TLearner, ResponseModel
from src.ltv_model import LTVModel
from src.churn_model import ChurnModel

%matplotlib inline

## 1. Load Data

In [None]:
# Load the training and test CSV files from the data directory.
train_df = pd.read_csv('../data/player_data_train.csv')
test_df = pd.read_csv('../data/player_data_test.csv')

# Report the row count of each frame as a quick sanity check.
print(
    f"Training samples: {len(train_df)}",
    f"Test samples: {len(test_df)}",
    sep="\n",
)

## 2. Feature Engineering

In [None]:
# One FeatureEngineer instance is shared by both splits.
fe = FeatureEngineer(scale_features=True)

# Training split: fit_scaler=True is passed here and only here.
X_train = fe.prepare_features(train_df, fit_scaler=True)
y_train, treatment_train = (
    train_df[column].values for column in ('outcome', 'treatment')
)

# Test split: same call with fit_scaler=False.
X_test = fe.prepare_features(test_df, fit_scaler=False)
y_test, treatment_test = (
    test_df[column].values for column in ('outcome', 'treatment')
)

# Show the training matrix dimensions and the feature names reported by `fe`.
print(f"Feature matrix shape: {X_train.shape}")
print(f"Features: {fe.get_feature_importance_names()}")

## 3. Train Uplift Model (T-Learner)

In [None]:
# Fit the T-Learner on the training features, outcomes, and treatment array.
uplift_model = TLearner(random_state=42)
uplift_model.fit(X_train, y_train, treatment_train)

# Score the test features.
uplift_pred = uplift_model.predict_uplift(X_test)

# Summarise the predicted uplift: mean, spread, and min/max.
print(
    "Uplift predictions:",
    f"  Mean: {uplift_pred.mean():.4f}",
    f"  Std: {uplift_pred.std():.4f}",
    f"  Range: [{uplift_pred.min():.4f}, {uplift_pred.max():.4f}]",
    sep="\n",
)

# Persist the fitted model next to the data files.
uplift_model.save('../data/uplift_model.pkl')

## 4. Train Response Model (Baseline)

In [None]:
# Fit the baseline response model on the same inputs as the uplift model.
response_model = ResponseModel(random_state=42)
response_model.fit(X_train, y_train, treatment_train)

# Score the test features.
response_pred = response_model.predict_response(X_test)

# Summarise the predicted response: mean and spread.
print(
    "Response predictions:",
    f"  Mean: {response_pred.mean():.4f}",
    f"  Std: {response_pred.std():.4f}",
    sep="\n",
)

# Persist the fitted model next to the data files.
response_model.save('../data/response_model.pkl')

## 5. Train LTV and Churn Models

In [None]:
# LTV model: fit against the 'ltv' column of the training frame, then persist it.
ltv_model = LTVModel(random_state=42)
ltv_model.fit(
    X_train,
    train_df['ltv'].values,
)
ltv_model.save('../data/ltv_model.pkl')

# Churn model: fit against the 'churn_probability' column, then persist it.
# NOTE(review): the target is a probability column rather than a binary label;
# confirm that ChurnModel expects a continuous target.
churn_model = ChurnModel(random_state=42)
churn_model.fit(
    X_train,
    train_df['churn_probability'].values,
)
churn_model.save('../data/churn_model.pkl')

## 6. Compare Predictions with Ground Truth

In [None]:
# Attach both prediction arrays to the test frame (adds or overwrites two
# columns on test_df in place).
test_df['uplift_pred'] = uplift_pred
test_df['response_pred'] = response_pred

# Per-segment mean of the true uplift and of each prediction, rounded to
# three decimals.
comparison = (
    test_df
    .groupby('segment')[['true_uplift', 'uplift_pred', 'response_pred']]
    .mean()
    .round(3)
)

print("\nPredictions by Segment:", comparison, sep="\n")

## 7. Feature Importance

In [None]:
# Ask the fitted T-Learner for its feature importances, passing the feature
# names reported by the FeatureEngineer.
importance_df = uplift_model.get_feature_importance(fe.get_feature_importance_names())

# Rank explicitly so the chart really shows the ten largest importances even if
# get_feature_importance() does not return its rows pre-sorted. The stable sort
# keeps tied rows in their existing order.
top_10 = importance_df.sort_values(
    'avg_importance', ascending=False, kind='stable'
).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_10['feature'], top_10['avg_importance'])
ax.set_xlabel('Importance')
ax.set_title('Top 10 Features for Uplift Prediction', fontweight='bold')
ax.invert_yaxis()  # first (largest) row is drawn at the top
plt.tight_layout()
plt.show()

print("\n✓ Model training complete!")