In [10]:
# Import Libraries - Enhanced for Statistical Analysis
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Set Up Database Connection
# Replace the placeholders with your actual database credentials
DB_CONFIG = {
    "dbname": "csgo_parsed",
    "user": "csgo_parser",
    "password": "3?6B7yTGPrkJF34p",
    "host": "192.168.1.100",
    "port": "5444"
}

In [11]:


# Ensure we have a fresh connection
try:
    conn = psycopg2.connect(**DB_CONFIG)
    print("‚úÖ Database connection established")
except Exception as e:
    print(f"‚ùå Connection error: {e}")

def get_descriptive_stats(data, column_name):
    """Calculate comprehensive descriptive statistics"""
    stats_dict = {
        'count': len(data),
        'min': data.min(),
        'max': data.max(),
        'mean': data.mean(),
        'median': data.median(),
        'std': data.std(),
        'q25': data.quantile(0.25),
        'q75': data.quantile(0.75)
    }
    return stats_dict

def get_top_values(data, n=15):
    """Get top N occurring values with percentages"""
    value_counts = data.value_counts().head(n)
    percentages = (value_counts / len(data) * 100).round(2)
    return pd.DataFrame({
        'value': value_counts.index,
        'count': value_counts.values,
        'percentage': percentages.values
    })

def create_distribution_plots(data, title, bins=50):
    """Create histogram and box plot for a variable"""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Histogram
    ax1.hist(data.dropna(), bins=bins, alpha=0.7, edgecolor='black')
    ax1.set_title(f'{title} - Distribution')
    ax1.set_xlabel('Value')
    ax1.set_ylabel('Frequency')
    ax1.grid(True, alpha=0.3)
    
    # Box plot
    ax2.boxplot(data.dropna())
    ax2.set_title(f'{title} - Box Plot')
    ax2.set_ylabel('Value')
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

print("üìä Helper functions defined successfully!")

‚úÖ Database connection established
üìä Helper functions defined successfully!


# CS:GO Economy Agent-Based Model (ABM) - Data Analysis v2
## Objective: Extract real-game statistics to parameterize an ABM for CS:GO economy decisions

This notebook analyzes real CS:GO match data to determine key distributions and probabilities for:
1. **Win probability** based on team equipment value/spending and other factors 
Using different methods




In [12]:
# 1. Query Classification Data with Skill Controls

print("üîç Querying round outcome data with HLTV rankings...")

classification_query = """
WITH team_round_data AS (
    SELECT 
        r.id as round_id,
        r.id_demo_exports as id_demo_exports,
        r.round_num,
        r.team1_winner,
        -- Team 1 aggregated metrics
        AVG(CASE WHEN pr.team = 1 THEN pr.eq_val_fte END) as t1_eq_val,
        SUM(CASE WHEN pr.team = 1 THEN pe.money_spent END) as t1_money_spent,
        -- Team 2 aggregated metrics  
        AVG(CASE WHEN pr.team = 2 THEN pr.eq_val_fte END) as t2_eq_val,
        SUM(CASE WHEN pr.team = 2 THEN pe.money_spent END) as t2_money_spent,
        -- HLTV rankings
        hmi.team_1_id,
        hmi.team_2_id,
        hmi.event_id
    FROM rounds_ed r
    JOIN player_round_ed pr ON r.id = pr.round_id
    LEFT JOIN player_economy_ed pe ON pr.id = pe.player_round_id
    LEFT JOIN hltv_match_info hmi ON r.match_id = hmi.match_id
    WHERE r.team1_winner IS NOT NULL
        AND pr.team IN (1, 2)
        AND pr.eq_val_fte IS NOT NULL
        AND hmi.event_id IS NOT NULL
    GROUP BY r.id, r.id_demo_exports, r.round_num, r.team1_winner,
             hmi.team_1_id, hmi.team_2_id, hmi.event_id
    HAVING COUNT(CASE WHEN pr.team = 1 THEN 1 END) = 5 
       AND COUNT(CASE WHEN pr.team = 2 THEN 1 END) = 5
)
SELECT 
    trd.round_id,
    trd.id_demo_exports,
    trd.round_num,
    trd.team1_winner,
    -- Team 1 metrics
    trd.t1_eq_val,
    trd.t1_money_spent,
    CAST(het1.rank_during AS INTEGER) as t1_rank,
    -- Team 2 metrics
    trd.t2_eq_val,
    trd.t2_money_spent,
    CAST(het2.rank_during AS INTEGER) as t2_rank,
    -- Differences (advantages for team 1)
    (trd.t1_eq_val - trd.t2_eq_val) as diff_eq_val,
    (trd.t1_money_spent - trd.t2_money_spent) as diff_money_spent,
    (CAST(het2.rank_during AS INTEGER) - CAST(het1.rank_during AS INTEGER)) as diff_rank
FROM team_round_data trd
LEFT JOIN hltv_events_teams het1 ON het1.team_id = trd.team_1_id AND het1.event_id = trd.event_id
LEFT JOIN hltv_events_teams het2 ON het2.team_id = trd.team_2_id AND het2.event_id = trd.event_id
WHERE het1.rank_during IS NOT NULL 
    AND het2.rank_during IS NOT NULL
    AND trd.t1_eq_val IS NOT NULL 
    AND trd.t2_eq_val IS NOT NULL
ORDER BY trd.id_demo_exports, trd.round_num
"""

try:
    classification_data = pd.read_sql(classification_query, conn)
    print(f"‚úÖ Retrieved {len(classification_data):,} rounds from {classification_data['id_demo_exports'].nunique():,} matches")
    
    # Display summary statistics
    print("\nüìä Data Summary:")
    print(f"  Win rate (Team 1): {classification_data['team1_winner'].mean():.3f}")
    print(f"  Avg equipment diff: ${classification_data['diff_eq_val'].mean():.0f}")
    print(f"  Avg rank diff: {classification_data['diff_rank'].mean():.1f}")
    print(f"  Missing values: {classification_data.isnull().sum().sum()}")
    
    # Preview data
    print("\nüîç Sample data:")
    print(classification_data.head(10))
    
except Exception as e:
    print(f"‚ùå Query error: {e}")
    classification_data = None

üîç Querying round outcome data with HLTV rankings...


  classification_data = pd.read_sql(classification_query, conn)


‚úÖ Retrieved 1,822,909 rounds from 68,776 matches

üìä Data Summary:
  Win rate (Team 1): 0.512
  Avg equipment diff: $59
  Avg rank diff: 10.1
  Missing values: 263010

üîç Sample data:
   round_id  id_demo_exports  round_num  team1_winner  t1_eq_val  \
0     13465                1          1         False      860.0   
1     13468                1          2          True     3250.0   
2     13471                1          3          True     4450.0   
3     13474                1          4          True     5060.0   
4     13477                1          5          True     5610.0   
5     13480                1          6          True     5550.0   
6     13483                1          7         False     5730.0   
7     13486                1          8         False     5530.0   
8     13489                1          9         False     4930.0   
9     13492                1         10          True     4890.0   

   t1_money_spent  t1_rank  t2_eq_val  t2_money_spent  t2_ran

In [13]:
# 2. Prepare Features for Classification

print("üîß Preparing features for classification models...")

if classification_data is not None and len(classification_data) > 0:
    # Create feature matrix
    feature_columns = [
        't1_eq_val', 't1_money_spent', 't1_rank',
        't2_eq_val', 't2_money_spent', 't2_rank',
        'diff_eq_val', 'diff_money_spent', 'diff_rank'
    ]
    
    X = classification_data[feature_columns].copy()
    y = classification_data['team1_winner'].astype(int)
    
    # Handle any missing values
    X = X.fillna(X.median())
    
    # Add interaction terms (critical for understanding combined effects)
    X['eq_x_rank'] = X['diff_eq_val'] * X['diff_rank']
    X['spending_x_rank'] = X['diff_money_spent'] * X['diff_rank']
    X['eq_ratio'] = X['t1_eq_val'] / (X['t2_eq_val'] + 1)
    
    print(f"‚úÖ Feature matrix created: {X.shape[0]:,} samples √ó {X.shape[1]} features")
    print(f"\nüìã Features included:")
    for i, col in enumerate(X.columns, 1):
        print(f"  {i}. {col}")
    
    # Check for class imbalance
    class_distribution = y.value_counts()
    print(f"\n‚öñÔ∏è Class Distribution:")
    print(f"  Team 1 Wins (1): {class_distribution.get(1, 0):,} ({class_distribution.get(1, 0)/len(y)*100:.1f}%)")
    print(f"  Team 1 Loses (0): {class_distribution.get(0, 0):,} ({class_distribution.get(0, 0)/len(y)*100:.1f}%)")
    
    # Descriptive statistics for features
    print(f"\nüìä Feature Statistics:")
    print(X.describe().round(2))
    
else:
    print("‚ùå No data available for feature preparation")
    X = None
    y = None

üîß Preparing features for classification models...
‚úÖ Feature matrix created: 1,822,909 samples √ó 12 features

üìã Features included:
  1. t1_eq_val
  2. t1_money_spent
  3. t1_rank
  4. t2_eq_val
  5. t2_money_spent
  6. t2_rank
  7. diff_eq_val
  8. diff_money_spent
  9. diff_rank
  10. eq_x_rank
  11. spending_x_rank
  12. eq_ratio

‚öñÔ∏è Class Distribution:
  Team 1 Wins (1): 933,772 (51.2%)
  Team 1 Loses (0): 889,137 (48.8%)

üìä Feature Statistics:
‚úÖ Feature matrix created: 1,822,909 samples √ó 12 features

üìã Features included:
  1. t1_eq_val
  2. t1_money_spent
  3. t1_rank
  4. t2_eq_val
  5. t2_money_spent
  6. t2_rank
  7. diff_eq_val
  8. diff_money_spent
  9. diff_rank
  10. eq_x_rank
  11. spending_x_rank
  12. eq_ratio

‚öñÔ∏è Class Distribution:
  Team 1 Wins (1): 933,772 (51.2%)
  Team 1 Loses (0): 889,137 (48.8%)

üìä Feature Statistics:
        t1_eq_val  t1_money_spent     t1_rank   t2_eq_val  t2_money_spent  \
count  1822909.00      1822909.00  1822909

In [None]:
# 3. Build and Compare Multiple Classification Models

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("ü§ñ Building classification models...")

if X is not None and y is not None:
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    print(f"üìä Data split:")
    print(f"  Training set: {len(X_train):,} samples")
    print(f"  Test set: {len(X_test):,} samples")
    
    # Standardize features (important for logistic regression)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define models to compare
    models = {
        'Logistic Regression': LogisticRegression(
            max_iter=1000, 
            class_weight='balanced',
            random_state=42
        ),
        'Random Forest': RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=50,
            min_samples_leaf=20,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1
        ),
        'Gradient Boosting': GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            min_samples_split=50,
            min_samples_leaf=20,
            random_state=42
        )
    }
    
    # Train and evaluate models
    print("\n" + "="*70)
    print("üèÜ MODEL COMPARISON - 5-Fold Stratified Cross-Validation")
    print("="*70)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    model_results = {}
    trained_models = {}
    
    for name, model in models.items():
        print(f"\nüìà Training {name}...")
        
        # Use scaled data for Logistic Regression, original for tree-based
        if name == 'Logistic Regression':
            X_train_use = X_train_scaled
            X_test_use = X_test_scaled
        else:
            X_train_use = X_train
            X_test_use = X_test
        
        # Cross-validation scores
        cv_accuracy = cross_val_score(model, X_train_use, y_train, cv=cv, scoring='accuracy')
        cv_roc_auc = cross_val_score(model, X_train_use, y_train, cv=cv, scoring='roc_auc')
        cv_f1 = cross_val_score(model, X_train_use, y_train, cv=cv, scoring='f1')
        
        # Train on full training set
        model.fit(X_train_use, y_train)
        
        # Test set predictions
        y_pred = model.predict(X_test_use)
        y_prob = model.predict_proba(X_test_use)[:, 1]
        
        # Calculate metrics
        test_accuracy = accuracy_score(y_test, y_pred)
        test_roc_auc = roc_auc_score(y_test, y_prob)
        test_precision = precision_score(y_test, y_pred)
        test_recall = recall_score(y_test, y_pred)
        test_f1 = f1_score(y_test, y_pred)
        
        # Store results
        model_results[name] = {
            'cv_accuracy_mean': cv_accuracy.mean(),
            'cv_accuracy_std': cv_accuracy.std(),
            'cv_roc_auc_mean': cv_roc_auc.mean(),
            'cv_roc_auc_std': cv_roc_auc.std(),
            'cv_f1_mean': cv_f1.mean(),
            'cv_f1_std': cv_f1.std(),
            'test_accuracy': test_accuracy,
            'test_roc_auc': test_roc_auc,
            'test_precision': test_precision,
            'test_recall': test_recall,
            'test_f1': test_f1,
            'predictions': y_pred,
            'probabilities': y_prob
        }
        
        trained_models[name] = model
        
        # Print results
        print(f"  Cross-Validation Accuracy: {cv_accuracy.mean():.4f} (¬±{cv_accuracy.std()*2:.4f})")
        print(f"  Cross-Validation ROC-AUC:  {cv_roc_auc.mean():.4f} (¬±{cv_roc_auc.std()*2:.4f})")
        print(f"  Cross-Validation F1-Score: {cv_f1.mean():.4f} (¬±{cv_f1.std()*2:.4f})")
        print(f"  Test Set Accuracy:         {test_accuracy:.4f}")
        print(f"  Test Set ROC-AUC:          {test_roc_auc:.4f}")
        print(f"  Test Set Precision:        {test_precision:.4f}")
        print(f"  Test Set Recall:           {test_recall:.4f}")
        print(f"  Test Set F1-Score:         {test_f1:.4f}")
    
    # Create comparison dataframe
    comparison_df = pd.DataFrame(model_results).T
    
    print("\n" + "="*70)
    print("üìä MODEL COMPARISON SUMMARY")
    print("="*70)
    print(comparison_df[['cv_roc_auc_mean', 'test_roc_auc', 'test_accuracy', 'test_f1']].round(4))
    
    # Identify best model
    best_model_name = comparison_df['test_roc_auc'].idxmax()
    print(f"\nüèÜ Best Model: {best_model_name}")
    print(f"   Test ROC-AUC: {comparison_df.loc[best_model_name, 'test_roc_auc']:.4f}")
    
else:
    print("‚ùå No data available for model training")
    trained_models = {}
    model_results = {}
    best_model_name = None

ü§ñ Building classification models...
üìä Data split:
  Training set: 1,458,327 samples
  Test set: 364,582 samples
üìä Data split:
  Training set: 1,458,327 samples
  Test set: 364,582 samples

üèÜ MODEL COMPARISON - 5-Fold Stratified Cross-Validation

üìà Training Logistic Regression...

üèÜ MODEL COMPARISON - 5-Fold Stratified Cross-Validation

üìà Training Logistic Regression...
  Cross-Validation Accuracy: 0.6556 (¬±0.0013)
  Cross-Validation ROC-AUC:  0.7384 (¬±0.0011)
  Cross-Validation F1-Score: 0.6459 (¬±0.0013)
  Test Set Accuracy:         0.6552
  Test Set ROC-AUC:          0.7374
  Test Set Precision:        0.6818
  Test Set Recall:           0.6130
  Test Set F1-Score:         0.6456

üìà Training Random Forest...
  Cross-Validation Accuracy: 0.6556 (¬±0.0013)
  Cross-Validation ROC-AUC:  0.7384 (¬±0.0011)
  Cross-Validation F1-Score: 0.6459 (¬±0.0013)
  Test Set Accuracy:         0.6552
  Test Set ROC-AUC:          0.7374
  Test Set Precision:        0.6818
  Tes

In [None]:
# 4. Detailed Evaluation of Best Model

print(f"üîç Detailed evaluation of best model: {best_model_name}")

if best_model_name is not None and len(model_results) > 0:
    # Get best model predictions
    best_predictions = model_results[best_model_name]['predictions']
    best_probabilities = model_results[best_model_name]['probabilities']
    
    # Confusion Matrix
    print("\nüìä CONFUSION MATRIX")
    print("="*50)
    cm = confusion_matrix(y_test, best_predictions)
    print(f"\n                 Predicted")
    print(f"              Loss (0)  Win (1)")
    print(f"Actual Loss    {cm[0,0]:6d}   {cm[0,1]:6d}")
    print(f"Actual Win     {cm[1,0]:6d}   {cm[1,1]:6d}")
    
    # Calculate additional metrics
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    
    print(f"\nüìà Additional Metrics:")
    print(f"  True Negatives:  {tn:,}")
    print(f"  False Positives: {fp:,}")
    print(f"  False Negatives: {fn:,}")
    print(f"  True Positives:  {tp:,}")
    print(f"  Specificity:     {specificity:.4f}")
    print(f"  Sensitivity:     {sensitivity:.4f}")
    
    # Classification Report
    print("\nüìã CLASSIFICATION REPORT")
    print("="*50)
    print(classification_report(y_test, best_predictions, 
                                target_names=['Team 1 Loss', 'Team 1 Win'],
                                digits=4))
    
    # Plot ROC Curve
    print("\nüìâ Plotting ROC Curve...")
    fpr, tpr, thresholds = roc_curve(y_test, best_probabilities)
    roc_auc = roc_auc_score(y_test, best_probabilities)
    
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, 
             label=f'{best_model_name} (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve - {best_model_name}', fontsize=14, fontweight='bold')
    plt.legend(loc="lower right", fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    # Plot Probability Distribution
    print("\nüìä Plotting Prediction Probability Distribution...")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Histogram of probabilities by actual class
    win_probs = best_probabilities[y_test == 1]
    loss_probs = best_probabilities[y_test == 0]
    
    ax1.hist(win_probs, bins=50, alpha=0.7, label='Actual Wins', color='green', edgecolor='black')
    ax1.hist(loss_probs, bins=50, alpha=0.7, label='Actual Losses', color='red', edgecolor='black')
    ax1.axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
    ax1.set_xlabel('Predicted Probability of Win', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)
    ax1.set_title('Distribution of Predicted Probabilities', fontsize=14, fontweight='bold')
    ax1.legend(fontsize=11)
    ax1.grid(True, alpha=0.3)
    
    # Box plot comparison
    ax2.boxplot([loss_probs, win_probs], labels=['Actual Losses', 'Actual Wins'])
    ax2.axhline(y=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
    ax2.set_ylabel('Predicted Probability of Win', fontsize=12)
    ax2.set_title('Probability Distribution by Actual Outcome', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='y')
    ax2.legend(fontsize=11)
    
    plt.tight_layout()
    plt.show()
    
else:
    print("‚ùå No model results available for evaluation")

üîç Detailed evaluation of best model: None
‚ùå No model results available for evaluation


In [None]:
# 5. Feature Importance Analysis

print("üéØ Analyzing Feature Importance...")

if best_model_name is not None and len(trained_models) > 0:
    best_model = trained_models[best_model_name]
    
    if best_model_name == 'Logistic Regression':
        # For logistic regression, analyze coefficients
        print("\nüìä LOGISTIC REGRESSION COEFFICIENTS")
        print("="*70)
        
        coefficients = best_model.coef_[0]
        feature_names = X.columns
        
        # Create importance dataframe
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'coefficient': coefficients,
            'abs_coefficient': np.abs(coefficients),
            'odds_ratio': np.exp(coefficients)
        }).sort_values('abs_coefficient', ascending=False)
        
        print("\nFeature Coefficients (sorted by absolute value):")
        print(importance_df.to_string(index=False))
        
        print("\nüí° Interpretation:")
        print("  - Positive coefficient: increases win probability")
        print("  - Negative coefficient: decreases win probability")
        print("  - Odds ratio > 1: multiplicative increase in odds")
        print("  - Odds ratio < 1: multiplicative decrease in odds")
        
        # Plot coefficients
        plt.figure(figsize=(12, 8))
        colors = ['green' if x > 0 else 'red' for x in importance_df['coefficient']]
        plt.barh(importance_df['feature'], importance_df['coefficient'], color=colors, alpha=0.7, edgecolor='black')
        plt.xlabel('Coefficient Value', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.title('Logistic Regression Coefficients', fontsize=14, fontweight='bold')
        plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()
        
    elif best_model_name in ['Random Forest', 'Gradient Boosting']:
        # For tree-based models, use feature importance
        print(f"\nüìä {best_model_name.upper()} FEATURE IMPORTANCE")
        print("="*70)
        
        importances = best_model.feature_importances_
        feature_names = X.columns
        
        # Create importance dataframe
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances,
            'importance_pct': importances * 100
        }).sort_values('importance', ascending=False)
        
        print("\nFeature Importance (sorted by importance):")
        print(importance_df.to_string(index=False))
        
        print("\nüí° Interpretation:")
        print("  - Higher importance: feature contributes more to predictions")
        print("  - Importance measures average reduction in impurity")
        
        # Plot importance
        plt.figure(figsize=(12, 8))
        plt.barh(importance_df['feature'], importance_df['importance'], 
                color='steelblue', alpha=0.7, edgecolor='black')
        plt.xlabel('Feature Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.title(f'{best_model_name} Feature Importance', fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.show()
    
    # Summary of key insights
    print("\n" + "="*70)
    print("üîë KEY INSIGHTS FROM FEATURE IMPORTANCE")
    print("="*70)
    
    top_features = importance_df.head(5)
    print("\nTop 5 Most Important Features:")
    for idx, row in top_features.iterrows():
        print(f"  {row['feature']}")
    
    print("\nüí° For ABM Implementation:")
    print("  - Focus calibration on top 3-5 features")
    print("  - Equipment differences and rank interactions are likely critical")
    print("  - Use these features for scenario sensitivity analysis")
    
else:
    print("‚ùå No model available for feature importance analysis")

üéØ Analyzing Feature Importance...
‚ùå No model available for feature importance analysis


In [None]:
# 6. Model Calibration Analysis - Win Probability by Feature Values

print("üìä Analyzing Win Probability across Feature Ranges...")

if classification_data is not None and best_model_name is not None:
    
    # Prepare data for predictions
    if best_model_name == 'Logistic Regression':
        X_for_pred = scaler.transform(X)
    else:
        X_for_pred = X
    
    # Get all predictions
    all_predictions = trained_models[best_model_name].predict_proba(X_for_pred)[:, 1]
    
    # Add predictions to original data
    analysis_df = classification_data.copy()
    analysis_df['predicted_win_prob'] = all_predictions
    
    print("\nüìà WIN PROBABILITY BY EQUIPMENT ADVANTAGE")
    print("="*60)
    
    # Bin equipment advantage
    eq_bins = [-float('inf'), -3000, -1500, -500, 500, 1500, 3000, float('inf')]
    eq_labels = ['< -$3000', '-$3000 to -$1500', '-$1500 to -$500', 
                 '-$500 to $500', '$500 to $1500', '$1500 to $3000', '> $3000']
    
    analysis_df['eq_advantage_bin'] = pd.cut(analysis_df['diff_eq_val'], 
                                            bins=eq_bins, labels=eq_labels)
    
    eq_analysis = analysis_df.groupby('eq_advantage_bin').agg({
        'team1_winner': ['count', 'mean'],
        'predicted_win_prob': 'mean',
        'diff_eq_val': 'mean'
    }).round(4)
    
    eq_analysis.columns = ['Count', 'Actual_Win_Rate', 'Predicted_Win_Prob', 'Avg_Eq_Diff']
    print(eq_analysis)
    
    # Plot equipment advantage vs win probability
    plt.figure(figsize=(14, 6))
    
    x_pos = range(len(eq_analysis))
    width = 0.35
    
    plt.bar([p - width/2 for p in x_pos], eq_analysis['Actual_Win_Rate'], 
            width, label='Actual Win Rate', alpha=0.8, color='steelblue', edgecolor='black')
    plt.bar([p + width/2 for p in x_pos], eq_analysis['Predicted_Win_Prob'], 
            width, label='Predicted Win Prob', alpha=0.8, color='orange', edgecolor='black')
    
    plt.xlabel('Equipment Advantage Range', fontsize=12)
    plt.ylabel('Win Probability', fontsize=12)
    plt.title('Win Probability by Equipment Advantage (Team 1)', fontsize=14, fontweight='bold')
    plt.xticks(x_pos, eq_analysis.index, rotation=45, ha='right')
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    
    print("\nüìà WIN PROBABILITY BY RANK ADVANTAGE")
    print("="*60)
    
    # Bin rank advantage
    rank_bins = [-float('inf'), -10, -5, -2, 2, 5, 10, float('inf')]
    rank_labels = ['Much Weaker', 'Weaker', 'Slightly Weaker', 
                   'Even', 'Slightly Stronger', 'Stronger', 'Much Stronger']
    
    analysis_df['rank_advantage_bin'] = pd.cut(analysis_df['diff_rank'], 
                                              bins=rank_bins, labels=rank_labels)
    
    rank_analysis = analysis_df.groupby('rank_advantage_bin').agg({
        'team1_winner': ['count', 'mean'],
        'predicted_win_prob': 'mean',
        'diff_rank': 'mean'
    }).round(4)
    
    rank_analysis.columns = ['Count', 'Actual_Win_Rate', 'Predicted_Win_Prob', 'Avg_Rank_Diff']
    print(rank_analysis)
    
    # Plot rank advantage vs win probability
    plt.figure(figsize=(14, 6))
    
    x_pos = range(len(rank_analysis))
    
    plt.bar([p - width/2 for p in x_pos], rank_analysis['Actual_Win_Rate'], 
            width, label='Actual Win Rate', alpha=0.8, color='green', edgecolor='black')
    plt.bar([p + width/2 for p in x_pos], rank_analysis['Predicted_Win_Prob'], 
            width, label='Predicted Win Prob', alpha=0.8, color='red', edgecolor='black')
    
    plt.xlabel('Rank Advantage Category', fontsize=12)
    plt.ylabel('Win Probability', fontsize=12)
    plt.title('Win Probability by Rank Advantage (Team 1)', fontsize=14, fontweight='bold')
    plt.xticks(x_pos, rank_analysis.index, rotation=45, ha='right')
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    
    # Interaction analysis
    print("\nüìä WIN PROBABILITY MATRIX: Equipment √ó Rank")
    print("="*60)
    
    # Create simplified bins for matrix
    analysis_df['eq_simple'] = pd.cut(analysis_df['diff_eq_val'], 
                                     bins=[-float('inf'), -1500, 0, 1500, float('inf')],
                                     labels=['Large Disadv', 'Small Disadv', 'Small Adv', 'Large Adv'])
    
    analysis_df['rank_simple'] = pd.cut(analysis_df['diff_rank'], 
                                       bins=[-float('inf'), -5, 0, 5, float('inf')],
                                       labels=['Weaker', 'Even', 'Stronger', 'Much Stronger'])
    
    interaction_matrix = pd.crosstab(
        analysis_df['rank_simple'],
        analysis_df['eq_simple'],
        values=analysis_df['predicted_win_prob'],
        aggfunc='mean'
    ).round(3)
    
    print("\nPredicted Win Probability Matrix:")
    print("(Rows: Rank Advantage, Columns: Equipment Advantage)")
    print(interaction_matrix)
    
    # Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(interaction_matrix, annot=True, fmt='.3f', cmap='RdYlGn', 
                center=0.5, vmin=0, vmax=1, cbar_kws={'label': 'Win Probability'},
                linewidths=1, linecolor='black')
    plt.title('Win Probability Heatmap: Rank √ó Equipment', fontsize=14, fontweight='bold')
    plt.ylabel('Rank Advantage', fontsize=12)
    plt.xlabel('Equipment Advantage', fontsize=12)
    plt.tight_layout()
    plt.show()
    
else:
    print("‚ùå No data available for calibration analysis")

üìä Analyzing Win Probability across Feature Ranges...
‚ùå No data available for calibration analysis


In [None]:
# 7. Export Model and ABM Integration Code

import pickle
import json
from datetime import datetime

print("üíæ Exporting model for ABM integration...")

if best_model_name is not None and len(trained_models) > 0:
    
    # Prepare export package
    export_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    abm_model_package = {
        'model': trained_models[best_model_name],
        'scaler': scaler if best_model_name == 'Logistic Regression' else None,
        'feature_names': list(X.columns),
        'model_type': best_model_name,
        'performance': {
            'test_accuracy': model_results[best_model_name]['test_accuracy'],
            'test_roc_auc': model_results[best_model_name]['test_roc_auc'],
            'test_f1': model_results[best_model_name]['test_f1'],
            'cv_roc_auc_mean': model_results[best_model_name]['cv_roc_auc_mean'],
            'cv_roc_auc_std': model_results[best_model_name]['cv_roc_auc_std']
        },
        'metadata': {
            'export_timestamp': export_timestamp,
            'training_samples': len(X_train),
            'test_samples': len(X_test),
            'feature_count': X.shape[1],
            'database': 'csgo_parsed',
            'description': 'CS:GO win probability classifier with HLTV ranking controls'
        }
    }
    
    # Export as pickle
    pickle_filename = f"csgo_win_classifier_{export_timestamp}.pkl"
    with open(pickle_filename, 'wb') as f:
        pickle.dump(abm_model_package, f)
    
    print(f"‚úÖ Model exported to: {pickle_filename}")
    
    # Export metadata as JSON
    json_metadata = {
        'model_type': best_model_name,
        'performance': abm_model_package['performance'],
        'metadata': abm_model_package['metadata'],
        'feature_names': abm_model_package['feature_names']
    }
    
    json_filename = f"csgo_win_classifier_metadata_{export_timestamp}.json"
    with open(json_filename, 'w') as f:
        json.dump(json_metadata, f, indent=2)
    
    print(f"‚úÖ Metadata exported to: {json_filename}")
    
    # Create ABM integration example code
    integration_code = f'''
# ============================================================================
# CS:GO ABM Win Probability Calculator - Integration Code
# Model: {best_model_name}
# Exported: {export_timestamp}
# Performance: ROC-AUC = {model_results[best_model_name]['test_roc_auc']:.4f}
# ============================================================================

import pickle
import numpy as np

class CSGOWinProbabilityCalculator:
    """
    Win probability calculator for CS:GO Agent-Based Model
    Uses {best_model_name} trained on real match data with HLTV rankings
    """
    
    def __init__(self, model_path='{pickle_filename}'):
        """Load the trained model"""
        with open(model_path, 'rb') as f:
            package = pickle.load(f)
            self.model = package['model']
            self.scaler = package['scaler']
            self.feature_names = package['feature_names']
            self.model_type = package['model_type']
            self.performance = package['performance']
        
        print(f"üìä Loaded {{self.model_type}}")
        print(f"   ROC-AUC: {{self.performance['test_roc_auc']:.4f}}")
    
    def predict_win_probability(self, t1_eq_val, t1_money_spent, t1_rank,
                                t2_eq_val, t2_money_spent, t2_rank):
        """
        Calculate Team 1 win probability
        
        Args:
            t1_eq_val: Team 1 equipment value
            t1_money_spent: Team 1 money spent this round
            t1_rank: Team 1 HLTV ranking (lower is better)
            t2_eq_val: Team 2 equipment value
            t2_money_spent: Team 2 money spent this round
            t2_rank: Team 2 HLTV ranking (lower is better)
        
        Returns:
            float: Probability of Team 1 winning (0.0 to 1.0)
        """
        
        # Calculate differences
        diff_eq_val = t1_eq_val - t2_eq_val
        diff_money_spent = t1_money_spent - t2_money_spent
        diff_rank = t2_rank - t1_rank  # Positive means T1 is stronger
        
        # Create feature vector (must match training order)
        features = np.array([[
            t1_eq_val,
            t1_money_spent,
            t1_rank,
            t2_eq_val,
            t2_money_spent,
            t2_rank,
            diff_eq_val,
            diff_money_spent,
            diff_rank,
            diff_eq_val * diff_rank,  # eq_x_rank interaction
            diff_money_spent * diff_rank,  # spending_x_rank interaction
            t1_eq_val / (t2_eq_val + 1)  # eq_ratio
        ]])
        
        # Scale features if using Logistic Regression
        if self.scaler is not None:
            features = self.scaler.transform(features)
        
        # Get probability
        probability = self.model.predict_proba(features)[0, 1]
        
        return probability
    
    def predict_batch(self, team_states):
        """
        Predict win probabilities for multiple scenarios
        
        Args:
            team_states: list of dicts with keys:
                        't1_eq_val', 't1_money_spent', 't1_rank',
                        't2_eq_val', 't2_money_spent', 't2_rank'
        
        Returns:
            list: Win probabilities for each scenario
        """
        
        probabilities = []
        for state in team_states:
            prob = self.predict_win_probability(
                state['t1_eq_val'],
                state['t1_money_spent'],
                state['t1_rank'],
                state['t2_eq_val'],
                state['t2_money_spent'],
                state['t2_rank']
            )
            probabilities.append(prob)
        
        return probabilities


# ============================================================================
# USAGE EXAMPLE IN ABM
# ============================================================================

# Initialize calculator
calculator = CSGOWinProbabilityCalculator()

# Example: Calculate win probability for a round
team1_eq = 20000  # Team 1 total equipment value
team1_spent = 18500  # Team 1 money spent
team1_rank = 5  # Team 1 is rank 5 (strong)

team2_eq = 16000  # Team 2 total equipment value
team2_spent = 15000  # Team 2 money spent
team2_rank = 12  # Team 2 is rank 12 (weaker)

win_prob = calculator.predict_win_probability(
    t1_eq_val=team1_eq,
    t1_money_spent=team1_spent,
    t1_rank=team1_rank,
    t2_eq_val=team2_eq,
    t2_money_spent=team2_spent,
    t2_rank=team2_rank
)

print(f"Team 1 Win Probability: {{win_prob:.3f}}")

# Use in ABM simulation
import random

def simulate_round_outcome(calculator, t1_state, t2_state):
    """Simulate a round outcome using the trained model"""
    
    win_prob = calculator.predict_win_probability(
        t1_state['eq_val'],
        t1_state['money_spent'],
        t1_state['rank'],
        t2_state['eq_val'],
        t2_state['money_spent'],
        t2_state['rank']
    )
    
    # Simulate outcome based on probability
    team1_wins = random.random() < win_prob
    
    return team1_wins, win_prob

# Example ABM round simulation
t1_state = {{'eq_val': 20000, 'money_spent': 18500, 'rank': 5}}
t2_state = {{'eq_val': 16000, 'money_spent': 15000, 'rank': 12}}

outcome, probability = simulate_round_outcome(calculator, t1_state, t2_state)
print(f"Round outcome: {{'Team 1 Wins' if outcome else 'Team 2 Wins'}} (p={{probability:.3f}})")
'''
    
    # Save integration code
    code_filename = f"csgo_abm_integration_{export_timestamp}.py"
    with open(code_filename, 'w') as f:
        f.write(integration_code)
    
    print(f"‚úÖ Integration code exported to: {code_filename}")
    
    print("\n" + "="*70)
    print("üéâ EXPORT COMPLETE!")
    print("="*70)
    print(f"\nüì¶ Exported Files:")
    print(f"  1. {pickle_filename} - Complete model package")
    print(f"  2. {json_filename} - Model metadata")
    print(f"  3. {code_filename} - ABM integration code")
    
    print(f"\nüìä Model Performance Summary:")
    print(f"  Model Type: {best_model_name}")
    print(f"  Test ROC-AUC: {model_results[best_model_name]['test_roc_auc']:.4f}")
    print(f"  Test Accuracy: {model_results[best_model_name]['test_accuracy']:.4f}")
    print(f"  Test F1-Score: {model_results[best_model_name]['test_f1']:.4f}")
    
    print(f"\nüöÄ Ready for ABM Integration!")
    print(f"  - Load model using pickle")
    print(f"  - Use CSGOWinProbabilityCalculator class")
    print(f"  - Input: equipment values, spending, HLTV rankings")
    print(f"  - Output: win probability (0.0 to 1.0)")
    
else:
    print("‚ùå No model available for export")

üíæ Exporting model for ABM integration...
‚ùå No model available for export
