In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
# Silence every warning for the rest of the session. This is a blanket filter
# (no category or module restriction), so deprecation notices raised by the
# libraries used below are hidden as well.
warnings.filterwarnings('ignore')

# Banner marking the start of the notebook run.
print("Building risk score on Harvard's foundation\n")



Building risk score on Harvard's foundation



In [4]:
# Load the dataset and fix data type issues

def fix_data_types(df):
    """Convert text columns that contain only numeric values to a numeric dtype.

    Every object- or string-typed column is passed through ``pd.to_numeric``.
    A column is replaced only when *all* of its values parse; if any value
    cannot be parsed the column is left exactly as it was.

    The conversion is applied to ``df`` itself (in place); the same object is
    returned for convenience.

    Args:
        df: DataFrame whose text columns should be converted where possible.

    Returns:
        The same DataFrame object that was passed in.
    """
    print("Fixing data types and identifying key features...")

    for col in df.columns:
        series = df[col]
        # Only text-like columns are candidates; this covers plain object
        # columns as well as pandas' dedicated string dtypes.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        try:
            df[col] = pd.to_numeric(series)
        except (ValueError, TypeError):
            # Deliberate best-effort: a column with unparseable values keeps
            # its original contents. This is the explicit form of the
            # deprecated ``errors='ignore'`` option.
            continue

    return df

# Read the tab-separated analytic dataset and run it straight through the
# dtype clean-up helper.
df = fix_data_types(
    pd.read_csv(
        "../../early/AnalyticDataSet_Braverman_LaPlante_PAB_2013.dat.txt",
        delimiter='\t',
        low_memory=False,
    )
)

# Quick sanity summary: frame dimensions, then the sum and mean (as a
# percentage) of the RG_case column.
print(f"Dataset shape: {df.shape}")
print(f"RG Cases: {df['RG_case'].sum()} ({df['RG_case'].mean()*100:.1f}%)")

Fixing data types and identifying key features...
Dataset shape: (4056, 114)
RG Cases: 2042 (50.3%)


In [5]:
# Identify and Engineer Key Behavioral Features

def identify_key_features(df, max_variability=10, max_temporal=15, max_financial=15):
    """Group the columns of ``df`` into feature families by column-name pattern.

    Matching is a case-sensitive substring test on each column label:

    * variability: label contains both ``'SD'`` and ``'Stakes'``
    * games:       label contains ``'NumberofGames'`` or ``'played'``
    * temporal:    label contains any of ``wk1``..``wk4``, ``frequency``, ``trajectory``
    * financial:   label contains any of ``sumstake``, ``avgbet``, ``totalactive``

    The ``risk`` family is a fixed list of three names and is returned
    whether or not those columns exist in ``df``.

    The caps keep the *first* N matches in column order; no ranking by
    predictive value is performed.

    Args:
        df: DataFrame whose column labels are inspected.
        max_variability: Maximum number of variability columns to keep, or
            ``None`` for no cap. Expected to be a non-negative int.
        max_temporal: Same, for temporal columns.
        max_financial: Same, for financial columns.

    Returns:
        dict with keys ``'risk'``, ``'variability'``, ``'games'``,
        ``'temporal'`` and ``'financial'``, each mapping to a list of column
        labels.
    """
    print("\nIdentifying key behavioral features")
    print("="*50)

    def _columns_where(predicate):
        # Match on the string form so non-string labels cannot raise TypeError.
        return [col for col in df.columns if predicate(str(col))]

    # Baseline risk-group columns (fixed names, not checked against df).
    risk_features = ['RiskGroup1', 'RiskGroup2', 'RiskGroupCombined']

    # Variability measures
    variability_features = _columns_where(lambda name: 'SD' in name and 'Stakes' in name)
    print(f"Found {len(variability_features)} variability features")

    # Cross-game activity features
    game_features = _columns_where(lambda name: 'NumberofGames' in name or 'played' in name)
    print(f"Found {len(game_features)} cross-game features")

    # Temporal pattern features
    temporal_tokens = ['wk1', 'wk2', 'wk3', 'wk4', 'frequency', 'trajectory']
    temporal_features = _columns_where(lambda name: any(x in name for x in temporal_tokens))
    print(f"Found {len(temporal_features)} temporal features")

    # Financial behavior features
    financial_tokens = ['sumstake', 'avgbet', 'totalactive']
    financial_features = _columns_where(lambda name: any(x in name for x in financial_tokens))
    print(f"Found {len(financial_features)} financial features")

    return {
        'risk': risk_features,
        'variability': variability_features[:max_variability],  # first N in column order
        'games': game_features,
        'temporal': temporal_features[:max_temporal],  # first N in column order
        'financial': financial_features[:max_financial]  # first N in column order
    }

# Group the loaded dataset's columns into named feature families.
feature_groups = identify_key_features(df)


Identifying key behavioral features
Found 3 variability features
Found 6 cross-game features
Found 33 temporal features
Found 40 financial features


In [6]:
# Create feature set

def _is_real_number_column(series):
    """Return True for integer/float columns of any width; False for bool and non-numeric."""
    return pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)


def create_enhanced_features(df):
    """Return a copy of ``df`` with derived composite feature columns added.

    Columns that may be added (each only when its source columns exist):

    * ``<col>_normalized`` and ``loss_chasing_score``: z-scores of up to the
      first three numeric columns whose name contains "loss"
      (case-insensitive), and their row-wise mean.
    * ``activity_escalation``: a copy of the first column whose name contains
      ``'totalactive'``.
    * ``cross_game_risk``: ``NumberofGames31days`` divided by its maximum.
      When the maximum is not positive the score is 0 for every non-missing
      row instead of dividing by zero.
    * ``variability_composite``: row-wise mean of the z-scores of up to the
      first three numeric, non-constant columns whose name contains both
      ``'SD'`` and ``'Stakes'`` (requires at least two matching columns).

    Row-wise means skip missing values, so a row is NaN only when every
    contributing z-score is missing.

    Args:
        df: Source DataFrame. It is not modified.

    Returns:
        A new DataFrame containing all original columns plus the derived ones.
    """
    print("\nCreating features")
    print("="*30)

    df_enhanced = df.copy()

    # Loss-Chasing Indicators
    loss_cols = [col for col in df.columns if 'loss' in str(col).lower()]
    if loss_cols:
        print(f"Found {len(loss_cols)} loss-related columns")
        # Only the first three candidates are considered, and only numeric ones.
        loss_features = [col for col in loss_cols[:3] if _is_real_number_column(df[col])]

        if loss_features:
            normalized_names = []
            for col in loss_features:
                normalized_name = f'{col}_normalized'
                # The small epsilon keeps a constant column from dividing by zero.
                df_enhanced[normalized_name] = (df[col] - df[col].mean()) / (df[col].std() + 1e-8)
                normalized_names.append(normalized_name)

            df_enhanced['loss_chasing_score'] = df_enhanced[normalized_names].mean(axis=1)

    # Activity Escalation Score (currently a plain copy of the first match)
    activity_cols = [col for col in df.columns if 'totalactive' in str(col)]
    if activity_cols:
        df_enhanced['activity_escalation'] = df[activity_cols[0]]
        print("Created activity escalation feature")

    # Cross-Game Risk Score
    if 'NumberofGames31days' in df.columns:
        games = df['NumberofGames31days']
        max_games = games.max()
        if max_games > 0:
            df_enhanced['cross_game_risk'] = games / max_games
        else:
            # No positive maximum (all zeros, or all missing): dividing would
            # give NaN/inf, so report zero risk while keeping missing rows missing.
            df_enhanced['cross_game_risk'] = games * 0.0
        print("Created cross-game risk feature")

    # Variability Composite Score
    variability_cols = [col for col in df.columns if 'SD' in str(col) and 'Stakes' in str(col)]
    if len(variability_cols) >= 2:
        z_scores = {}
        for col in variability_cols[:3]:
            if not _is_real_number_column(df[col]):
                continue
            col_std = df[col].std()
            if col_std > 0:
                z_scores[col] = (df[col] - df[col].mean()) / col_std

        if z_scores:
            # Index-aligned row mean that skips NaN, matching how
            # loss_chasing_score is aggregated above.
            df_enhanced['variability_composite'] = pd.DataFrame(z_scores).mean(axis=1)
            print("Created variability composite score")

    return df_enhanced

# Build the derived features on a copy; df itself is left unchanged.
df_enhanced = create_enhanced_features(df)


Creating features
Created activity escalation feature
Created cross-game risk feature


In [None]:
# Build enhanced risk model

def build_enhanced_risk_model(df, feature_groups):
    """Train and evaluate a random-forest classifier that predicts ``RG_case``.

    Steps:

    1. Collect every feature named in ``feature_groups`` that exists in
       ``df``, plus any of the derived columns ``loss_chasing_score``,
       ``activity_escalation``, ``cross_game_risk`` and
       ``variability_composite`` that are present. A name listed in more than
       one group is used once (first occurrence wins).
    2. Keep only numeric, non-constant features with more than 100
       non-missing values among rows where ``RG_case`` is not missing.
    3. Fill remaining missing feature values with 0.
    4. Split: if a ``ValidationSet`` column exists, rows where it equals 0
       form the training set and all other rows the test set; otherwise a
       stratified 70/30 random split is used.
    5. Fit the forest, print AUC, accuracy and the ten most important features.

    Args:
        df: DataFrame containing the target column ``RG_case`` and the
            candidate feature columns.
        feature_groups: Mapping of group name to a list of column names.

    Returns:
        ``(model, final_features, feature_importance)`` where
        ``feature_importance`` is a DataFrame with ``feature`` and
        ``importance`` columns sorted by descending importance, or
        ``(None, None, None)`` when fewer than five usable features remain.
    """
    print("\nBuilding enhanced risk model")
    print("="*50)

    candidate_features = []
    for group_name, features in feature_groups.items():
        available_features = [f for f in features if f in df.columns]
        candidate_features.extend(available_features)
        print(f"{group_name}: {len(available_features)} features")

    enhanced_features = ['loss_chasing_score', 'activity_escalation', 'cross_game_risk', 'variability_composite']
    candidate_features.extend(f for f in enhanced_features if f in df.columns)

    # A column can match several groups; keep each name once (order preserved)
    # so the design matrix and importance table have no duplicated columns.
    all_features = list(dict.fromkeys(candidate_features))

    print(f"\nTotal features for modeling: {len(all_features)}")

    model_df = df[df['RG_case'].notna()].copy()

    final_features = []
    for feature in all_features:
        column = model_df[feature]
        # Accept integer/float columns of any width; bool and text are skipped.
        is_number = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        # Require some variance and enough observed values to be useful.
        if is_number and column.std() > 0 and column.notna().sum() > 100:
            final_features.append(feature)

    print(f"Final feature set: {len(final_features)} features")

    if len(final_features) < 5:
        print("Not enough valid features for modeling")
        return None, None, None

    X = model_df[final_features].fillna(0)
    y = model_df['RG_case']

    if 'ValidationSet' in model_df.columns:
        # Reuse the split shipped with the data: 0 marks training rows,
        # everything else goes to the test set.
        train_mask = model_df['ValidationSet'] == 0
        X_train, X_test = X[train_mask], X[~train_mask]
        y_train, y_test = y[train_mask], y[~train_mask]
        print(f"Using existing validation split: {len(X_train)} train, {len(X_test)} test")
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
        print(f"Created new split: {len(X_train)} train, {len(X_test)} test")

    # Fixed seed so repeated runs train the same forest.
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    )

    rf_model.fit(X_train, y_train)

    # Evaluate model
    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

    print(f"\nModel performance:")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")
    print(f"Accuracy: {(y_pred == y_test).mean():.3f}")

    feature_importance = pd.DataFrame({
        'feature': final_features,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 most important features:")
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
        print(f"{i+1:2d}. {row['feature']:<30} {row['importance']:.3f}")

    return rf_model, final_features, feature_importance

# Train and evaluate the classifier; all three names are None when too few
# usable features are found.
model, features, importance = build_enhanced_risk_model(df_enhanced, feature_groups)



Building enhanced risk model
risk: 3 features
variability: 3 features
games: 6 features
temporal: 15 features
financial: 15 features

Total features for modeling: 44
Final feature set: 40 features
Using existing validation split: 3037 train, 1019 test

Model performance:
AUC Score: 0.750
Accuracy: 0.698

Top 10 most important features:
 1. p2sumstake31days               0.100
 2. p2avgbetsperday                0.064
 3. p1sumstake31days               0.064
 4. p1avgbetsperday                0.051
 5. p2avgbetsperactiveday          0.046
 6. p1avgbetsperactiveday          0.045
 7. cross_game_risk                0.041
 8. NumberofGames31days            0.035
 9. pcsumstake31days               0.035
10. p2totalactivedays_31days       0.034


In [8]:
# Create risk scoring function

def create_risk_scorer(model, features, high_threshold=0.7, medium_threshold=0.4):
    """Build a function that scores one user with ``model``.

    Args:
        model: Fitted classifier exposing ``predict_proba``, or ``None``.
        features: Ordered list of feature names the model expects.
        high_threshold: Probability at or above which the level is "HIGH".
        medium_threshold: Probability at or above which (and below
            ``high_threshold``) the level is "MEDIUM".

    Returns:
        ``None`` when ``model`` is ``None``; otherwise a function
        ``score_user(user_data)`` taking a dict-like of feature values and
        returning a dict with ``risk_probability``, ``risk_level``, ``color``
        and ``explanation``.
    """
    if model is None:
        return None

    def score_user(user_data):
        """Score a single user and return risk level + explanation"""
        user_features = []
        for feature in features:
            value = user_data.get(feature, 0)
            # Treat a missing value (None/NaN) like an absent key: use 0 so
            # the model never receives NaN and comparisons below cannot fail.
            if pd.isna(value):
                value = 0
            user_features.append(value)

        # Column 1 of predict_proba is taken as the positive-class probability.
        risk_prob = model.predict_proba([user_features])[0][1]

        if risk_prob >= high_threshold:
            risk_level = "HIGH"
            color = "Red"
        elif risk_prob >= medium_threshold:
            risk_level = "MEDIUM"
            color = "Yellow"
        else:
            risk_level = "LOW"
            color = "Green"

        # The explanation lists which of the first three features (in list
        # order, not importance order) have a positive value for this user.
        positive_leading = [
            name for name, value in zip(features[:3], user_features[:3]) if value > 0
        ]

        return {
            'risk_probability': risk_prob,
            'risk_level': risk_level,
            'color': color,
            'explanation': f"Risk factors: {positive_leading}"
        }

    return score_user

# Build the per-user scoring function; this is None when no model was trained.
risk_scorer = create_risk_scorer(model, features)

In [9]:
# Demo the system

# Demo: score a few users and compare the prediction with the recorded outcome.
if model is not None:
    print("Enhanced Risk Model Built Successfully")
    print("Risk Scoring Function Created")
    print("Ready for Dashboard Integration")

    print(f"\nSample Risk Assessments")
    print("-" * 50)

    # Draw at most three users. The sample size is capped at the number of
    # rows so small frames do not raise, an empty frame is iterated as-is
    # (zero rows), and the fixed seed makes the demo show the same users on
    # every run.
    sample_users = (
        df_enhanced.sample(n=min(3, len(df_enhanced)), random_state=42)
        if len(df_enhanced) > 0
        else df_enhanced
    )
    for i, (_, user) in enumerate(sample_users.iterrows()):
        if risk_scorer:
            user_dict = user.to_dict()
            assessment = risk_scorer(user_dict)
            actual_outcome = "RG Case" if user['RG_case'] == 1 else "Control"

            print(f"\nUser {i+1}: {assessment['color']} {assessment['risk_level']} Risk")
            print(f"  Probability: {assessment['risk_probability']:.3f}")
            print(f"  Actual Outcome: {actual_outcome}")
            print(f"  Harvard Risk Group: {user.get('RiskGroupCombined', 'Unknown')}")

Enhanced Risk Model Built Successfully
Risk Scoring Function Created
Ready for Dashboard Integration

Sample Risk Assessments
--------------------------------------------------

User 1: Red HIGH Risk
  Probability: 0.930
  Actual Outcome: RG Case
  Harvard Risk Group: 1

User 2: Green LOW Risk
  Probability: 0.256
  Actual Outcome: Control
  Harvard Risk Group: 0

User 3: Yellow MEDIUM Risk
  Probability: 0.597
  Actual Outcome: RG Case
  Harvard Risk Group: 0
