In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

plt.style.use('default')
sns.set_palette("husl")


In [7]:
# Load the datasets

def load_dataset(filepath, delimiter='\t'):
    try:
        
        df = pd.read_csv(filepath, delimiter=delimiter, low_memory=False)
        print(f"Loaded {filepath} with {delimiter} delimiter")
        return df
    except:
        try:
            df = pd.read_csv(filepath, delimiter=',', low_memory=False)
            print(f"Loaded {filepath} with comma delimiter")
            return df
        except Exception as e:
            print(f"Failed to load {filepath}: {e}")
            return None

print("Loading datasets...")

# Dataset 1: Early detection (31-day behavioral markers)
early_detection_path = "../../early/AnalyticDataSet_Braverman_LaPlante_PAB_2013.dat.txt"
df_early = load_dataset(early_detection_path)

# Dataset 2: Full history (complete behavioral journey)  
full_history_path = "../../trigger/AnalyticDataset_Gray_LaPlante_PAB_2012.dat.txt"
df_full = load_dataset(full_history_path)

Loading datasets...
Loaded ../../early/AnalyticDataSet_Braverman_LaPlante_PAB_2013.dat.txt with 	 delimiter
Loaded ../../trigger/AnalyticDataset_Gray_LaPlante_PAB_2012.dat.txt with 	 delimiter


In [8]:
# Basic Data Overview

def explore_dataset(df, name):
    
    print(f"\n{'='*50}")
    print(f"DATASET: {name}")
    print(f"{'='*50}")
    
    if df is None:
        print("Dataset not loaded")
        return
    
    print(f"Shape: {df.shape}")
    print(f"Columns: {len(df.columns)}")
    
    print(f"\nFirst 3 rows:")
    print(df.head(3))
    
    print(f"\nColumn Overview:")
    print(f"Columns: {list(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}")
    
    print(f"\nMissing Data:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_summary = pd.DataFrame({
        'Missing_Count': missing,
        'Missing_Percentage': missing_pct
    }).sort_values('Missing_Count', ascending=False)
    print(missing_summary.head(10))
    
    return df

# Explore datasets
df_early = explore_dataset(df_early, "Early Detection (31-day)")
df_full = explore_dataset(df_full, "Full History (Complete Journey)")


DATASET: Early Detection (31-day)
Shape: (4056, 114)
Columns: 114

First 3 rows:
    USERID age gender  RG_case  ValidationSet first_active_product1_31days  \
0  5671284  25      1        0              0                   12/23/2007   
1  6408486  37      2        0              0                    5/10/2008   
2  2044508  25      1        1              0                   10/22/2005   

  first_active_product2_31days first_active_product4_31days  \
0                                                             
1                                                             
2                   10/23/2005                                

  first_active_games_31days first_active_poker_31days  ...  \
0                                                      ...   
1                                                      ...   
2                                                      ...   

   p1wkendsumstakesratio  p2wkendsumstakesratio  pcwkendsumstakesratio  \
0                      1     

In [9]:
# Key Variables Analysis

def analyze_key_variables(df, dataset_name):
    print(f"\n{'='*50}")
    print(f"KEY VARIABLES ANALYSIS: {dataset_name}")
    print(f"{'='*50}")
    
    if df is None:
        return
    
    # Looking for RG case distribution
    if 'RG_case' in df.columns:
        print("\nCase vs Control Distribution:")
        rg_dist = df['RG_case'].value_counts()
        print(f"Controls (0): {rg_dist.get(0, 0)}")
        print(f"RG Cases (1): {rg_dist.get(1, 0)}")
        print(f"RG Case Rate: {(rg_dist.get(1, 0) / len(df)) * 100:.1f}%")
    
    # Looking for risk group variables (from early detection dataset)
    risk_cols = [col for col in df.columns if 'risk' in col.lower()]
    if risk_cols:
        print(f"\nRisk group variables found:")
        for col in risk_cols:
            if df[col].dtype in ['int64', 'float64']:
                print(f"{col}: {df[col].value_counts().to_dict()}")
    
    # Looking for variability measures (key predictors)
    variability_cols = [col for col in df.columns if 'SD' in col or 'variability' in col.lower()]
    if variability_cols:
        print(f"\nVariability measures (key predictors):")
        for col in variability_cols[:5]:
            if df[col].dtype in ['int64', 'float64']:
                print(f"{col}: mean={df[col].mean():.2f}, std={df[col].std():.2f}")
    
    # Looking for loss/hold variables (for loss-chasing)
    loss_cols = [col for col in df.columns if 'loss' in col.lower() or 'hold' in col.lower()]
    if loss_cols:
        print(f"\nLoss/Hold variables (loss-chasing detection):")
        for col in loss_cols[:5]:
            if df[col].dtype in ['int64', 'float64']:
                print(f"{col}: mean={df[col].mean():.2f}")

# Analyze key variables in both datasets
analyze_key_variables(df_early, "Early Detection")
analyze_key_variables(df_full, "Full History")


KEY VARIABLES ANALYSIS: Early Detection

Case vs Control Distribution:
Controls (0): 2014
RG Cases (1): 2042
RG Case Rate: 50.3%

Risk group variables found:
RiskGroup1: {0: 3880, 1: 176}
RiskGroup2: {0: 3813, 1: 243}
RiskGroupCombined: {0: 3637, 1: 419}

Variability measures (key predictors):

KEY VARIABLES ANALYSIS: Full History

Case vs Control Distribution:
Controls (0): 2066
RG Cases (1): 2066
RG Case Rate: 50.0%

Loss/Hold variables (loss-chasing detection):


In [10]:
# Data Quality Check

def data_quality_check(df, name):
    print(f"\n{'='*30}")
    print(f"DATA QUALITY: {name}")
    print(f"{'='*30}")
    
    if df is None:
        return
    
    # Checking for duplicate user IDs
    if 'USERID' in df.columns or 'UserID' in df.columns:
        userid_col = 'USERID' if 'USERID' in df.columns else 'UserID'
        duplicates = df[userid_col].duplicated().sum()
        print(f"Duplicate UserIDs: {duplicates}")
        print(f"Unique Users: {df[userid_col].nunique()}")
    
    # Checking data types
    print(f"\nData Types Summary:")
    dtype_summary = df.dtypes.value_counts()
    print(dtype_summary)
    
    # Checking for obvious data issues
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\nNumeric Columns Range Check:")
        for col in numeric_cols[:5]:
            print(f"{col}: [{df[col].min():.2f}, {df[col].max():.2f}]")

data_quality_check(df_early, "Early Detection")
data_quality_check(df_full, "Full History")

print("\n" + "="*60)
print("NEXT STEPS:")
print("="*60)
print("1. Data loaded and explored")
print("2. Identify key predictive features")
print("3. Build risk score model")
print("4. Create early warning dashboard")
print("5. Demo preparation")



DATA QUALITY: Early Detection
Duplicate UserIDs: 1
Unique Users: 4055

Data Types Summary:
int64      58
float64    31
object     25
Name: count, dtype: int64

Numeric Columns Range Check:
USERID: [32639.00, 9859152.00]
RG_case: [0.00, 1.00]
ValidationSet: [0.00, 1.00]
p1sumstake31days: [0.00, 31150.03]
p1sumbets31days: [0.00, 6651.00]

DATA QUALITY: Full History
Duplicate UserIDs: 0
Unique Users: 4132

Data Types Summary:
object    94
int64      3
Name: count, dtype: int64

Numeric Columns Range Check:
UserID: [31965.00, 9859152.00]
RG_case: [0.00, 1.00]
Missing_Daily_Transactions: [0.00, 1.00]

NEXT STEPS:
1. Data loaded and explored
2. Identify key predictive features
3. Build risk score model
5. Demo preparation
