In [1]:
import pandas as pd
from datetime import timedelta
from sklearn.model_selection import train_test_split

def load_and_split_players(data_file):
    """
    Load raw play events from a CSV file and split them into train/test sets by player.

    The split is done on player IDs (the 'device' column), not on rows, so every
    event of a given player lands in exactly one of the two returned frames.

    Parameters
    ----------
    data_file : str or path-like
        Path to the CSV file. The extension check is case-insensitive.
        The file must contain a 'time' column (epoch seconds) and a 'device' column.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Copies of the rows belonging to the train (80%) and test (20%) players,
        each with an added 'datetime' column derived from 'time'.

    Raises
    ------
    ValueError
        If the path does not end in '.csv'.
    """
    # str() lets pathlib.Path (and other path-likes) through; lower() accepts '.CSV'
    if not str(data_file).lower().endswith('.csv'):
        raise ValueError("Only CSV files are supported.")

    # Load CSV
    df = pd.read_csv(data_file)

    # Convert timestamp from seconds to datetime
    df['datetime'] = pd.to_datetime(df['time'], unit='s')

    # Split the unique player IDs. The ID array is deliberately left as-is
    # (a missing value, if present, stays in it) so the assignment of real
    # players for this fixed random_state is not disturbed.
    player_ids = df['device'].unique()
    train_ids, test_ids = train_test_split(player_ids, test_size=0.2, random_state=42)

    # Rows without a device ID cannot be attributed to any player. A later
    # groupby('device') would drop them silently, and they would inflate the
    # unique-player counts, so exclude them here explicitly.
    has_device = df['device'].notna()
    train_df = df[has_device & df['device'].isin(train_ids)].copy()
    test_df = df[has_device & df['device'].isin(test_ids)].copy()

    return train_df, test_df

# Example usage: load the raw events and report how many distinct
# 'device' values ended up in each split.
train_df, test_df = load_and_split_players("data/game1_processed/rawdata_game1.csv")

for split_name, split_df in (("DS1 (Train)", train_df), ("DS2 (Test)", test_df)):
    print(f"{split_name}: {len(split_df['device'].unique())} players")


DS1 (Train): 20765 players
DS2 (Test): 5192 players


In [2]:
def create_churn_features(player_df, observation_days, prediction_days=7):
    """
    Summarise one player's activity in an observation window and label churn.

    The observation window starts at the player's earliest 'datetime' and lasts
    `observation_days` days (end inclusive). Events after that belong to the
    prediction period.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Events of a single player; needs 'datetime' and 'score' columns.
        A 'duration' column is optional.
    observation_days : int
        Length of the observation window in days.
    prediction_days : int, default 7
        Length of the prediction window in days. When it is not positive,
        no 'churned' entry is produced.

    Returns
    -------
    dict
        Feature name -> value. 'churned' (when present) is True if the player
        has no event inside the prediction window.
    """
    events = player_df.sort_values('datetime')
    stamps = events['datetime']

    # End of the observation window, measured from the player's first event
    obs_cutoff = stamps.min() + timedelta(days=observation_days)

    observed = events[stamps <= obs_cutoff]
    later = events[stamps > obs_cutoff]

    observed_times = observed['datetime']
    scores = observed['score']

    features = {}
    features['play_count'] = len(observed)
    features['active_days'] = observed_times.dt.date.nunique()

    # Playtime is only available when the data carries a 'duration' column
    if 'duration' in observed.columns:
        features['total_playtime'] = observed['duration'].sum()
    else:
        features['total_playtime'] = 0

    features['mean_score'] = scores.mean()
    features['score_std'] = scores.std()
    features['best_score'] = scores.max()
    features['days_since_last_play'] = (obs_cutoff - observed_times.max()).days

    # Mean gap between consecutive plays, in hours; needs at least two plays
    if len(observed) > 1:
        features['avg_session_gap'] = observed_times.diff().mean().total_seconds() / 3600
    else:
        features['avg_session_gap'] = 0

    recent_start = obs_cutoff - timedelta(days=7)
    features['last_7day_activity'] = int((observed_times >= recent_start).sum())

    # Churn label: no activity between the observation cutoff and the end of
    # the prediction window
    if prediction_days > 0:
        pred_cutoff = obs_cutoff + timedelta(days=prediction_days)
        played_in_window = (later['datetime'] <= pred_cutoff).any()
        features['churned'] = not played_in_window

    return features

# Build DS1: one feature row per training player
# (5-day observation window, 7-day churn window)
print("Creating features for DS1 (training set)...")
ds1_features = []

for player_id, player_df in train_df.groupby('device'):
    try:
        features = create_churn_features(player_df, observation_days=5, prediction_days=7)
    except Exception as e:
        # Best effort: report the failing player and move on to the next one
        print(f"Error processing player {player_id}: {str(e)}")
    else:
        features['device'] = player_id
        ds1_features.append(features)

ds1 = pd.DataFrame(ds1_features)
print(f"\nCreated {len(ds1)} training samples")
print("\nSample features:")
display(ds1.head())

Creating features for DS1 (training set)...

Created 20764 training samples

Sample features:


Unnamed: 0,play_count,active_days,total_playtime,mean_score,score_std,best_score,days_since_last_play,avg_session_gap,last_7day_activity,churned,device
0,3,1,0,58.0,51.739733,115,4,0.015833,3,True,0
1,3,1,0,3.333333,3.21455,7,4,0.008889,3,True,0
2,1,1,0,21.0,,21,5,0.0,1,True,10941525590041
3,2,1,0,32.5,0.707107,33,4,0.014444,2,True,12345678901237
4,5,1,0,11.2,13.479614,30,4,0.003264,5,True,12345678912345


In [9]:
def process_test_data(test_df, observation_days=5, prediction_days=7):
    """
    Process the test data (DS2) using the same feature extraction as DS1.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Raw events of the test players; grouped by the 'device' column.
    observation_days : int, default 5
        Length of the observation window passed to create_churn_features.
    prediction_days : int, default 7
        Length of the churn-label window passed to create_churn_features.
        The default matches the 7-day window used when building DS1, so the
        'churned' column means the same thing in both datasets. Pass 0 to
        build DS2 without a 'churned' column.

    Returns
    -------
    pandas.DataFrame
        One row per player with the extracted features, the 'churned' label
        (when prediction_days > 0) and the player's 'device' ID.

    Players whose feature extraction raises are reported on stdout and skipped.
    """
    print("Creating features for DS2 (test set)...")
    ds2_features = []

    for player_id, player_df in test_df.groupby('device'):
        try:
            # Same feature extraction and label window as the training set
            features = create_churn_features(
                player_df, observation_days, prediction_days=prediction_days
            )

            # Store player ID for reference
            features['device'] = player_id
            ds2_features.append(features)
        except Exception as e:
            print(f"Error processing player {player_id}: {str(e)}")
            continue

    ds2 = pd.DataFrame(ds2_features)
    print(f"\nCreated {len(ds2)} test samples")
    print("\nSample features:")
    display(ds2.head())

    return ds2

# Process test data (DS2)
ds2 = process_test_data(test_df)

# Save the processed datasets. The paths are kept in variables so the
# messages below always report the location that was actually written.
ds1_csv_path = "data/game1_processed/ds1_train.csv"
ds2_csv_path = "data/game1_processed/ds2_test.csv"
ds1.to_csv(ds1_csv_path, index=False)
ds2.to_csv(ds2_csv_path, index=False)
print("\nSaved datasets:")
print(f"- DS1 (Train): {ds1_csv_path}")
print(f"- DS2 (Test): {ds2_csv_path}")

Creating features for DS2 (test set)...

Created 5192 test samples

Sample features:


Unnamed: 0,play_count,active_days,total_playtime,mean_score,score_std,best_score,days_since_last_play,avg_session_gap,last_7day_activity,churned,device
0,4,1,0,12.5,13.892444,32,4,0.991296,4,True,14035001426733
1,14,2,0,40.571429,43.964521,168,4,1.62312,14,False,14097000302455
2,1,1,0,20.0,,20,5,0.0,1,True,14097001014984
3,13,2,0,153.923077,188.298726,507,4,1.743981,13,True,14097008062978
4,6,1,0,108.166667,118.560392,335,4,0.026111,6,True,14097008353997



Saved datasets:
- DS1 (Train): ../data/game1_processed/ds1_train.csv
- DS2 (Test): ../data/game1_processed/ds2_test.csv


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

def prepare_model_data(ds1, ds2):
    """
    Turn the DS1/DS2 feature tables into model-ready matrices.

    Every DS1 column except 'device' and 'churned' is used as a feature.
    Missing values are mean-imputed and the features are standardised; both
    transformers are fitted on DS1 only and then applied to DS2.

    Returns
    -------
    (X_train, X_test, y_train, feature_cols)
        Transformed train/test feature matrices, the DS1 'churned' labels and
        the list of feature column names (in DS1 column order).
    """
    excluded = ('device', 'churned')
    feature_cols = [name for name in ds1.columns if name not in excluded]

    train_matrix = ds1[feature_cols]
    y_train = ds1['churned']
    test_matrix = ds2[feature_cols]

    # Apply the preprocessing steps in order: impute first, then scale.
    # Each step learns its statistics from the training matrix only.
    for step in (SimpleImputer(strategy='mean'), StandardScaler()):
        train_matrix = step.fit_transform(train_matrix)
        test_matrix = step.transform(test_matrix)

    return train_matrix, test_matrix, y_train, feature_cols

def train_models(X_train, y_train):
    """Train multiple classifiers"""
    models = {
        'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
    }
    
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
    
    return models

def evaluate_models(models, X_test, ds2, feature_cols):
    """
    Score the test set with every model and save the predictions.

    Parameters
    ----------
    models : dict
        Display name -> fitted classifier exposing predict_proba.
    X_test : array-like
        Preprocessed test features, row-aligned with `ds2`.
    ds2 : pandas.DataFrame
        Test feature table. NOTE: it is modified in place; one
        '<name>_churn_prob' column is added per model.
    feature_cols : list of str
        Feature names, used to label feature importances.

    Returns
    -------
    dict
        Display name -> model (the same objects that were passed in).

    Side effects: prints the top-10 feature importances for models that
    expose `feature_importances_`, and writes `ds2` (including the new
    probability columns) to data/game1_processed/ds2_with_predictions.csv.
    """
    predictions_path = "data/game1_processed/ds2_with_predictions.csv"
    results = {}

    for name, model in models.items():
        # Column 1 of predict_proba is taken as the churn probability; this
        # presumes the model's second class is churned=True (confirm against
        # model.classes_ if the label encoding ever changes).
        y_proba = model.predict_proba(X_test)[:, 1]
        ds2[f'{name}_churn_prob'] = y_proba

        # Feature importance (for tree models)
        if hasattr(model, 'feature_importances_'):
            importance = pd.Series(model.feature_importances_, index=feature_cols)
            print(f"\n{name} Feature Importance:")
            print(importance.sort_values(ascending=False).head(10))

        results[name] = model

    # Save test results with predictions; report the path actually written
    ds2.to_csv(predictions_path, index=False)
    print(f"\nSaved test predictions: {predictions_path}")

    return results

# Prepare data: build the train/test feature matrices from the DS1/DS2 tables
# (imputer and scaler are fitted on DS1 only inside prepare_model_data)
X_train, X_test, y_train, feature_cols = prepare_model_data(ds1, ds2)

# Train models: returns a dict of display name -> fitted classifier
models = train_models(X_train, y_train)

# Evaluate on DS2.
# NOTE: evaluate_models adds one '<name>_churn_prob' column per model to ds2
# in place and writes ds2 to a CSV file, so ds2 is wider after this cell runs.
model_results = evaluate_models(models, X_test, ds2, feature_cols)

Training Decision Tree...
Training Random Forest...
Training Logistic Regression...

Decision Tree Feature Importance:
days_since_last_play    0.581149
active_days             0.160392
avg_session_gap         0.114609
last_7day_activity      0.056582
play_count              0.056324
score_std               0.021106
best_score              0.007978
mean_score              0.001859
total_playtime          0.000000
dtype: float64

Random Forest Feature Importance:
active_days             0.327116
days_since_last_play    0.219592
avg_session_gap         0.148230
play_count              0.122257
last_7day_activity      0.109172
best_score              0.026220
mean_score              0.023752
score_std               0.023660
total_playtime          0.000000
dtype: float64

Saved test predictions: ../data/game1_processed/ds2_with_predictions.csv


In [8]:
# Save datasets in JSONL format (required by PDF)
import jsonlines

def save_to_jsonl(df, path):
    """
    Save a DataFrame to a JSON Lines file, one record (row) per line.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export; each row becomes one JSON object keyed by column name.
    path : str
        Destination file; it is opened in write mode, replacing existing content.

    Missing values (NaN/NaT/None) are written as JSON null. Left as float NaN
    they would be serialised as the bare token NaN, which is not valid JSON
    and is rejected by strict parsers.
    """
    # Cast to object first so the missing cells can actually hold None;
    # on numeric columns a plain where() would coerce None back to NaN.
    cleaned = df.astype(object).where(df.notna(), None)

    with jsonlines.open(path, 'w') as writer:
        writer.write_all(cleaned.to_dict('records'))

# Export both datasets as JSONL. The paths are kept in variables so the
# messages report the location that was actually written.
ds1_jsonl_path = "data/game1_processed/ds1_train.jsonl"
ds2_jsonl_path = "data/game1_processed/ds2_test.jsonl"

# Save DS1 (training data)
save_to_jsonl(ds1, ds1_jsonl_path)
print(f"Saved DS1 (training) as JSONL: {ds1_jsonl_path}")

# Save DS2 (test data) - without churn labels if they exist
ds2_export = ds2.drop(columns=['churned']) if 'churned' in ds2.columns else ds2
save_to_jsonl(ds2_export, ds2_jsonl_path)
print(f"Saved DS2 (test) as JSONL: {ds2_jsonl_path}")

Saved DS1 (training) as JSONL: ../data/game1_processed/ds1_train.jsonl
Saved DS2 (test) as JSONL: ../data/game1_processed/ds2_test.jsonl
